select event_entity, event_type, sum(if(page_id is null, 1, 0)) as null_page_id, count(1) as total from wmf.mediawiki_history where snapshot='2020-06' and event_entity in ('page', 'revision') group by event_entity, event_type ;
event_entity | event_type | null_page_id | total |
page | merge | 0 | 28018 |
page | delete | 11231245 | 60155501 |
page | restore | 73426 | 1571489 |
page | create-page | 0 | 47032453 |
page | move | 299279 | 26674648 |
revision | create | 27201068 | 4818723080 |
page | create | 11231244 | 538243985 |
I also found other types of events with duplicates. page_id = 0, and other duplicates:
select event_entity, event_type, event_user_id, event_user_text_historical, event_timestamp, page_id, revision_id, user_id from wmf.mediawiki_history where snapshot='2020-06' group by event_entity, event_type, event_user_id, event_user_text_historical, event_timestamp, page_id, revision_id, user_id having count(1) > 1 ;
Here's an example of an alterblocks problem. That last record has the same start and end timestamp, causing two events in the denormalized table to have the same timestamp, and making it necessary to look at other fields for uniqueness:
select * from wmf.mediawiki_history where snapshot = '2020-06' and wiki_db = 'enwiki' and event_entity = 'user' and event_type = 'alterblocks' and event_user_id = 1272505 and event_user_text_historical = 'ST47' and event_timestamp = '2009-02-04 03:59:18.0' and wiki_db = 'enwiki' and user_id = 6144686 limit 1000 ;
wiki_db | user_id | user_text_historical | user_text | user_groups_historical | user_groups | user_blocks_historical | user_blocks | is_bot_by_historical | is_bot_by | user_registration_timestamp | user_creation_timestamp | user_first_edit_timestamp | created_by_self | created_by_system | created_by_peer | anonymous | start_timestamp | end_timestamp | caused_by_event_type | caused_by_user_id | caused_by_user_text | caused_by_anonymous_user | caused_by_block_expiration | inferred_from | source_log_id | source_log_comment | source_log_params | snapshot |
enwiki | 6144686 | Jack Merridoodoo | Jack Merridoodoo | [] | [] | [] | [] | [] | [] | 2008-01-08 08:34:38.0 | 2008-01-08 08:34:38.0 | 2008-01-08 08:52:42.0 | true | false | false | false | 2008-01-08 08:34:38.0 | 2008-01-08 09:10:09.0 | create | 6144686 | Jack Merridoodoo | false | NULL | NULL | 12958868 | {"unparsed":"6144686"} | 2020-06 | |
enwiki | 6144686 | Jack Merridoodoo | Jack Merridoodoo | [] | [] | ["nocreate"] | [] | [] | [] | 2008-01-08 08:34:38.0 | 2008-01-08 08:34:38.0 | 2008-01-08 08:52:42.0 | true | false | false | false | 2008-01-08 09:10:09.0 | 2008-07-28 20:26:26.0 | alterblocks | rEMFR125785563531 | Gogo Dodo | false | indefinite | NULL | 12959074 | [[Wikipedia:Vandalism/gVandalism]]-only account | {"unparsed":"indefinite nocreate"} | 2020-06 |
enwiki | 6144686 | Jack Merridoodoo | Jack Merridoodoo | [] | [] | [] | [] | [] | [] | 2008-01-08 08:34:38.0 | 2008-01-08 08:34:38.0 | 2008-01-08 08:52:42.0 | true | false | false | false | 2008-07-28 20:26:26.0 | 2008-07-28 20:34:41.0 | alterblocks | 1807070 | East718 | false | NULL | NULL | 16452718 | unblocking to reblock | {"unparsed":""} | 2020-06 |
enwiki | 6144686 | Jack Merridoodoo | Jack Merridoodoo | [] | [] | ["nocreate","noemail"] | [] | [] | [] | 2008-01-08 08:34:38.0 | 2008-01-08 08:34:38.0 | 2008-01-08 08:52:42.0 | true | false | false | false | 2008-07-28 20:34:41.0 | 2008-11-15 03:32:02.0 | alterblocks | 1807070 | East718 | false | indefinite | NULL | 16453396 | grawp | {"unparsed":"infinite nocreate,noemail"} | 2020-06 |
enwiki | 6144686 | Jack Merridoodoo | Jack Merridoodoo | [] | [] | ["nocreate","noemail"] | [] | [] | [] | 2008-01-08 08:34:38.0 | 2008-01-08 08:34:38.0 | 2008-01-08 08:52:42.0 | true | false | false | false | 2008-11-15 03:32:02.0 | 2009-02-04 03:59:18.0 | alterblocks | 3153598 | Fran Rogers | false | indefinite | NULL | 18256521 | [[User:JarlaxleArtemis]] | {"unparsed":"indefinite nocreate,noemail"} | 2020-06 |
enwiki | 6144686 | Jack Merridoodoo | Jack Merridoodoo | [] | [] | ["nocreate","noemail","nousertalk"] | [] | [] | [] | 2008-01-08 08:34:38.0 | 2008-01-08 08:34:38.0 | 2008-01-08 08:52:42.0 | true | false | false | false | 2009-02-04 03:59:18.0 | NULL | alterblocks | 1272505 | ST47 | false | indefinite | NULL | 20134780 | [[User:JarlaxleArtemis]] | {"unparsed":"indefinite nocreate,noemail,nousertalk"} | 2020-06 |
enwiki | 6144686 | Jack Merridoodoo | Jack Merridoodoo | [] | [] | [] | [] | [] | [] | 2008-01-08 08:34:38.0 | 2008-01-08 08:34:38.0 | 2008-01-08 08:52:42.0 | true | false | false | false | 2009-02-04 03:59:18.0 | 2009-02-04 03:59:18.0 | alterblocks | 1272505 | ST47 | false | NULL | NULL | 20134779 | {"unparsed":""} | 2020-06 | |