While investigating for T258532, I found data duplication in July 2020 in the event.mediawiki_revision_create data. This duplication is probably due to the event producer, since the only fields that differ between the duplicated rows are meta.dt, meta.id and meta.request_id.
-- In presto
-- Verify that no meta.id value is shared by several rows.
-- Counts the distinct meta.id values that occur on more than one row among
-- the rows matching year = 2020, month = 9, day = 6; 0 means no duplicates.
WITH repeated_meta_ids AS (
    SELECT COUNT(*) AS occurrences
    FROM event.mediawiki_revision_create
    WHERE year = 2020
        AND month = 9
        AND day = 6
    GROUP BY meta.id
    HAVING COUNT(*) > 1
)
SELECT COUNT(*) AS dup_meta_ids
FROM repeated_meta_ids;
-- dup_meta_ids
-- --------------
-- 0
-- Check whether the same revision was recorded more than once.
-- Counts the (database, rev_id) pairs that occur on more than one row among
-- the rows matching year = 2020, month = 9, day = 6.
WITH repeated_revisions AS (
    SELECT COUNT(*) AS occurrences
    FROM event.mediawiki_revision_create
    WHERE year = 2020
        AND month = 9
        AND day = 6
    GROUP BY database, rev_id
    HAVING COUNT(*) > 1
)
SELECT COUNT(*) AS dup_rev_ids
FROM repeated_revisions;
-- dup_rev_ids
-- -------------
-- 176
-- Pick a handful of duplicated revisions to inspect by hand.
-- Returns up to 10 (database, rev_id) pairs that occur on more than one row
-- for year = 2020, month = 9, day = 6, with their row count.
-- There is no ORDER BY, so which 10 pairs are returned is arbitrary.
WITH revision_counts AS (
    SELECT
        database,
        rev_id,
        COUNT(*) AS rev_id_count_gt_1
    FROM event.mediawiki_revision_create
    WHERE year = 2020
        AND month = 9
        AND day = 6
    GROUP BY database, rev_id
)
SELECT
    database,
    rev_id,
    rev_id_count_gt_1
FROM revision_counts
WHERE rev_id_count_gt_1 > 1
LIMIT 10;
-- database | rev_id | rev_id_count_gt_1
-- ---------------+-----------+-------------------
-- mediawikiwiki | 4086810 | 2
-- dawiki | 10455662 | 2
-- enwiki | 977066159 | 2
-- ruwikinews | 8066753 | 2
-- commonswiki | 449919046 | 2
-- plwiki | 60787792 | 2
-- plwiki | 60790202 | 2
-- plwiki | 60787723 | 2
-- elwiki | 8421375 | 2
-- enwiki | 977066086 | 2
-- Take an example: show the meta struct of every row recorded for
-- enwiki revision 977066159 on 2020-09-06, to compare the duplicated events.
SELECT meta
FROM event.mediawiki_revision_create
WHERE database = 'enwiki'
    AND rev_id = 977066159
    AND year = 2020
    AND month = 9
    AND day = 6;
-- {domain=en.wikipedia.org, dt=2020-09-06T18:38:44Z, id=47b330c8-2524-4601-ac16-97e359074527, request_id=7454961e-f74a-4928-b141-be6c7120feb8, schema_uri=null, topic=null, uri=https://en.wikipedia.org/wiki/Silvia
-- {domain=en.wikipedia.org, dt=2020-09-06T18:38:44Z, id=4e0f81bc-e025-4fee-bd68-0d18bc5bc90e, request_id=525e6f86-0401-4cdf-9044-a36c1a5dd372, schema_uri=null, topic=null, uri=https://en.wikipedia.org/wiki/Silvia