Page MenuHomePhabricator
Paste P40208

editors_dataset_sample_data
ActivePublic

Authored by Mayakp.wiki on Nov 18 2022, 10:11 PM.
with
productive_edit as (
select date(from_iso8601_timestamp(replace(event_timestamp,' ','T'))) as dt,
cardinality(event_user_is_bot_by_historical) > 0 as is_bot,
-- instead of doing a complicated join to server side account creation here,
-- we need to incorporate data from ssac into mw history and
-- have a single authoritative user creation date
-- for now, using this:
date(from_iso8601_timestamp(replace(
coalesce(event_user_creation_timestamp, event_user_registration_timestamp),' ','T'
))) as user_creation_date,
event_user_id as user_id,
wiki_db,
-- note: for revision_tags logic below, maybe join to a proper tag classification
filter(
array_distinct(array[
if(cardinality(array_intersect(revision_tags, array['x', 'x1', 'x2'])) > 0, 'feature X', null),
if(cardinality(array_intersect(revision_tags, array['y', 'y1', 'y2'])) > 0, 'feature Y', null),
if(cardinality(array_intersect(revision_tags, array['visualeditor', 'y1', 'y2'])) > 0, 'visual edit', null),
if(cardinality(array_intersect(revision_tags, array['discussiontools','discussiontools-visual', 'discussiontools-reply', 'discussiontools-source'])) > 0, 'discussion tools', null),
if(cardinality(array_intersect(revision_tags, array['mobile edit', 'mobile web edit', 'ios app edit', 'mobile app edit', 'android app edit'])) > 0, 'mobile platform', null),
if(cardinality(array_intersect(revision_tags, array['mobile edit', 'mobile web edit', 'ios app edit', 'mobile app edit', 'android app edit'])) = 0, 'desktop platform', null)
]),
x -> x is not null
) as tags_array,
case
when contains(revision_tags, 'mobile edit') then 'mobile'
else 'desktop'
end as platform, -- note: join to a proper tag classification
case
when contains(revision_tags, 'discussion tools') then 'discussion tools'
when contains(revision_tags, 'visual edit') then 'visual editor'
else 'no feature'
end as editing_feature,
event_user_groups AS user_role,
case
when event_user_is_created_by_self= true then 'self'
when event_user_is_created_by_system= true then 'system'
when event_user_is_created_by_peer= true then 'peer'
else 'bot'
end as account_created_by
from wmf.mediawiki_history
where event_entity = 'revision'
-- for now, filter out anon users
and event_user_id is not null
and event_type = 'create'
and wiki_db in ('rowiki', 'huwiki', 'etwiki')
and snapshot = '2022-09'
-- if it's reverted within 48 hours, don't count it
and (revision_seconds_to_identity_revert is null
or revision_seconds_to_identity_revert > 172800)
-- same for page deletion
-- TODO: validate that revision_deleted_by_page_deletion_timestamp is always after event_timestamp
and (revision_deleted_by_page_deletion_timestamp is null
or date_diff(
'second',
from_iso8601_timestamp(replace(event_timestamp,' ','T')),
from_iso8601_timestamp(replace(revision_deleted_by_page_deletion_timestamp,' ','T'))
) > 172800
)
limit 1000
),
daily_summary as (
select dt,
is_bot,
-- this should be unique, but just in case it's not I don't want to group by it
min(user_creation_date) as user_creation_date,
user_id,
wiki_db,
array_distinct(flatten(array_agg(tags_array))) as tags_array,
platform,
editing_feature,
count() as edit_count,
-- same here, this should be a unique value, but I don't want to group by it so I'm aggregating
-- TODO: this is NOT unique, find out why
max(date_diff('day', user_creation_date, dt)) as days_since_user_create,
user_role,
account_created_by
from productive_edit
-- TODO: open up phab task for user_creation_date IS NULL
where user_creation_date is not null
-- TODO: open up phab task for user_creation_date > dt
and user_creation_date <= dt
group by dt, is_bot, user_id, wiki_db, user_role, account_created_by, platform, editing_feature
),
with_cummulative_edit_count as (
select dt,
is_bot,
user_creation_date,
user_id,
wiki_db,
sum(edit_count) over (order by dt) as cummulative_edit_count,
dt = user_creation_date as account_created_today,
array_distinct(flatten(array_agg(tags_array))) as tags_array,
editing_feature,
platform,
days_since_user_create,
edit_count,
user_role,
account_created_by
from daily_summary
group by dt, is_bot, user_creation_date, user_id, wiki_db, days_since_user_create, edit_count, user_role, account_created_by, platform, editing_feature
)
select dt,
is_bot,
user_creation_date,
user_id,
wiki_db,
case
when cummulative_edit_count < 5 then '1-4'
when cummulative_edit_count < 100 then '5-99'
when cummulative_edit_count < 1000 then '100-999'
when cummulative_edit_count < 10000 then '1000-9999'
else '10000+'
end as career_edit_bucket,
account_created_today,
platform,
case
when days_since_user_create < 1 then 'Under 1 day'
when days_since_user_create < 8 then '1-7 days'
when days_since_user_create < 31 then '8-30 days'
when days_since_user_create < 91 then '31 to 90 days'
when days_since_user_create < 365 then '90 days to 1 year'
when days_since_user_create < 1095 then '1-3 years'
when days_since_user_create < 3650 then '3-10 years'
else 'Over 10 years'
end as tenure_bucket,
editing_feature,
edit_count,
user_role,
account_created_by,
tags_array
from with_cummulative_edit_count
;
/*
-----------------------------------------------+----------------+
wiki_db | varchar |
event_entity | varchar |
event_type | varchar |
event_timestamp | varchar |
event_comment | varchar |
event_user_id | bigint |
event_user_text_historical | varchar |
event_user_text | varchar |
event_user_blocks_historical | array(varchar) |
event_user_blocks | array(varchar) |
event_user_groups_historical | array(varchar) |
event_user_groups | array(varchar) |
event_user_is_bot_by_historical | array(varchar) |
event_user_is_bot_by | array(varchar) |
event_user_is_created_by_self | boolean |
event_user_is_created_by_system | boolean |
event_user_is_created_by_peer | boolean |
event_user_is_anonymous | boolean |
event_user_registration_timestamp | varchar |
event_user_creation_timestamp | varchar |
event_user_first_edit_timestamp | varchar |
event_user_revision_count | bigint |
event_user_seconds_since_previous_revision | bigint |
page_id | bigint |
page_artificial_id | varchar |
page_title_historical | varchar |
page_title | varchar |
page_namespace_historical | integer |
page_namespace_is_content_historical | boolean |
page_namespace | integer |
page_namespace_is_content | boolean |
page_is_redirect | boolean |
page_is_deleted | boolean |
page_creation_timestamp | varchar |
page_first_edit_timestamp | varchar |
page_revision_count | bigint |
page_seconds_since_previous_revision | bigint |
user_id | bigint |
user_text_historical | varchar |
user_text | varchar |
user_blocks_historical | array(varchar) |
user_blocks | array(varchar) |
user_groups_historical | array(varchar) |
user_groups | array(varchar) |
user_is_bot_by_historical | array(varchar) |
user_is_bot_by | array(varchar) |
user_is_created_by_self | boolean |
user_is_created_by_system | boolean |
user_is_created_by_peer | boolean |
user_is_anonymous | boolean |
user_registration_timestamp | varchar | from user table
user_creation_timestamp | varchar | from logging table
user_first_edit_timestamp | varchar |
revision_id | bigint |
revision_parent_id | bigint |
revision_minor_edit | boolean |
revision_deleted_parts | array(varchar) |
revision_deleted_parts_are_suppressed | boolean |
revision_text_bytes | bigint |
revision_text_bytes_diff | bigint |
revision_text_sha1 | varchar |
revision_content_model | varchar |
revision_content_format | varchar |
revision_is_deleted_by_page_deletion | boolean |
revision_deleted_by_page_deletion_timestamp | varchar |
revision_is_identity_reverted | boolean |
revision_first_identity_reverting_revision_id | bigint |
revision_seconds_to_identity_revert | bigint |
revision_is_identity_revert | boolean |
revision_is_from_before_page_creation | boolean |
revision_tags | array(varchar) |
snapshot | varchar |
*/