Page MenuHomePhabricator
Paste P8621

Pageviews of articles on idwiki whose other-language versions were translated by Toledo (2/2 script for T222154)
ActivePublic

Authored by chelsyx on Jun 17 2019, 7:21 PM.
# Daily idwiki pageviews (mobile web, user traffic referred by search engines)
# for 2018-09 through 2019-02, split by whether the idwiki article shares a
# wikidata item with a page listed in chelsyx.toledo_pageid.
# The result is written to data/idwiki_pv.tsv as a tab-separated file.
#
# Start a pyspark shell (it provides the `spark` session used below)
# pyspark2 --master yarn --executor-memory 16G --executor-cores 4 --driver-memory 16G

# a dataset created by joal that links page IDs with wikidata item ID. See https://phabricator.wikimedia.org/T215616
wikidataParquetPath = '/user/joal/wmf/data/wmf/wikidata/item_page_link/20190204'
spark.read.parquet(wikidataParquetPath).createOrReplaceTempView('wikidata')

# Notes on the query:
# * idwiki_matched_articles uses SELECT DISTINCT: several listed pages (in
#   different wikis) can map to the same wikidata item and therefore to the
#   same idwiki page; without DISTINCT each duplicate would multiply the
#   joined pageview rows and inflate the is_translated=true totals.
# * The pageview filters (namespace, project, agent type, referer class,
#   access method) live in WHERE, not in the join's ON clause. pageview_hourly
#   is the preserved side of the RIGHT JOIN, so predicates on it inside ON do
#   not remove rows; they would only turn non-matching rows into
#   is_translated=false, pulling traffic from every project/agent/access
#   method into that bucket.
# * The two date ranges are parenthesised as a unit so they AND correctly
#   with the other filters.
idwiki_pv = spark.sql("""
WITH toledo_articles AS (
select distinct page_id, database_code
from chelsyx.toledo_pageid
),
idwiki_matched_articles AS (
select distinct d2.page_id
from toledo_articles t join wikidata d1 on (t.page_id=d1.page_id and t.database_code=d1.wiki_db)
join wikidata d2 on (d1.item_id=d2.item_id and d2.wiki_db='idwiki' and d2.page_namespace=0)
)
select CONCAT(v.year, '-', LPAD(v.month, 2, '0'), '-', LPAD(v.day, 2, '0')) AS date,
m.page_id is not null AS is_translated,
sum(v.view_count) as pageviews
from idwiki_matched_articles m right join wmf.pageview_hourly v on (m.page_id=v.page_id)
where v.namespace_id=0 and v.project='id.wikipedia' and
v.agent_type='user' and v.referer_class = 'external (search engine)' and v.access_method = 'mobile web'
and (
(v.year=2018 and v.month>8)
or
(v.year=2019 and v.month<3)
)
group by v.year, v.month, v.day, m.page_id is not null
""")

# Collect the (small, already aggregated) result to the driver and save it.
idwiki_pv_df = idwiki_pv.toPandas()
idwiki_pv_df.to_csv('data/idwiki_pv.tsv', sep='\t', index=False)