Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Paste
P8621
Pageviews of idwiki articles whose other-language versions were translated by Toledo (script 2 of 2 for T222154)
Active
Public
Actions
Authored by
•
chelsyx
on Jun 17 2019, 7:21 PM.
Edit Paste
Archive Paste
View Raw File
Subscribe
Mute Notifications
Award Token
Flag For Later
Tags
None
Referenced Files
F29594284: raw.txt
Jun 17 2019, 7:21 PM
2019-06-17 19:21:15 (UTC+0)
Subscribers
None
# Run inside a pyspark shell (the `spark` session object is provided by the shell):
#   pyspark2 --master yarn --executor-memory 16G --executor-cores 4 --driver-memory 16G

# Dataset created by joal that links page IDs with Wikidata item IDs.
# See https://phabricator.wikimedia.org/T215616
wikidataParquetPath = '/user/joal/wmf/data/wmf/wikidata/item_page_link/20190204'

# Expose the parquet dataset to Spark SQL under the view name 'wikidata'.
(
    spark.read
    .parquet(wikidataParquetPath)
    .createOrReplaceTempView('wikidata')
)
# Daily pageviews on id.wikipedia (namespace 0, user traffic, mobile web,
# arriving from external search engines) between Sep 2018 and Feb 2019,
# split by whether the idwiki article is linked -- through a shared Wikidata
# item in the 'wikidata' temp view -- to a page listed in chelsyx.toledo_pageid.
#
# Notes on the query shape:
# * The pageview filters live in WHERE, not in the join's ON clause. With an
#   outer join, ON predicates that only touch the preserved side (pageview_hourly)
#   do not remove its rows; they only stop rows from matching. Keeping them in ON
#   would fold every other project / agent type / access method into the
#   is_translated = false bucket.
# * idwiki_matched_articles uses DISTINCT so that an idwiki page reached from
#   several source-wiki pages of the same Wikidata item appears once; otherwise
#   the join would multiply that page's view_count.
# * The two year/month ranges are wrapped in a single parenthesised group so the
#   OR cannot detach from the other AND-ed filters.
idwiki_pv = spark.sql("""
WITH toledo_articles AS (
    select distinct page_id, database_code
    from chelsyx.toledo_pageid
),
idwiki_matched_articles AS (
    select distinct d2.page_id
    from toledo_articles t
    join wikidata d1 on (t.page_id=d1.page_id and t.database_code=d1.wiki_db)
    join wikidata d2 on (d1.item_id=d2.item_id and d2.wiki_db='idwiki' and d2.page_namespace=0)
)
select CONCAT(v.year, '-', LPAD(v.month, 2, '0'), '-', LPAD(v.day, 2, '0')) AS date,
       m.page_id is not null AS is_translated,
       sum(v.view_count) as pageviews
from wmf.pageview_hourly v
left join idwiki_matched_articles m on (m.page_id=v.page_id)
where v.namespace_id=0
  and v.project='id.wikipedia'
  and v.agent_type='user'
  and v.referer_class = 'external (search engine)'
  and v.access_method = 'mobile web'
  and ((v.year=2018 and v.month>8) or (v.year=2019 and v.month<3))
group by v.year, v.month, v.day, m.page_id is not null
""")
# Collect the query result to the driver as a pandas DataFrame.
idwiki_pv_df = idwiki_pv.toPandas()

# Write it out tab-separated, without the pandas row index column.
idwiki_pv_df.to_csv(
    'data/idwiki_pv.tsv',
    sep='\t',
    index=False,
)
Event Timeline
•
chelsyx
created this paste.
Jun 17 2019, 7:21 PM
2019-06-17 19:21:15 (UTC+0)
•
chelsyx
mentioned this in
T222154: Determine the pageview (and if possible, search impression) impact of automatic SERP translations
.
Jun 17 2019, 7:26 PM
2019-06-17 19:26:31 (UTC+0)
Log In to Comment