Page MenuHomePhabricator
Paste P8620

Get page IDs of articles translated by Toledo (1/2 scripts for T222154)
ActivePublic

Authored by chelsyx on Jun 17 2019, 7:13 PM.
Tags
None
Referenced Files
F29594270: raw.txt
Jun 17 2019, 7:13 PM
Subscribers
None
#! /bin/bash
DESCRIPTION="Get page IDs of articles translated by Toledo"
THISSCRIPTFILE=`basename "$0"`
RESULTSFILE=~/${THISSCRIPTFILE%.*}_result.txt
{ date ; echo = ; TZ='America/Los_Angeles' date ; echo generated by $THISSCRIPTFILE on $HOSTNAME ;
d="2019-03-18"
while [ "$d" != 2019-06-15 ]
do
echo $d
DAY=$(date -d "$d" '+%d')
MONTH=$(date -d "$d" '+%m')
beeline --verbose=true -e "
SET hive.exec.dynamic.partition = true;
SET hive.exec.dynamic.partition.mode = nonstrict;
CREATE EXTERNAL TABLE IF NOT EXISTS chelsyx.toledo_pageid
(
\`page_id\` bigint COMMENT 'The ID of the article',
\`database_code\` string COMMENT 'The code of the wiki'
)
PARTITIONED BY (
\`year\` int COMMENT 'Unpadded year',
\`month\` int COMMENT 'Unpadded month',
\`day\` int COMMENT 'Unpadded day'
)
STORED AS PARQUET
LOCATION '/user/chelsyx/toledo_pageid'
;
INSERT INTO TABLE chelsyx.toledo_pageid partition(year,month,day)
select distinct r.page_id, w.database_code, r.year, r.month, r.day
from wmf.webrequest r join canonical_data.wikis w on CONCAT(r.pageview_info['project'], '.org') = w.domain_name
where
year = 2019
and month = ${MONTH}
and day = ${DAY}
and webrequest_source = 'text'
and is_pageview
and namespace_id=0
and x_analytics_map['translationengine'] = 'GT'
and parse_url(referer, 'QUERY') like '%client=srp%'
and (regexp_extract(parse_url(referer, 'QUERY'), '(^|[&?])hl=([^&]*)', 2) = 'id'
or regexp_extract(parse_url(referer, 'QUERY'), '(^|[&?])tl=([^&]*)', 2) = 'id')
;
"
d=$(date -I -d "$d + 1 day")
done
} &> $RESULTSFILE

Event Timeline

chelsyx changed the title of this paste from Pageviews of articles on idwiki that may be affected by Toledo (script for T222154) to Get page IDs of articles translated by Toledo (1/2 scripts for T222154).Jun 17 2019, 7:16 PM