Page MenuHomePhabricator
Authored By
chelsyx
Jun 17 2019, 7:13 PM
Size
1 KB
Referenced Files
None
Subscribers
None
#! /bin/bash
DESCRIPTION="Get page IDs of articles translated by Toledo"
THISSCRIPTFILE=`basename "$0"`
RESULTSFILE=~/${THISSCRIPTFILE%.*}_result.txt
{ date ; echo = ; TZ='America/Los_Angeles' date ; echo generated by $THISSCRIPTFILE on $HOSTNAME ;
d="2019-03-18"
while [ "$d" != 2019-06-15 ]
do
echo $d
DAY=$(date -d "$d" '+%d')
MONTH=$(date -d "$d" '+%m')
beeline --verbose=true -e "
SET hive.exec.dynamic.partition = true;
SET hive.exec.dynamic.partition.mode = nonstrict;
CREATE EXTERNAL TABLE IF NOT EXISTS chelsyx.toledo_pageid
(
\`page_id\` bigint COMMENT 'The ID of the article',
\`database_code\` string COMMENT 'The code of the wiki'
)
PARTITIONED BY (
\`year\` int COMMENT 'Unpadded year',
\`month\` int COMMENT 'Unpadded month',
\`day\` int COMMENT 'Unpadded day'
)
STORED AS PARQUET
LOCATION '/user/chelsyx/toledo_pageid'
;
INSERT INTO TABLE chelsyx.toledo_pageid partition(year,month,day)
select distinct r.page_id, w.database_code, r.year, r.month, r.day
from wmf.webrequest r join canonical_data.wikis w on CONCAT(r.pageview_info['project'], '.org') = w.domain_name
where
year = 2019
and month = ${MONTH}
and day = ${DAY}
and webrequest_source = 'text'
and is_pageview
and namespace_id=0
and x_analytics_map['translationengine'] = 'GT'
and parse_url(referer, 'QUERY') like '%client=srp%'
and (regexp_extract(parse_url(referer, 'QUERY'), '(^|[&?])hl=([^&]*)', 2) = 'id'
or regexp_extract(parse_url(referer, 'QUERY'), '(^|[&?])tl=([^&]*)', 2) = 'id')
;
"
d=$(date -I -d "$d + 1 day")
done
} &> $RESULTSFILE

File Metadata

Mime Type
text/plain; charset=utf-8
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
7648800
Default Alt Text
raw.txt (1 KB)

Event Timeline