Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Paste
P8620
Get page IDs of articles translated by Toledo (1/2 scripts for T222154)
Active
Public
Actions
Authored by
•
chelsyx
on Jun 17 2019, 7:13 PM.
Edit Paste
Archive Paste
View Raw File
Subscribe
Mute Notifications
Award Token
Flag For Later
Tags
None
Subscribers
None
#! /bin/bash
DESCRIPTION
=
"Get page IDs of articles translated by Toledo"
THISSCRIPTFILE
=
`
basename
"
$0
"
`
RESULTSFILE
=
~/
${
THISSCRIPTFILE
%.*
}
_result.txt
{
date
;
echo
=
;
TZ
=
'America/Los_Angeles'
date
;
echo
generated by
$THISSCRIPTFILE
on
$HOSTNAME
;
d
=
"2019-03-18"
while
[
"
$d
"
!
=
2019
-06-15
]
do
echo
$d
DAY
=
$(
date -d
"
$d
"
'+%d'
)
MONTH
=
$(
date -d
"
$d
"
'+%m'
)
beeline --verbose
=
true
-e
"
SET hive.exec.dynamic.partition = true;
SET hive.exec.dynamic.partition.mode = nonstrict;
CREATE EXTERNAL TABLE IF NOT EXISTS chelsyx.toledo_pageid
(
\`page_id\` bigint COMMENT 'The ID of the article',
\`database_code\` string COMMENT 'The code of the wiki'
)
PARTITIONED BY (
\`year\` int COMMENT 'Unpadded year',
\`month\` int COMMENT 'Unpadded month',
\`day\` int COMMENT 'Unpadded day'
)
STORED AS PARQUET
LOCATION '/user/chelsyx/toledo_pageid'
;
INSERT INTO TABLE chelsyx.toledo_pageid partition(year,month,day)
select distinct r.page_id, w.database_code, r.year, r.month, r.day
from wmf.webrequest r join canonical_data.wikis w on CONCAT(r.pageview_info['project'], '.org') = w.domain_name
where
year = 2019
and month =
${
MONTH
}
and day =
${
DAY
}
and webrequest_source = 'text'
and is_pageview
and namespace_id=0
and x_analytics_map['translationengine'] = 'GT'
and parse_url(referer, 'QUERY') like '%client=srp%'
and (regexp_extract(parse_url(referer, 'QUERY'), '(^|[&?])hl=([^&]*)', 2) = 'id'
or regexp_extract(parse_url(referer, 'QUERY'), '(^|[&?])tl=([^&]*)', 2) = 'id')
;
"
d
=
$(
date -I -d
"
$d
+ 1 day"
)
done
}
&
>
$RESULTSFILE
Event Timeline
•
chelsyx
created this paste.
Jun 17 2019, 7:13 PM
•
chelsyx
changed the title of this paste from
Pageviews of articles on idwiki that may be affected by Toledo (script for T222154)
to
Get page IDs of articles translated by Toledo (1/2 scripts for T222154)
.
Jun 17 2019, 7:16 PM
•
chelsyx
mentioned this in
T222154: Determine the pageview (and if possible, search impression) impact of automatic SERP translations
.
Jun 17 2019, 7:26 PM
Log In to Comment