raw.txt
No One
Actions

Authored By

	• chelsyx
	Jun 17 2019, 7:13 PM

Size

1 KB

Referenced Files

None

Subscribers

None

raw.txt
View Options

	#! /bin/bash
	DESCRIPTION="Get page IDs of articles translated by Toledo"
	THISSCRIPTFILE=`basename "$0"`
	RESULTSFILE=~/${THISSCRIPTFILE%.*}_result.txt
	{ date ; echo = ; TZ='America/Los_Angeles' date ; echo generated by $THISSCRIPTFILE on $HOSTNAME ;
	d="2019-03-18"
	while [ "$d" != 2019-06-15 ]
	do
	echo $d
	DAY=$(date -d "$d" '+%d')
	MONTH=$(date -d "$d" '+%m')
	beeline --verbose=true -e "
	SET hive.exec.dynamic.partition = true;
	SET hive.exec.dynamic.partition.mode = nonstrict;
	CREATE EXTERNAL TABLE IF NOT EXISTS chelsyx.toledo_pageid
	(
	\`page_id\` bigint COMMENT 'The ID of the article',
	\`database_code\` string COMMENT 'The code of the wiki'
	)
	PARTITIONED BY (
	\`year\` int COMMENT 'Unpadded year',
	\`month\` int COMMENT 'Unpadded month',
	\`day\` int COMMENT 'Unpadded day'
	)
	STORED AS PARQUET
	LOCATION '/user/chelsyx/toledo_pageid'
	;

	INSERT INTO TABLE chelsyx.toledo_pageid partition(year,month,day)

	select distinct r.page_id, w.database_code, r.year, r.month, r.day
	from wmf.webrequest r join canonical_data.wikis w on CONCAT(r.pageview_info['project'], '.org') = w.domain_name
	where
	year = 2019
	and month = ${MONTH}
	and day = ${DAY}
	and webrequest_source = 'text'
	and is_pageview
	and namespace_id=0
	and x_analytics_map['translationengine'] = 'GT'
	and parse_url(referer, 'QUERY') like '%client=srp%'
	and (regexp_extract(parse_url(referer, 'QUERY'), '(^\|[&?])hl=([^&]*)', 2) = 'id'
	or regexp_extract(parse_url(referer, 'QUERY'), '(^\|[&?])tl=([^&]*)', 2) = 'id')
	;
	"
	d=$(date -I -d "$d + 1 day")
	done
	} &> $RESULTSFILE