Page MenuHomePhabricator

page_title_hourly.hql

Authored By
JAllemandou
Jul 3 2015, 2:47 PM
Size
1 KB
Referenced Files
None
Subscribers
None

page_title_hourly.hql

--
-- Script generating one day of page_title hourly datasets in gzipped tsv format (no headers).
-- This script purposedly exclude bots and undefined page_title ('-').
-- This script removes data points where view_count < 100 (k-anonymity).
-- Columns: day, time, project, language_variant, page_title, view_count
-- Usage: hive -f page_title_hourly.hql -d destination_directory=/tmp/test -d year=2015 -d month=7 -d day=1
--
SET hive.exec.compress.output=true;
SET mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
SET mapred.reduce.tasks = 1;
INSERT OVERWRITE DIRECTORY "${destination_directory}"
SELECT
CONCAT_WS("\t",
CONCAT(
LPAD(year, 4, "0"), "-",
LPAD(month, 2, "0"), "-",
LPAD(day, 2, "0")
),
CONCAT(LPAD(hour, 2, "0"), ":00:00"),
project,
language_variant,
page_title,
CAST(SUM(view_count) AS STRING)
) line
FROM wmf.pageview_hourly
WHERE year=${year}
AND month=${month}
AND day=${day}
AND agent_type = 'user'
AND page_title != '-'
GROUP BY
year,
month,
day,
hour,
project,
language_variant,
page_title
HAVING SUM(view_count) > 99
ORDER BY line
LIMIT 1000000000
;

File Metadata

Mime Type
text/plain
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
180870
Default Alt Text
page_title_hourly.hql (1 KB)

Event Timeline