page_title_hourly.hql
JAllemandou (joal)
Actions

Authored By

	JAllemandou
	Jul 3 2015, 2:47 PM

Size

1 KB

Referenced Files

None

Subscribers

None

page_title_hourly.hql
View Options

	--
	-- Script generating one day of page_title hourly datasets in gzipped tsv format (no headers).
	-- This script purposedly exclude bots and undefined page_title ('-').
	-- This script removes data points where view_count < 100 (k-anonymity).
	-- Columns: day, time, project, language_variant, page_title, view_count
	-- Usage: hive -f page_title_hourly.hql -d destination_directory=/tmp/test -d year=2015 -d month=7 -d day=1
	--


	SET hive.exec.compress.output=true;
	SET mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
	SET mapred.reduce.tasks = 1;

	INSERT OVERWRITE DIRECTORY "${destination_directory}"
	SELECT
	CONCAT_WS("\t",
	CONCAT(
	LPAD(year, 4, "0"), "-",
	LPAD(month, 2, "0"), "-",
	LPAD(day, 2, "0")
	),
	CONCAT(LPAD(hour, 2, "0"), ":00:00"),
	project,
	language_variant,
	page_title,
	CAST(SUM(view_count) AS STRING)
	) line
	FROM wmf.pageview_hourly
	WHERE year=${year}
	AND month=${month}
	AND day=${day}
	AND agent_type = 'user'
	AND page_title != '-'
	GROUP BY
	year,
	month,
	day,
	hour,
	project,
	language_variant,
	page_title
	HAVING SUM(view_count) > 99
	ORDER BY line
	LIMIT 1000000000
	;