Paste P7945

Query: top 100+ viewed mainspace pages per wiki in 2018
ActivePublic

Authored by Tbayer on Mon, Dec 31, 7:56 PM.
SET mapred.job.queue.name=nice;
WITH counted AS ( -- adapted from https://phabricator.wikimedia.org/T183903#4824489 :
SELECT project,
page_title,
SUM(view_count) as views, -- Some mainspace views are wrongly logged without namespace_id (NULL)
SUM(IF(namespace_id = 0,view_count,0)) AS ns0views,
SUM(IF(access_method != 'desktop', view_count, 0))/SUM(view_count) AS mobile_ratio
FROM wmf.pageview_hourly
WHERE year=2018
AND agent_type = 'user'
AND page_title != '-'
GROUP BY project, page_title
HAVING ns0views >= 100 -- Some small projects may have very low traffic pages in the top X
),
ns0ranked AS (
SELECT project,
page_title,
views,
ns0views,
mobile_ratio,
rank() OVER (PARTITION BY project ORDER BY ns0views DESC) as ranking
FROM counted
)
SELECT project,
REGEXP_REPLACE(page_title,'_',' ') AS page,
CONCAT('https://', project, '.org/wiki/', page_title) AS desktopurl,
views,
ROUND(100 * mobile_ratio, 2) AS mobile_percentage
FROM ns0ranked
WHERE ranking <= 150 -- In case ranking by all views differs a bit (cf. above)
ORDER BY project ASC, views DESC LIMIT 1000000
Tbayer created this paste.Mon, Dec 31, 7:56 PM
Tbayer added a comment.Mon, Jan 7, 8:55 PM

To use this (on e.g. stat1004), save it as a bash script file (e.g. using nano myhivequery.bash), make the file executable, then launch it in a screen session: screen ./myhivequery.bash