!set outputformat table
# Notes:
# - /api/rest_v1/page/pdf/ is 22 characters, which is where the magic 23 comes from.
# - Lowercase the extracted namespace in the namespaces CTE as we can't use column aliases in
# certain clauses (see https://issues.apache.org/jira/browse/HIVE-1683).
# - The list of accepted namespaces was created by running the following while browsing
# https://en.wikipedia.org:
#
# Object.keys( mw.config.get( 'wgNamespaceIds' ) )
# .map( name => `"${name}"` )
# .join( ', ' )
with namespaces as (
select
lower(case when
instr(uri_path, "%3A") > 0
then
substring(uri_path, 23, instr(uri_path, "%3A") - 23)
else
"main"
end) as namespace
from
wmf.webrequest
where
year = 2018
and month = 11
and day = 25
and hour >= 0 and hour < 6
and normalized_host.project_family = "wikipedia"
and normalized_host.project = "en"
and uri_path like "/api/rest_v1/page/pdf/%"
)
select
namespace,
count(*) as n
from
namespaces
where
namespace in (
"media", "special", "main", "talk", "user", "user_talk", "wikipedia", "wikipedia_talk", "file",
"file_talk", "mediawiki", "mediawiki_talk", "template", "template_talk", "help", "help_talk",
"category", "category_talk", "portal", "portal_talk", "book", "book_talk", "draft",
"draft_talk", "timedtext", "timedtext_talk", "module", "module_talk", "gadget", "gadget_talk",
"gadget_definition", "gadget_definition_talk", "wp", "wt", "image", "image_talk", "project",
"project_talk"
)
group by
namespace
order by
n desc
limit
1000
;