Page MenuHomePhabricator
Authored By
mpopov
Aug 16 2017, 9:43 PM
Size
3 KB
Referenced Files
None
Subscribers
None

namespaces.hql

USE wmf_raw;
ADD JAR hdfs:///wmf/refinery/current/artifacts/refinery-hive.jar;
CREATE TEMPORARY FUNCTION is_spider as 'org.wikimedia.analytics.refinery.hive.IsSpiderUDF';
CREATE TEMPORARY FUNCTION ua_parse as 'org.wikimedia.analytics.refinery.hive.GetUAPropertiesUDF';
CREATE TEMPORARY FUNCTION host_parse as 'org.wikimedia.analytics.refinery.hive.GetHostPropertiesUDF';
WITH profile_all AS (
-- We need to construct a '&ns0=1&...&ns####=1& that covers all the namespaces within a wiki when the user uses the "All" search profile
SELECT
dbname AS wiki,
CONCAT('&ns0', REGEXP_REPLACE(CONCAT('ns', CONCAT_WS('ns', COLLECT_SET(CAST(namespace AS STRING)))), 'ns', '=1&ns')) AS ns_qs
FROM mediawiki_project_namespace_map
WHERE snapshot = '2017-07' AND namespace > 0
GROUP BY dbname
),
requests AS (
SELECT
id, wiki_id,
CASE WHEN INSTR(payload_qs, 'profile=advanced') > 0 THEN payload_qs
-- Build a new "queryString" for searches using the non-advanced profiles:
WHEN (INSTR(payload_qs, 'profile=') = 0 OR INSTR(payload_qs, 'profile=default') > 0) THEN CONCAT(payload_qs, '&ns0=1')
WHEN INSTR(payload_qs, 'profile=images') > 0 THEN CONCAT(payload_qs, '&ns6=1')
WHEN INSTR(payload_qs, 'profile=all') > 0 THEN CONCAT(payload_qs, profile_all.ns_qs)
END AS query_string
FROM (
SELECT id AS id, wikiid AS wiki_id, payload['queryString'] AS payload_qs
FROM CirrusSearchRequestSet
WHERE
year = ${year} AND month = ${month} AND day = ${day}
AND source = 'web' AND INSTR(payload['queryString'], 'fulltext=1') > 0
-- Filter out searches with weird profiles like Translations and Discussions ("profile=thread"):
AND (INSTR(payload['queryString'], 'profile=') = 0 OR payload['queryString'] RLIKE 'profile=((advanced)|(default)|(images)|(all))')
-- Remove possible bots:
AND NOT (
useragent IS NULL
OR useragent = ''
OR ua_parse(useragent)['device_family'] = 'Spider'
OR is_spider(useragent)
OR ip = '127.0.0.1'
OR useragent RLIKE 'https?://'
OR INSTR(useragent, 'www.') > 0
OR INSTR(useragent, 'github') > 0
OR LOWER(useragent) RLIKE '([a-z0-9._%-]+@[a-z0-9.-]+\\.(com|us|net|org|edu|gov|io|ly|co|uk))'
OR (
ua_parse(useragent)['browser_family'] = 'Other'
AND ua_parse(useragent)['device_family'] = 'Other'
AND ua_parse(useragent)['os_family'] = 'Other'
)
)
) csrs
LEFT JOIN profile_all ON csrs.wiki_id = profile_all.wiki
),
searches AS (
SELECT DISTINCT
id, wiki_id,
STR_TO_MAP(query_string, '&', '=') AS query_map
FROM requests
),
exploded_searches AS (
SELECT
s.id, s.wiki_id,
exp.key AS query_key,
exp.val AS query_value
FROM searches s
LATERAL VIEW EXPLODE(s.query_map) exp AS key, val
),
wiki_map AS (
SELECT DISTINCT
dbname AS wiki,
host_parse(hostname) AS normalized_hostname,
CONCAT('ns', namespace) AS ns,
IF(namespace = 0, "Article", namespace_canonical_name) AS canonical_namespace
FROM mediawiki_project_namespace_map
WHERE snapshot = '2017-07'
-- Only include namespaces that have an ASCII canonical name (ns0's name is "" instead of "Article" in the db):
AND (namespace_canonical_name RLIKE '^[A-Za-z\\s]+$' OR namespace = 0)
)
SELECT
project, language,
id AS search_id,
namespace AS namespace_searched
FROM (
SELECT
es.id AS id, es.wiki_id AS wiki,
wm.normalized_hostname.project_class AS project,
wm.normalized_hostname.project AS language,
wm.canonical_namespace AS namespace
FROM exploded_searches es
LEFT JOIN wiki_map wm
ON es.wiki_id = wm.wiki
AND es.query_key = wm.ns
) ns
WHERE namespace IS NOT NULL;

File Metadata

Mime Type
text/plain
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
4855091
Default Alt Text
namespaces.hql (3 KB)

Event Timeline