Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F9090851
namespaces.hql
mpopov (Mikhail Popov)
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Authored By
mpopov
Aug 16 2017, 9:43 PM
2017-08-16 21:43:51 (UTC+0)
Size
3 KB
Referenced Files
None
Subscribers
None
namespaces.hql
View Options
USE
wmf_raw
;
ADD
JAR
hdfs
:
///
wmf
/
refinery
/
current
/
artifacts
/
refinery
-
hive
.
jar
;
CREATE
TEMPORARY
FUNCTION
is_spider
as
'org.wikimedia.analytics.refinery.hive.IsSpiderUDF'
;
CREATE
TEMPORARY
FUNCTION
ua_parse
as
'org.wikimedia.analytics.refinery.hive.GetUAPropertiesUDF'
;
CREATE
TEMPORARY
FUNCTION
host_parse
as
'org.wikimedia.analytics.refinery.hive.GetHostPropertiesUDF'
;
WITH
profile_all
AS
(
-- We need to construct a '&ns0=1&...&ns####=1& that covers all the namespaces within a wiki when the user uses the "All" search profile
SELECT
dbname
AS
wiki
,
CONCAT
(
'&ns0'
,
REGEXP_REPLACE
(
CONCAT
(
'ns'
,
CONCAT_WS
(
'ns'
,
COLLECT_SET
(
CAST
(
namespace
AS
STRING
)))),
'ns'
,
'=1&ns'
))
AS
ns_qs
FROM
mediawiki_project_namespace_map
WHERE
snapshot
=
'2017-07'
AND
namespace
>
0
GROUP
BY
dbname
),
requests
AS
(
SELECT
id
,
wiki_id
,
CASE
WHEN
INSTR
(
payload_qs
,
'profile=advanced'
)
>
0
THEN
payload_qs
-- Build a new "queryString" for searches using the non-advanced profiles:
WHEN
(
INSTR
(
payload_qs
,
'profile='
)
=
0
OR
INSTR
(
payload_qs
,
'profile=default'
)
>
0
)
THEN
CONCAT
(
payload_qs
,
'&ns0=1'
)
WHEN
INSTR
(
payload_qs
,
'profile=images'
)
>
0
THEN
CONCAT
(
payload_qs
,
'&ns6=1'
)
WHEN
INSTR
(
payload_qs
,
'profile=all'
)
>
0
THEN
CONCAT
(
payload_qs
,
profile_all
.
ns_qs
)
END
AS
query_string
FROM
(
SELECT
id
AS
id
,
wikiid
AS
wiki_id
,
payload
[
'queryString'
]
AS
payload_qs
FROM
CirrusSearchRequestSet
WHERE
year
=
${
year
}
AND
month
=
${
month
}
AND
day
=
${
day
}
AND
source
=
'web'
AND
INSTR
(
payload
[
'queryString'
],
'fulltext=1'
)
>
0
-- Filter out searches with weird profiles like Translations and Discussions ("profile=thread"):
AND
(
INSTR
(
payload
[
'queryString'
],
'profile='
)
=
0
OR
payload
[
'queryString'
]
RLIKE
'profile=((advanced)|(default)|(images)|(all))'
)
-- Remove possible bots:
AND
NOT
(
useragent
IS
NULL
OR
useragent
=
''
OR
ua_parse
(
useragent
)[
'device_family'
]
=
'Spider'
OR
is_spider
(
useragent
)
OR
ip
=
'127.0.0.1'
OR
useragent
RLIKE
'https?://'
OR
INSTR
(
useragent
,
'www.'
)
>
0
OR
INSTR
(
useragent
,
'github'
)
>
0
OR
LOWER
(
useragent
)
RLIKE
'([a-z0-9._%-]+@[a-z0-9.-]+\\.(com|us|net|org|edu|gov|io|ly|co|uk))'
OR
(
ua_parse
(
useragent
)[
'browser_family'
]
=
'Other'
AND
ua_parse
(
useragent
)[
'device_family'
]
=
'Other'
AND
ua_parse
(
useragent
)[
'os_family'
]
=
'Other'
)
)
)
csrs
LEFT
JOIN
profile_all
ON
csrs
.
wiki_id
=
profile_all
.
wiki
),
searches
AS
(
SELECT
DISTINCT
id
,
wiki_id
,
STR_TO_MAP
(
query_string
,
'&'
,
'='
)
AS
query_map
FROM
requests
),
exploded_searches
AS
(
SELECT
s
.
id
,
s
.
wiki_id
,
exp
.
key
AS
query_key
,
exp
.
val
AS
query_value
FROM
searches
s
LATERAL
VIEW
EXPLODE
(
s
.
query_map
)
exp
AS
key
,
val
),
wiki_map
AS
(
SELECT
DISTINCT
dbname
AS
wiki
,
host_parse
(
hostname
)
AS
normalized_hostname
,
CONCAT
(
'ns'
,
namespace
)
AS
ns
,
IF
(
namespace
=
0
,
"Article"
,
namespace_canonical_name
)
AS
canonical_namespace
FROM
mediawiki_project_namespace_map
WHERE
snapshot
=
'2017-07'
-- Only include namespaces that have an ASCII canonical name (ns0's name is "" instead of "Article" in the db):
AND
(
namespace_canonical_name
RLIKE
'^[A-Za-z\\s]+$'
OR
namespace
=
0
)
)
SELECT
project
,
language
,
id
AS
search_id
,
namespace
AS
namespace_searched
FROM
(
SELECT
es
.
id
AS
id
,
es
.
wiki_id
AS
wiki
,
wm
.
normalized_hostname
.
project_class
AS
project
,
wm
.
normalized_hostname
.
project
AS
language
,
wm
.
canonical_namespace
AS
namespace
FROM
exploded_searches
es
LEFT
JOIN
wiki_map
wm
ON
es
.
wiki_id
=
wm
.
wiki
AND
es
.
query_key
=
wm
.
ns
)
ns
WHERE
namespace
IS
NOT
NULL
;
File Metadata
Details
Attached
Mime Type
text/plain
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
4855091
Default Alt Text
namespaces.hql (3 KB)
Attached To
Mode
T165861: Use search log to find currently existing namespace combinations
Attached
Detach File
Event Timeline
Log In to Comment