Page MenuHomePhabricator
Paste P5371

query repeats by session fro 2/13/17 to 5/2/17
ActivePublic

Authored by EBernhardson on May 3 2017, 7:06 PM.
hive (default)> add jar hdfs://analytics-hadoop/wmf/refinery/current/artifacts/refinery-hive.jar;
hive (default)> create temporary function stemmer as 'org.wikimedia.analytics.refinery.hive.StemmerUDF';
hive (default)> select
wikiid,
count(distinct norm_query) as all_distinct_norm_q,
sum(num_idents) as all_num_idents,
count(distinct if(num_idents >= 10, norm_query, "")) as 10repeats_distinct_norm_q,
sum(if(num_idents >= 10, num_idents, 0)) as 10repeats_num_idents,
count(distinct if(num_idents >= 35, norm_query, "")) as 35repeats_distinct_norm_q,
sum(if(num_idents >= 35, num_idents, 0)) as 35repeats_num_idents
from (
select wikiid, stemmer(query, substring(wikiid, 1, 2)) as norm_query, count(distinct session_id) as num_idents
from discovery.query_clicks_daily
where year=2017
and array_contains(array('enwiki', 'dewiki', 'frwiki'), wikiid)
and q_by_ip_day <= 50
group by wikiid, stemmer(query, substring(wikiid, 1, 2))
) x
group by wikiid;
| wikiid | all_distinct_norm_q | all_num_idents | 10repeats_distinct_norm_q | 10repeats_num_idents | 35repeats_distinct_norm_q | 35repeats_num_idents
| dewiki | 3,446,793 | 9,044,803 | 122,045 | 3,666,926 (40.5%) | 24,152 | 2,043,388 (22.5%)
| enwiki | 14,902,929 | 35,618,078 | 424,737 | 13,208,461 (37.0%) | 80,317 | 7,544,884 (21.1%)
| frwiki | 1,700,603 | 3,406,500 | 36,772 | 906,881 (26.6%) | 5,367 | 401,268 (11.7%)