Page MenuHomePhabricator

Analyze the varience of user-agent's, country, and other useful metrics of google refered traffic with and without a search query available in referer
Closed, ResolvedPublic4 Story Points

Description

One possible way to build out a set of queries that are useful to users would be to use queries that users use on bing and google to end up at our site.

Many users coming to wikipedia don't include the search query in their referrer though, perhaps because they both use <meta name="referrer" content="origin">. It would be interesting to analyze what the difference is between the requests that have a search term in the query and those that do not, to determine if those queries could be used as a representative sample.

Event Timeline

Restricted Application added subscribers: StudiesWorld, Aklapper. · View Herald TranscriptFeb 26 2016, 12:36 AM

Potentially relevant,

            domain  referrals     w/q   % w/q
       cse.google.pl      227      227 1.000
       cse.google.nl      216      216 1.000
       cse.google.hu      288      288 1.000
       cse.google.fr      218      218 1.000
       cse.google.de      173      173 1.000
    cse.google.co.uk      406      406 1.000
   cse.google.com.br      156      156 1.000
    cse.google.co.il      322      322 1.000
       cse.google.at      147      147 1.000
      cse.google.com     3675     3670 0.999
www.google.interia.pl      295      278 0.942
       www.google.cd     6696     6143 0.917
   www.google.com.ng    59269    52170 0.880
       www.google.bi      357      311 0.871
       www.google.td      989      857 0.867
       www.google.cg     2359     2021 0.857
encrypted.google.com     3633     3101 0.854
       www.google.bf     1954     1653 0.846
       www.google.bj     2725     2274 0.834
   www.google.com.et     3221     2646 0.821
       www.google.rw     2324     1884 0.811
       www.google.tg      946      735 0.777
   www.google.com.sl      630      479 0.760
   www.google.com.gh    18496    13952 0.754
    www.google.co.tz     7419     5568 0.751
    www.google.co.zm     3026     2197 0.726
       www.google.mg     1188      829 0.698
    www.google.co.mz     5009     3480 0.695
   www.google.com.na     2537     1735 0.684
    www.google.co.zw     2343     1520 0.649
       www.google.ml     1883     1197 0.636
   www.google.com.bd     2191     1377 0.628
       www.google.cm     4108     2393 0.583
    www.google.co.za    83821    48358 0.577
      www.google.cat      778      406 0.522
   www.google.com.mm      428      223 0.521
       www.google.gm      260      135 0.519
 images.google.co.uk      174       90 0.517
       www.google.ci      431      211 0.490
   www.google.com.cu      366      172 0.470
   www.google.com.af     1925      860 0.447
    www.google.co.ls      131       58 0.443
   www.google.com.bn      251      110 0.438
       www.google.sr      845      368 0.436
       www.google.bt      119       50 0.420
    www.google.co.id    30137    12387 0.411
       www.google.so      843      346 0.410
       www.google.ht     4343     1735 0.399
    www.google.co.bw      849      332 0.391
    www.google.co.in   304728   109728 0.360
   images.google.com     5104     1788 0.350
   www.google.com.tj      525      180 0.343
       www.google.sn     4019     1341 0.334
 plus.url.google.com      105       30 0.286
    www.google.co.ke      106       30 0.283
       www.google.dj      321       87 0.271
   www.google.com.ly     5985     1529 0.255
   www.google.com.vn     2678      652 0.243
       www.google.lk     2916      701 0.240
   www.google.com.pg      440      102 0.232
   www.google.com.pk    40971     9017 0.220
    images.google.it      238       50 0.210
   www.google.com.np      937      181 0.193
   www.google.com.py     8396     1597 0.190
   www.google.com.ph    17451     3284 0.188
    images.google.fr    39138     7037 0.180
   www.google.com.my     2735      463 0.169
   www.google.com.bo    24065     4046 0.168
       www.google.de   988321   165395 0.167
    www.google.co.kr     7588     1252 0.165
    images.google.es      154       25 0.162
    www.google.co.ve   134479    21594 0.161
images.google.com.ng      890      143 0.161
       www.google.je      504       78 0.155
   www.google.com.ni     9348     1445 0.155
   www.google.com.qa      285       44 0.154
       www.google.cv      645       98 0.152
    www.google.co.jp   137912    20966 0.152
images.google.com.et      107       16 0.150
       www.google.mu     2320      345 0.149
   www.google.com.eg    61127     9086 0.149
    images.google.pl      108       16 0.148
   www.google.com.bh     3713      544 0.147
   www.google.com.pa     1162      170 0.146
       www.google.at   159372    23091 0.145
       www.google.jo    16813     2429 0.144
   www.google.com.kh      336       47 0.140
       www.google.nl   289281    40196 0.139
       www.google.fi   115331    16005 0.139
   www.google.com.lb    10762     1499 0.139
       www.google.az    18538     2541 0.137
   www.google.com.bz      822      112 0.136
       www.google.it   777786   104333 0.134
   www.google.com.hk    22805     3028 0.133
       www.google.bs     1214      158 0.130
images.google.com.eg      101       13 0.129
   www.google.com.ua   198162    24737 0.125
   www.google.com.ag      463       58 0.125
       www.google.dz    66236     8115 0.123
   www.google.com.sv    23628     2914 0.123
images.google.com.tr      154       19 0.123
   www.google.com.ec    38221     4636 0.121
       www.google.by    38121     4624 0.121
   www.google.com.tr   279622    33558 0.120
       www.google.tn    17435     2071 0.119
   www.google.com.br   410470    48747 0.119
       www.google.al     8071      957 0.119
   www.google.com.ar   154236    17863 0.116
 images.google.co.tz      139       16 0.115
       www.google.fr   868807    99331 0.114
   www.google.com.sa    66806     7633 0.114
   www.google.com.jm     5692      646 0.113
   www.google.com.do    31532     3570 0.113
       www.google.ru   389303    43448 0.112
   www.google.com.vc      492       55 0.112
       www.google.be   148805    16635 0.112
       www.google.si    24532     2735 0.111
       www.google.gy      914      101 0.111
    www.google.co.ma     4062      452 0.111
       www.google.ae    34878     3874 0.111
       www.google.cz    97116    10580 0.109
    images.google.cd      129       14 0.109
       www.google.ps     7024      761 0.108
       www.google.iq    21147     2275 0.108
       www.google.ad      522       56 0.107
       www.google.me     5045      536 0.106
       www.google.hn    24710     2613 0.106
       www.google.ee    19735     2096 0.106
   www.google.com.kw     8991      955 0.106
       www.google.pl   460141    48163 0.105
       www.google.rs    63169     6510 0.103
       www.google.hr    45199     4676 0.103
       www.google.gr    90642     9337 0.103
   www.google.com.cy     6726      694 0.103
       www.google.es   456786    46658 0.102
   www.google.com.pr    17107     1709 0.100
       www.google.tt     4523      448 0.099
       www.google.ch   162986    16058 0.099
       www.google.bg    59704     5823 0.098
       www.google.la      113       11 0.097
       www.google.ba    23935     2286 0.096
       www.google.lt    22717     2148 0.095
       www.google.lv    18136     1696 0.094
       www.google.hu   104841     9807 0.094
    www.google.co.nz    27551     2582 0.094
   www.google.com.gt    49673     4648 0.094
 images.google.co.za      640       60 0.094
   www.google.com.uy    13809     1286 0.093
   www.google.com.mx   399052    37112 0.093
       www.google.sk    45439     4159 0.092
      www.google.com  3756097   347138 0.092
    images.google.de   122731    11236 0.092
       www.google.cf      199       18 0.090
images.google.com.br      267       24 0.090
       www.google.kg      102        9 0.088
   www.google.com.pe    56446     4970 0.088
       www.google.ca   445716    39178 0.088
       www.google.ro    86141     7458 0.087
       www.google.pt    71295     6184 0.087
   www.google.com.au    61537     5348 0.087
images.google.com.ua      127       11 0.087
       www.google.mn      258       22 0.085
       www.google.kz    14661     1225 0.084
    www.google.co.uk  1275423   107473 0.084
       www.google.li      449       36 0.080
       www.google.ie    80018     6407 0.080
    www.google.co.th    14851     1175 0.079
    www.google.co.il    97140     7521 0.077
images.google.com.mx      142       11 0.077
       www.google.tm     1156       84 0.073
    images.google.rw      110        8 0.073
       www.google.gp     2368      171 0.072
   www.google.com.mt     5215      377 0.072
       www.google.am     7717      555 0.072
       www.google.gl      154       11 0.071
       www.google.lu     8239      578 0.070
images.google.com.pk      243       17 0.070
   www.google.com.co   287456    19947 0.069
    images.google.dz      117        8 0.068
       www.google.mk      124        8 0.065
       www.google.dk   130526     8477 0.065
       www.google.cl    66681     4311 0.065
       www.google.mv      605       39 0.064
       www.google.se   290969    18286 0.063
       www.google.gg      162       10 0.062
   www.google.com.tw    29346     1826 0.062
       www.google.no   108167     6615 0.061
       www.google.ge    11256      663 0.059
   www.google.com.sg    12868      734 0.057
    www.google.co.cr     3643      206 0.057
       www.google.md     1228       68 0.055
   www.google.com.gi      238       13 0.055
 images.google.co.in     2264      120 0.053
       www.google.is     3411      176 0.052
    images.google.cm      183        9 0.049
       www.google.vg      255       12 0.047
       www.google.as     1063       48 0.045
    www.google.co.uz      327       10 0.031
       www.google.dm      361       10 0.028
 images.google.co.id      580       10 0.017
    sites.google.com     1325        4 0.003
     plus.google.com      183        0 0.000
     mail.google.com      484        0 0.000
       kh.google.com    19264        0 0.000
     hume.google.com      346        0 0.000
     docs.google.com      395        0 0.000
     apis.google.com      306        0 0.000
 accounts.google.com      187        0 0.000

re-sorted by most referalls w/ query

            domain  referrals      w/q % w/q
      www.google.com  3756097   347138 0.092
       www.google.de   988321   165395 0.167
    www.google.co.in   304728   109728 0.360
    www.google.co.uk  1275423   107473 0.084
       www.google.it   777786   104333 0.134
       www.google.fr   868807    99331 0.114
   www.google.com.ng    59269    52170 0.880
   www.google.com.br   410470    48747 0.119
    www.google.co.za    83821    48358 0.577
       www.google.pl   460141    48163 0.105
       www.google.es   456786    46658 0.102
       www.google.ru   389303    43448 0.112
       www.google.nl   289281    40196 0.139
       www.google.ca   445716    39178 0.088
   www.google.com.mx   399052    37112 0.093
   www.google.com.tr   279622    33558 0.120
   www.google.com.ua   198162    24737 0.125
       www.google.at   159372    23091 0.145
    www.google.co.ve   134479    21594 0.161
    www.google.co.jp   137912    20966 0.152
   www.google.com.co   287456    19947 0.069
       www.google.se   290969    18286 0.063
   www.google.com.ar   154236    17863 0.116
       www.google.be   148805    16635 0.112
       www.google.ch   162986    16058 0.099
       www.google.fi   115331    16005 0.139
   www.google.com.gh    18496    13952 0.754
    www.google.co.id    30137    12387 0.411
    images.google.de   122731    11236 0.092
       www.google.cz    97116    10580 0.109
       www.google.hu   104841     9807 0.094
       www.google.gr    90642     9337 0.103
   www.google.com.eg    61127     9086 0.149
   www.google.com.pk    40971     9017 0.220
       www.google.dk   130526     8477 0.065
       www.google.dz    66236     8115 0.123
   www.google.com.sa    66806     7633 0.114
    www.google.co.il    97140     7521 0.077
       www.google.ro    86141     7458 0.087
    images.google.fr    39138     7037 0.180
       www.google.no   108167     6615 0.061
       www.google.rs    63169     6510 0.103
       www.google.ie    80018     6407 0.080
       www.google.pt    71295     6184 0.087
       www.google.cd     6696     6143 0.917
       www.google.bg    59704     5823 0.098
    www.google.co.tz     7419     5568 0.751
   www.google.com.au    61537     5348 0.087
   www.google.com.pe    56446     4970 0.088
       www.google.hr    45199     4676 0.103
   www.google.com.gt    49673     4648 0.094
   www.google.com.ec    38221     4636 0.121
       www.google.by    38121     4624 0.121
       www.google.cl    66681     4311 0.065
       www.google.sk    45439     4159 0.092
   www.google.com.bo    24065     4046 0.168
       www.google.ae    34878     3874 0.111
      cse.google.com     3675     3670 0.999
   www.google.com.do    31532     3570 0.113
    www.google.co.mz     5009     3480 0.695
   www.google.com.ph    17451     3284 0.188
encrypted.google.com     3633     3101 0.854
   www.google.com.hk    22805     3028 0.133
   www.google.com.sv    23628     2914 0.123
       www.google.si    24532     2735 0.111
   www.google.com.et     3221     2646 0.821
       www.google.hn    24710     2613 0.106
    www.google.co.nz    27551     2582 0.094
       www.google.az    18538     2541 0.137
       www.google.jo    16813     2429 0.144
       www.google.cm     4108     2393 0.583
       www.google.ba    23935     2286 0.096
       www.google.iq    21147     2275 0.108
       www.google.bj     2725     2274 0.834
    www.google.co.zm     3026     2197 0.726
       www.google.lt    22717     2148 0.095
       www.google.ee    19735     2096 0.106
       www.google.tn    17435     2071 0.119
       www.google.cg     2359     2021 0.857
       www.google.rw     2324     1884 0.811
   www.google.com.tw    29346     1826 0.062
   images.google.com     5104     1788 0.350
       www.google.ht     4343     1735 0.399
   www.google.com.na     2537     1735 0.684
   www.google.com.pr    17107     1709 0.100
       www.google.lv    18136     1696 0.094
       www.google.bf     1954     1653 0.846
   www.google.com.py     8396     1597 0.190
   www.google.com.ly     5985     1529 0.255
    www.google.co.zw     2343     1520 0.649
   www.google.com.lb    10762     1499 0.139
   www.google.com.ni     9348     1445 0.155
   www.google.com.bd     2191     1377 0.628
       www.google.sn     4019     1341 0.334
   www.google.com.uy    13809     1286 0.093
    www.google.co.kr     7588     1252 0.165
       www.google.kz    14661     1225 0.084
       www.google.ml     1883     1197 0.636
    www.google.co.th    14851     1175 0.079
       www.google.al     8071      957 0.119
   www.google.com.kw     8991      955 0.106
   www.google.com.af     1925      860 0.447
       www.google.td      989      857 0.867
       www.google.mg     1188      829 0.698
       www.google.ps     7024      761 0.108
       www.google.tg      946      735 0.777
   www.google.com.sg    12868      734 0.057
       www.google.lk     2916      701 0.240
   www.google.com.cy     6726      694 0.103
       www.google.ge    11256      663 0.059
   www.google.com.vn     2678      652 0.243
   www.google.com.jm     5692      646 0.113
       www.google.lu     8239      578 0.070
       www.google.am     7717      555 0.072
   www.google.com.bh     3713      544 0.147
       www.google.me     5045      536 0.106
   www.google.com.sl      630      479 0.760
   www.google.com.my     2735      463 0.169
    www.google.co.ma     4062      452 0.111
       www.google.tt     4523      448 0.099
      www.google.cat      778      406 0.522
    cse.google.co.uk      406      406 1.000
   www.google.com.mt     5215      377 0.072
       www.google.sr      845      368 0.436
       www.google.so      843      346 0.410
       www.google.mu     2320      345 0.149
    www.google.co.bw      849      332 0.391
    cse.google.co.il      322      322 1.000
       www.google.bi      357      311 0.871
       cse.google.hu      288      288 1.000
www.google.interia.pl      295      278 0.942
       cse.google.pl      227      227 1.000
   www.google.com.mm      428      223 0.521
       cse.google.fr      218      218 1.000
       cse.google.nl      216      216 1.000
       www.google.ci      431      211 0.490
    www.google.co.cr     3643      206 0.057
   www.google.com.np      937      181 0.193
   www.google.com.tj      525      180 0.343
       www.google.is     3411      176 0.052
       cse.google.de      173      173 1.000
   www.google.com.cu      366      172 0.470
       www.google.gp     2368      171 0.072
   www.google.com.pa     1162      170 0.146
       www.google.bs     1214      158 0.130
   cse.google.com.br      156      156 1.000
       cse.google.at      147      147 1.000
images.google.com.ng      890      143 0.161
       www.google.gm      260      135 0.519
 images.google.co.in     2264      120 0.053
   www.google.com.bz      822      112 0.136
   www.google.com.bn      251      110 0.438
   www.google.com.pg      440      102 0.232
       www.google.gy      914      101 0.111
       www.google.cv      645       98 0.152
 images.google.co.uk      174       90 0.517
       www.google.dj      321       87 0.271
       www.google.tm     1156       84 0.073
       www.google.je      504       78 0.155
       www.google.md     1228       68 0.055
 images.google.co.za      640       60 0.094
   www.google.com.ag      463       58 0.125
    www.google.co.ls      131       58 0.443
       www.google.ad      522       56 0.107
   www.google.com.vc      492       55 0.112
       www.google.bt      119       50 0.420
    images.google.it      238       50 0.210
       www.google.as     1063       48 0.045
   www.google.com.kh      336       47 0.140
   www.google.com.qa      285       44 0.154
       www.google.mv      605       39 0.064
       www.google.li      449       36 0.080
    www.google.co.ke      106       30 0.283
 plus.url.google.com      105       30 0.286
    images.google.es      154       25 0.162
images.google.com.br      267       24 0.090
       www.google.mn      258       22 0.085
images.google.com.tr      154       19 0.123
       www.google.cf      199       18 0.090
images.google.com.pk      243       17 0.070
    images.google.pl      108       16 0.148
 images.google.co.tz      139       16 0.115
images.google.com.et      107       16 0.150
    images.google.cd      129       14 0.109
   www.google.com.gi      238       13 0.055
images.google.com.eg      101       13 0.129
       www.google.vg      255       12 0.047
       www.google.la      113       11 0.097
       www.google.gl      154       11 0.071
images.google.com.ua      127       11 0.087
images.google.com.mx      142       11 0.077
       www.google.gg      162       10 0.062
       www.google.dm      361       10 0.028
    www.google.co.uz      327       10 0.031
 images.google.co.id      580       10 0.017
       www.google.kg      102        9 0.088
    images.google.cm      183        9 0.049
       www.google.mk      124        8 0.065
    images.google.rw      110        8 0.073
    images.google.dz      117        8 0.068
    sites.google.com     1325        4 0.003
     plus.google.com      183        0 0.000
     mail.google.com      484        0 0.000
       kh.google.com    19264        0 0.000
     hume.google.com      346        0 0.000
     docs.google.com      395        0 0.000
     apis.google.com      306        0 0.000
 accounts.google.com      187        0 0.000
mpopov moved this task from Needs triage to Analysis on the Discovery board.
mpopov added a subscriber: Deskana.Feb 26 2016, 8:15 PM

@EBernhardson This sounds like an interest task and potentially very useful knowledge, but @Deskana will decide when to pull it into our sprint and how to prioritize it. Cheers.

doh, didn't intend to throw it into the sprint. i get confused because you don't have your own non-sprint board and we are doing away with the global discovery board.

Deskana triaged this task as Low priority.Mar 3 2016, 9:20 PM

This is pretty low priority; it's interesting, but doesn't directly affect any decisions we're making right now.

For the most part this data isn't available from google anymore. Their main search (at least when i use it) now puts the query string in the hash, which doesn't get sent as a referrer. Additionally google is utilizing the new referrer policy (which wikimedia also uses) to limit referrer's to only the domain name.

select IF(parse_url(referer, 'QUERY', 'q') IS NULL, false, parse_url(referer, 'QUERY', 'q') <> '') as has_query, count(1) as count from webrequest where year=2
016 and month=5 and day=5 and instr(referer, 'google.com') > 0 group by IF(parse_url(referer, 'QUERY', 'q') IS NULL, false, parse_url(referer, 'QUERY', 'q') <> '');

has_query       count
false           143584813
true            8281424

Breaking down where we do get referer's from, by browser:

select user_agent_map['browser_family'] as browser_family, user_agent_map['browser_major'] as browser_major, count(1) as count from webrequest where year=2016 and month=5 and day=5 and instr(referer, 'google.com') > 0 and IF(parse_url(referer, 'QUERY', 'q') IS NULL, false, parse_url(referer, 'QUERY', 'q') <> '') group by user_agent_map['browser_family'], user_agent_map['browser_major'] having count(1) > 50000;

browser_family  browser_major   count
Android                 4       1310069
Android                 2       726045
Opera Mini              4       631868
Chrome Mobile           38      531712
IE Mobile               10      498497
Opera Mini              7       451939
Chrome Mobile           44      403932
UC Browser              10      334875
IE                      4       263885
Chrome Mobile           34      243510
IE Mobile               11      205381
Firefox Mobile          46      147647
Mobile Safari           5       143777
Opera Mini              8       143173
Amazon Silk             49      115859
Chrome Mobile           18      108367
Safari                  5       108192
Opera Mini              15      103149
BlackBerry WebKit       10      83180
BlackBerry WebKit       7       65590
Opera Mini              6       57818

And by country:

select geocoded_data['country_code'] as country_code, count(1) as count from webrequest where year=2016 and month=5 and day=5 and instr(referer, 'google.com') > 0 and IF(parse_url(referer, 'QUERY', 'q') IS NULL, false, parse_url(referer, 'QUERY', 'q') <> '') group by geocoded_data['country_code'] having count(1) > 50000;

country_code    count
US      1840201
BR      493790
NG      468878
MX      428783
TR      358285
AR      321687
IR      293876
UA      265119
CO      236432
VN      204405
PK      203773
BD      171051
ID      157151
PH      152244
KE      141400
MY      136119
AU      127374
MA      103600
PE      100313
TW      99556
EG      97528
GH      79925
FI      79431
BO      77430
EC      77214
SA      68723
DO      65244
HK      61356
IN      58514
ZA      57709
ET      51147

The summary here seems to be that the breakdown by country isn't too bad (just by eye, havn't compared the breakdown of all google referred traffic). The breakdown by browser is incredibly skewed towards mobile though. If we just want mobile data this might work, but as a sample of all traffic I don't think this is particularly good.

Mobile is likely what you should be optimizing towards. Laptops are going to disappear quickly. Mobile queries can be more difficult due to issues like poor typing, voice recognition queries skew towards full sentences eg, "What is a gubbins?", and a (maybe) more local intent. Local intent ex: "ferry building" when standing in SF should bring up [[San Francisco Ferry Building]].

debt moved this task from Later to Up Next on the Discovery-Analysis board.Sep 20 2016, 8:24 PM
debt added a subscriber: MelodyKramer.
mpopov claimed this task.Oct 11 2016, 7:12 PM
mpopov set the point value for this task to 4.
mpopov moved this task from Backlog to In progress on the Discovery-Analysis (Current work) board.

Short version

Not representative. Like, at all.

Long version

Data

SELECT
  platform, has_query, country, browser,
  COUNT(1) AS pageviews,
  COUNT(DISTINCT(user_id)) AS users
FROM (
  SELECT
    CONCAT_WS(' ', user_agent_map['browser_family'], user_agent_map['browser_major']) AS browser,
    geocoded_data['country'] AS country,
    access_method AS platform,
    IF(parse_url(referer, 'QUERY', 'q') IS NULL, false, parse_url(referer, 'QUERY', 'q') <> '') as has_query,
    CONCAT(user_agent, client_ip, accept_language) AS user_id
  FROM webrequest
  WHERE
    webrequest_source = 'text'
    AND year = 2016 AND month = 10 AND (day >= 1 AND day < 8)
    AND is_pageview = true
    AND agent_type = 'user'
    AND INSTR(referer, '.google.') > 0
) AS refined_requests
GROUP BY
  platform, has_query, country, browser;

For privacy reasons, I cannot share the dataset this query yielded. However, I can share a refined version of the dataset:

that was created using methods & code described in Methods.

Methods

First, we take N (20) browsers and countries within each platform (desktop vs mobile web) and lump other countries & browsers into "Other" category. This ensures that the dataset we will be sharing will not include browsers extracted from user agents that might contain PII.

Then we ensure that every country & browser combination has a count of pageviews and users, even if it's 0. This makes it much easier to compare counts of pageviews/users that have query in referer vs. those that do not.

Here is the R code that takes the data from the Hive query above:

library(tidyverse)
import::from(magrittr, "%<>%")

# setwd("~/Desktop/queries/")
googlers <- read_csv("query_result.csv", col_types = "ccccii")

fct_lump_by <- function(f, n, by, other_level = "Other") {
  top_n <- head(f[order(by, decreasing = TRUE)], n)
  f[!(f %in% top_n)] <- other_level
  return(f)
}

googlers %<>%
  group_by(platform) %>%
  mutate(country = fct_lump_by(country, 20, users),
         browser = fct_lump_by(browser, 20, users),
         has_query = ifelse(has_query == "True", "w/ query", "w/o query")) %>%
  group_by(platform, country, browser, has_query) %>%
  summarize(pageviews = sum(pageviews),
            users = sum(users)) %>%
  ungroup %>%
  gather("metric", "value", -c(platform, country, browser, has_query))

pageviews <- googlers %>%
  filter(metric == "pageviews") %>%
  select(-metric) %>%
  spread(has_query, value, fill = 0) %>%
  # filter(`w/ query` > 0 & `w/o query` > 0) %>%
  gather("has_query", "value", -c(platform, country, browser))

users <- googlers %>%
  filter(metric == "users") %>%
  select(-metric) %>%
  spread(has_query, value, fill = 0) %>%
  # filter(`w/ query` > 0 & `w/o query` > 0) %>%
  gather("has_query", "value", -c(platform, country, browser))

googlers <- bind_rows(list(pageviews = pageviews, users = users), .id = "metric")
rm(pageviews, users)

googlers$value %<>% as.integer
googlers %<>% arrange(metric, platform, country, browser, has_query)
write_csv(googlers, "query_results_refined.csv")

Results

So... IF users w/ query in referer look the same as users w/o query in referer (in the sense that we have nearly equal counts for each country-browser combination), THEN overall distributions of counts would be very similar, without needing to look at specific country & UA combinations. So what do we see? Well, two groups that are not similar at all!

This plot was created using the following R code:

library(tidyverse)
import::from(magrittr, "%<>%")

googlers <- read_csv("query_results_refined.csv.gz", col_types = "ccccci")

googlers %>%
  filter(country != "Other" & browser != "Other") %>%
  # Remove country-browser combos that have either 0 PVs/users w/ queries or 0 PVs/users w/o queries
  group_by(metric, platform, country, browser) %>%
  filter(value > 0) %>%
  group_by(metric, platform, country, browser) %>%
  filter(all(c("w/ query", "w/o query") %in% has_query)) %>%
  ungroup %>%
  # Plot it!
  ggplot(aes(x = value, color = has_query)) +
  geom_density(
    aes(y = ..scaled..),
    adjust = 2
  ) +
  scale_x_log10(breaks = vapply(0:9, function(pow) {10^pow}, 0), labels = polloi::compress) +
  scale_color_brewer("Presence of query in referer", palette = "Set1") +
  facet_grid(metric ~ platform, scales = "free") +
  labs(x = NULL, y = NULL, title = "Distribution of pageview and user counts, by platform and presence of query in referer",
       subtitle = "Using a week's worth of pageviews, made between 2016-10-01 and 2016-10-07") +
  theme_minimal() +
  theme(legend.position = "bottom",
        strip.background = element_rect(fill = "gray90"),
        panel.border = element_rect(color = "gray30", fill = NA),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank())
debt closed this task as Resolved.Oct 14 2016, 9:40 PM
debt added a subscriber: debt.

wow, nice work! :)

mpopov reopened this task as Open.Oct 25 2016, 11:39 PM

Crap crackers, I forgot to use proportions instead of counts. Need to come back to this and do that.

mpopov added a comment.EditedOct 27 2016, 1:33 AM

Note: In the original version of this analysis, I totally had what can be fairly and unprofessionally referred to as a "brain fart" and used raw counts instead of proportions to compare the two samples. This was an oversight on my part and I am sorry for the careless mistake. Once I redid the analysis using proportions, the conclusion changed (somewhat).

Short version

The proportions of UA+country combinations are different, but not too different between Google-referred pageviews with query info and those without. We still wouldn't recommend viewing one group as representative of the other.

The most popular UA+country pair (with respect to unique desktop users) was Chrome 53 in U.S., with 1.6% of desktop users w/ query in their referer field using Chrome 53 in U.S. and 10.1% of all desktop users without it were using Chrome 53 in U.S.

Long version

Data

SELECT
  platform, has_query, country, browser,
  COUNT(1) AS pageviews,
  COUNT(DISTINCT(user_id)) AS users
FROM (
  SELECT
    CONCAT_WS(' ', user_agent_map['browser_family'], user_agent_map['browser_major']) AS browser,
    geocoded_data['country'] AS country,
    access_method AS platform,
    IF(parse_url(referer, 'QUERY', 'q') IS NULL, false, parse_url(referer, 'QUERY', 'q') <> '') as has_query,
    CONCAT(user_agent, client_ip, accept_language) AS user_id
  FROM webrequest
  WHERE
    webrequest_source = 'text'
    AND year = 2016 AND month = 10 AND (day >= 1 AND day < 8)
    AND is_pageview = true
    AND agent_type = 'user'
    AND INSTR(referer, '.google.') > 0
) AS refined_requests
GROUP BY
  platform, has_query, country, browser;

For privacy reasons, I cannot share the dataset this query yielded. However, I can share a refined version of the dataset:

that was created using methods & code described in Methods.

Methods

First, we take N (20) browsers and countries within each platform (desktop vs mobile web) and lump other countries & browsers into "Other" category. This ensures that the dataset we will be sharing will not include browsers extracted from user agents that might contain PII.

Then we ensure that every country & browser combination has a count of pageviews and users, even if it's 0. This makes it much easier to compare counts of pageviews/users that have query in referer vs. those that do not.

Here is the R code that takes the data from the Hive query above:

library(tidyverse)
import::from(magrittr, "%<>%")

# setwd("~/Desktop/queries/")
googlers <- read_csv("query_result.csv", col_types = "ccccii")

fct_lump_by <- function(f, n, by, other_level = "Other") {
  top_n <- head(f[order(by, decreasing = TRUE)], n)
  f[!(f %in% top_n)] <- other_level
  return(f)
}

googlers %<>%
  group_by(platform) %>%
  mutate(country = fct_lump_by(country, 20, users),
         browser = fct_lump_by(browser, 20, users),
         has_query = ifelse(has_query == "True", "w/ query", "w/o query")) %>%
  group_by(platform, country, browser, has_query) %>%
  summarize(pageviews = sum(pageviews),
            users = sum(users)) %>%
  ungroup %>%
  gather("metric", "value", -c(platform, country, browser, has_query))

pageviews <- googlers %>%
  filter(metric == "pageviews") %>%
  select(-metric) %>%
  spread(has_query, value, fill = 0) %>%
  # filter(`w/ query` > 0 & `w/o query` > 0) %>%
  gather("has_query", "value", -c(platform, country, browser))

users <- googlers %>%
  filter(metric == "users") %>%
  select(-metric) %>%
  spread(has_query, value, fill = 0) %>%
  # filter(`w/ query` > 0 & `w/o query` > 0) %>%
  gather("has_query", "value", -c(platform, country, browser))

googlers <- bind_rows(list(pageviews = pageviews, users = users), .id = "metric")
rm(pageviews, users)

googlers$value %<>% as.integer
googlers %<>% arrange(metric, platform, country, browser, has_query)
write_csv(googlers, "query_results_refined.csv")

Results

Let's take a look at the top UA+country pairs that different the most between the two types of Google-referred traffic:

metricplatformcountrybrowserw/ queryw/o querydifference
usersdesktopUnited StatesChrome 531.653%10.112%0.0845897
usersmobile webUnited StatesMobile Safari 100.196%6.748%0.0655211
usersmobile webUnited StatesChrome Mobile 530.024%5.165%0.0514102
usersmobile webUnited StatesMobile Safari 90.109%4.629%0.0451948
usersdesktopBrazilChrome 530.622%2.753%0.0213110
usersdesktopUnited KingdomChrome 530.287%2.245%0.0195739

And the least:

metricplatformcountrybrowserw/ queryw/o querydifference
usersdesktopJapanFirefox 490.420%0.396%-0.0002426
usersdesktopArgentinaSafari 90.001%0.007%0.000061
usersdesktopTurkeySafari 90.001%0.013%0.0001246
usersdesktopColombiaSafari 90.002%0.016%0.0001409
usersdesktopPolandSafari 90.003%0.023%0.0002064
usersdesktopBrazilSafari 90.006%0.026%0.0002080

So... IF users w/ query in referer look the same as users w/o query in referer (in the sense that we have nearly equal counts for each country-browser combination), THEN overall proportions would be very similar, without needing to look at specific country & UA combinations. So what do we see?

For the most part, the difference between proportion of traffic w/ query data and proportion of traffic w/o query data is close to 0, although as we saw in the first table, we can definitely see some very large differences for some UA+country pairs.

This plot was created using the following R code:

library(tidyverse)
import::from(magrittr, "%<>%")

googlers <- read_csv("~/Desktop/query_results_refined.csv", col_types = "ccccci")

temp <- googlers %>%
  group_by(metric, platform, has_query) %>%
  mutate(prop = value/sum(value)) %>%
  ungroup %>%
  select(-value) %>%
  spread(has_query, prop) %>%
  mutate(difference = `w/o query` - `w/ query`)

  # Plot it!
temp %>%
  ggplot(aes(x = difference)) +
  geom_density(
    aes(y = ..scaled..),
    adjust = 4,
    color = NA, fill = "gray60"
  ) +
  facet_grid(metric ~ platform) +
  labs(x = "proportion w/ query - proportion w/o query", y = NULL,
       title = "Distribution of differences of proportions of referred traffic",
       subtitle = "Using a week's worth of pageviews, made between 2016-10-01 and 2016-10-07",
       caption = "Proportions of UA+country combinations are similar between referred pageviews with query data and those without.\nThat is, most differences are less than 0.01") +
  theme_minimal() +
  theme(legend.position = "bottom",
        strip.background = element_rect(fill = "gray90"),
        panel.border = element_rect(color = "gray30", fill = NA),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank())

Then we decided to take a look at the median of differences. We bootstrap the distribution by sampling the differences (with repeats) b=1000 times and calculating the median for each bootstrapped sample, obtaining the following distributions and 95% confidence intervals:

set.seed(0)
bootstrapped <- list()
for (metric in unique(temp$metric)) {
  bootstrapped[[metric]] <- list()
  for (platform in unique(temp$platform)) {
    bootstrap_sample <- replicate(1000, {
      return(median(sample(temp$difference[temp$metric == metric & temp$platform == platform], sum(temp$metric == metric & temp$platform == platform), replace = TRUE)))
    })
    bootstrapped[[metric]][[platform]]$observed <- median(temp$difference[temp$metric == metric & temp$platform == platform])
    bootstrapped[[metric]][[platform]]$conf_int <- quantile(bootstrap_sample, c(0.025, 0.975))
    bootstrapped[[metric]][[platform]]$sample <- bootstrap_sample
  }
}; rm(bootstrap_sample)

tidy_bootstrap <- bind_rows(lapply(bootstrapped, function(by_metric) {
  bind_rows(lapply(by_metric, function(by_platform) {
    return(data.frame(median_difference = by_platform$sample))
  }), .id = "platform")
}), .id = "metric")
head(tidy_bootstrap)

tidy_confint <- bind_rows(lapply(bootstrapped, function(by_metric) {
  bind_rows(lapply(by_metric, function(by_platform) {
    return(data.frame(lower = by_platform$conf_int[1], upper = by_platform$conf_int[2]))
  }), .id = "platform")
}), .id = "metric")

tidy_observed <- bind_rows(lapply(bootstrapped, function(by_metric) {
  bind_rows(lapply(by_metric, function(by_platform) {
    return(data.frame(median_difference = by_platform$observed))
  }), .id = "platform")
}), .id = "metric")

ggplot(tidy_bootstrap, aes(x = median_difference)) +
  geom_density(
    aes(y = ..scaled..), adjust = 2,
    color = NA, fill = "gray80"
  ) +
  facet_grid(metric ~ platform) +
  geom_vline(data = tidy_observed,
             aes(xintercept = median_difference),
             color = "black", size = 1.1) +
  geom_text(data = tidy_observed,
            aes(x = 0.006, y = 0.6,
                label = sprintf("Observed: %.3f", median_difference))) +
  geom_text(data = tidy_confint,
            aes(x = 0.006, y = 0.5,
                label = sprintf("95%% CI: (%.3f, %.3f)", lower, upper))) +
  labs(x = "proportion w/ query - proportion w/o query", y = NULL,
       title = "Bootstrapped distributions of median difference of proportions",
       subtitle = "Using a week's worth of pageviews, made between 2016-10-01 and 2016-10-07",
       caption = "Proportions of UA+country combinations are similar between referred pageviews with query data and those without.\nThat is, most differences are less than 0.01") +
  theme_minimal() +
  theme(legend.position = "bottom",
        strip.background = element_rect(fill = "gray90"),
        panel.border = element_rect(color = "gray30", fill = NA),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank())
debt added a comment.Oct 27 2016, 8:07 PM

Asking @mpopov to pretty please make this analysis as a pdf and upload to commons. thanks! :)