Paste P5958

collect human relevance configuration

Authored by EBernhardson on Sep 5 2017, 4:02 PM.
import json
import math
from multiprocessing.dummy import Pool
import requests
import urllib.parse
TARGET_IMPRESSIONS = 1000
REMOVE_QUERIES = set([
    '298005 b.c.',
    'antonio parent',
    'brook valley division monroe north carolina carolina',
    'LAW IN LONDON',
    'Leix\u00f5es S.C.',
    'what if lamba is lower than required',
    'aa rochester ny meetings',
    'Antiochus Epiphanes was another: after his detention at Rome and attempts',
    'compare roads built Marcos and Aquino',
    'examples of confidential email message',
    'highliting text and calculations in word documents',
    'how would you test for starch?explain your answer',
    'Pets parallelism',
    'red room tor sites',
    'SANTA CLAUS PRINT OUT 3D PAPERTOYS',
    'tayps of wlding difats',
    'top out at',
    'treewards',
    'Unblock.request reason isis',
    'when was it found',
])
REWRITE_QUERIES = {
    ', enzymes concentration': 'enzymes concentration',
    'Death of a Red Heroine by Qiu Xialong:': 'Death of a Red Heroine by Qiu Xialong',
    'ORDERS OF IRDA ABOUT SURRENDER': 'orders of IRDA about surrender',
    'PURI TEMPLE': 'Puri temple',
    'rape of nanking]': 'rape of nanking',
    'Trust (property': 'Trust(property)',
}

def take(n, iterable):
    # Yield at most n items from iterable.
    i = 0
    for x in iterable:
        if i >= n:
            break
        yield x
        i += 1

def simplify_title(title):
    return title.replace(',', '').replace('.', '')

def lookup_article(orig_title, ask=False):
    # /w/api.php?action=query&format=json&titles=&generator=prefixsearch&redirects=1&gpssearch=Old+english+personal+pronouns
    url = 'https://en.wikipedia.org/w/api.php'
    query = {
        'action': 'query',
        'format': 'json',
        'formatversion': 2,
        'generator': 'prefixsearch',
        'redirects': 1,
        'gpssearch': orig_title,
        'gpslimit': 10,
        'gpsprofile': 'strict'
    }
    response = requests.get(url, params=query)
    data = response.json()
    if 'query' not in data:
        # No results
        return None, orig_title
    results = sorted(data['query']['pages'], key=lambda x: x['index'])
    # Attach the pre-redirect title to each result so matching below compares
    # against the title the search actually returned.
    if 'redirects' in data['query']:
        for redirect in data['query']['redirects']:
            for result in results:
                if redirect['to'] == result['title']:
                    if 'orig_title' in result:
                        raise Exception('duplicate redirect')
                    result['orig_title'] = redirect['from']
                    break
            else:
                raise Exception('redirect not found in results')
    for result in results:
        if 'orig_title' not in result:
            result['orig_title'] = result['title']
    # Try progressively looser comparisons between the requested title and the
    # returned candidates.
    transformers = [
        lambda x: x,
        lambda x: x.replace(',', ''),
        lambda x: x.replace(':', ''),
        lambda x: x.lower(),
    ]
    for transformer in transformers:
        title = transformer(orig_title)
        for result in results:
            if transformer(result['orig_title']) == title:
                if 'pageid' in result:
                    return result['pageid'], result['title']
                else:
                    print("No pageid: %s" % (json.dumps(result)))
                    return None, result['title']
    # No automatic match; optionally ask a human to pick one of the candidates.
    if ask and len(results) > 0:
        while True:
            print("Requested title: %s" % (orig_title))
            for i, result in enumerate(results):
                print("\t%d - %s" % (i, result['orig_title']))
            print("\t%d - NONE OF THE ABOVE" % (i + 1))
            chosen_text = input("Choose a title: ")
            try:
                index = int(chosen_text)
                if index == len(results):
                    return None, orig_title
                else:
                    return results[index]['pageid'], results[index]['title']
            except (ValueError, IndexError, KeyError):
                print("Invalid input\n")
    return None, orig_title

def lookup_page_views(title):
    # Average daily pageviews for the article over 2017-08-20 through 2017-08-26.
    quoted = urllib.parse.quote(title).replace('/', '%2F')
    url = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/%s/daily/2017082000/2017082600' % (quoted)
    response = requests.get(url)
    if response.status_code != 200:
        return None
    items = response.json()['items']
    if len(items) > 0:
        return sum([item['views'] for item in items]) / len(items)
    else:
        return 1

def rewrite_query(query):
    query = query.strip()
    if query in REMOVE_QUERIES:
        return None
    if query in REWRITE_QUERIES:
        return REWRITE_QUERIES[query]
    return query

def gen_work(ask=False, result=None):
    if result is None:
        result = {}
    failed_lookup = list()
    failed_pageviews = list()

    def work(judgement):
        query = rewrite_query(judgement['query'])
        if query is None:
            return
        if judgement['score'] is None:
            return
        if not ask:
            print('.', end='', flush=True)
        article_id, title = lookup_article(judgement['title'], ask)
        if article_id is None:
            failed_lookup.append(judgement)
            return
        score = float(judgement['score'])
        if article_id in result:
            if query in result[article_id]['scores']:
                print("\nDuplicate query `%s` for article `%s` with scores %.2s, %.2s" % (
                    query, title, judgement['score'], result[article_id]['scores'][query]))
                # Keep the higher score when the same (query, article) pair was judged twice
                result[article_id]['scores'][query] = max(score, result[article_id]['scores'][query])
            else:
                result[article_id]['scores'][query] = score
        else:
            daily_views = lookup_page_views(title)
            if daily_views is None:
                # These are often very odd pages returned from non-wiki search,
                # like Portal:London/Did you know/05 2009, that have no page views
                failed_pageviews.append(title)
            else:
                result[article_id] = {
                    'title': title,
                    'daily_views': daily_views,
                    'scores': {query: score}
                }
        return

    return (result, failed_lookup, failed_pageviews, work)
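
# Note: Pool comes from multiprocessing.dummy, so work() runs in threads rather
# than separate processes, and the `result` dict plus the failed_* lists are
# closed over and mutated by every worker. That is also what lets the main
# block below pass the same `result` back into gen_work(ask=True, ...) when
# retrying the failed lookups interactively.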

if __name__ == "__main__":
    with open('discernatron_all_scores.json', 'r') as f:
        data = json.loads(f.read())

    print("Collecting")
    (result, failed_lookup, failed_pageviews, work) = gen_work()
    list(Pool(5).imap_unordered(work, data['scores']))
    print("\nDone collecting.")

    (_, _, _, work_ask) = gen_work(ask=True, result=result)
    print("Failed article lookups:")
    for judgement in failed_lookup:
        work_ask(judgement)
    print("")

    print("Failed page view lookups:")
    for title in failed_pageviews:
        print("\t%s" % (title))

    wme_config = {}
    for article_id, config in result.items():
        weekly_views = config['daily_views'] * 7
        impressions = TARGET_IMPRESSIONS * len(config['scores'])
        sample_rate = min(1, round(weekly_views / impressions, 2))
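        # Illustrative numbers only: an article averaging 200 daily views with
        # two graded queries gives weekly_views = 1400, impressions = 2000, and
        # sampleRate = min(1, round(1400 / 2000, 2)) = 0.7; articles seeing at
        # least TARGET_IMPRESSIONS weekly views per query cap out at 1.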
        wme_config[article_id] = {
            'sampleRate': sample_rate,
            'queries': list(config['scores'].keys()),
        }

    with open('config.json', 'w') as f:
        f.write(json.dumps(result))
    with open('wme_config.json', 'w') as f:
        f.write(json.dumps(wme_config))
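
# A minimal sketch of the input this script expects in
# discernatron_all_scores.json, inferred from the fields accessed above
# ('scores', 'query', 'title', 'score'); the queries, titles, and score values
# shown here are illustrative only:
#
# {
#     "scores": [
#         {"query": "puri temple", "title": "Jagannath Temple, Puri", "score": "3"},
#         {"query": "treewards", "title": "Tree", "score": null}
#     ]
# }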