import json
import urllib.parse
from multiprocessing.dummy import Pool

import requests

TARGET_IMPRESSIONS = 1000

REMOVE_QUERIES = set([
    '298005 b.c.',
    'antonio parent',
    'brook valley division monroe north carolina carolina',
    'LAW IN LONDON',
    'Leix\u00f5es S.C.',
    'what if lamba is lower than required',
    'aa rochester ny meetings',
    'Antiochus Epiphanes was another: after his detention at Rome and attempts',
    'compare roads built Marcos and Aquino',
    'examples of confidential email message',
    'highliting text and calculations in word documents',
    'how would you test for starch?explain your answer',
    'Pets parallelism',
    'red room tor sites',
    'SANTA CLAUS PRINT OUT 3D PAPERTOYS',
    'tayps of wlding difats',
    'top out at',
    'treewards',
    'Unblock.request reason isis',
    'when was it found',
])

REWRITE_QUERIES = {
    ', enzymes concentration': 'enzymes concentration',
    'Death of a Red Heroine by Qiu Xialong:': 'Death of a Red Heroine by Qiu Xialong',
    'ORDERS OF IRDA ABOUT SURRENDER': 'orders of IRDA about surrender',
    'PURI TEMPLE': 'Puri temple',
    'rape of nanking]': 'rape of nanking',
    'Trust (property': 'Trust(property)',
}


def take(n, iterable):
    # Yield at most the first n items of iterable.
    i = 0
    for x in iterable:
        if i >= n:
            break
        yield x
        i += 1


def simplify_title(title):
    return title.replace(',', '').replace('.', '')


def lookup_article(orig_title, ask=False):
    # /w/api.php?action=query&format=json&titles=&generator=prefixsearch&redirects=1&gpssearch=Old+english+personal+pronouns
    url = 'https://en.wikipedia.org/w/api.php'
    query = {
        'action': 'query',
        'format': 'json',
        'formatversion': 2,
        'generator': 'prefixsearch',
        'redirects': 1,
        'gpssearch': orig_title,
        'gpslimit': 10,
        'gpsprofile': 'strict',
    }
    response = requests.get(url, params=query)
    payload = response.json()
    if 'query' not in payload:
        # No results
        return None, orig_title
    results = sorted(payload['query']['pages'], key=lambda x: x['index'])
    # Attach the pre-redirect title to each result so we can match against
    # what prefixsearch actually saw.
    if 'redirects' in payload['query']:
        for redirect in payload['query']['redirects']:
            for result in results:
                if redirect['to'] == result['title']:
                    if 'orig_title' in result:
                        raise Exception('duplicate redirect')
                    result['orig_title'] = redirect['from']
                    break
            else:
                raise Exception('redirect not found in results')
    for result in results:
        if 'orig_title' not in result:
            result['orig_title'] = result['title']
    # Try progressively looser matches between the requested title and the
    # pre-redirect titles of the results.
    transformers = [
        lambda x: x,
        lambda x: x.replace(',', ''),
        lambda x: x.replace(':', ''),
        lambda x: x.lower(),
    ]
    for transformer in transformers:
        title = transformer(orig_title)
        for result in results:
            if transformer(result['orig_title']) == title:
                if 'pageid' in result:
                    return result['pageid'], result['title']
                else:
                    print("No pageid: %s" % (json.dumps(result)))
                    return None, result['title']
    if ask and len(results) > 0:
        # No automatic match; ask a human to disambiguate.
        while True:
            print("Requested title: %s" % (orig_title))
            for i, result in enumerate(results):
                print("\t%d - %s" % (i, result['orig_title']))
            print("\t%d - NONE OF THE ABOVE" % (i + 1))
            chosen_text = input("Choose a title: ")
            try:
                index = int(chosen_text)
                if index == len(results):
                    return None, orig_title
                else:
                    return results[index]['pageid'], results[index]['title']
            except (ValueError, IndexError):
                print("Invalid input\n")
    return None, orig_title


def lookup_page_views(title):
    # Mean daily page views for the week of 2017-08-20 through 2017-08-26.
    quoted = urllib.parse.quote(title).replace('/', '%2F')
    url = ('https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/'
           'en.wikipedia/all-access/all-agents/%s/daily/2017082000/2017082600'
           % (quoted))
    response = requests.get(url)
    if response.status_code != 200:
        return None
    items = response.json()['items']
    if len(items) > 0:
        return sum([item['views'] for item in items]) / len(items)
    else:
        return 1
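
# Usage sketch for the two lookup helpers above. The title and return values
# shown are illustrative assumptions, not real API output:
#
#     pageid, title = lookup_article('old english personal pronouns')
#     # prefixsearch plus redirect resolution yields the canonical title,
#     # e.g. (some_pageid, 'Old English personal pronouns'), or (None, ...)
#     # when nothing matches
#     views = lookup_page_views(title)
#     # mean daily views over 2017-08-20..2017-08-26; None on HTTP error,
#     # 1 if the API returns an empty item list
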
def rewrite_query(query):
    query = query.strip()
    if query in REMOVE_QUERIES:
        return None
    if query in REWRITE_QUERIES:
        return REWRITE_QUERIES[query]
    return query


def gen_work(ask=False, result=None):
    if result is None:
        result = {}
    failed_lookup = list()
    failed_pageviews = list()

    def work(judgement):
        query = rewrite_query(judgement['query'])
        if query is None:
            return
        if judgement['score'] is None:
            return
        if not ask:
            print('.', end='', flush=True)
        article_id, title = lookup_article(judgement['title'], ask)
        if article_id is None:
            failed_lookup.append(judgement)
            return
        score = float(judgement['score'])
        if article_id in result:
            if query in result[article_id]['scores']:
                # Keep the higher of the two scores when the same query was
                # judged twice for one article.
                print("\nDuplicate query `%s` for article `%s` with scores %.2f, %.2f" % (
                    query, title, score, result[article_id]['scores'][query]))
                result[article_id]['scores'][query] = max(
                    score, result[article_id]['scores'][query])
            else:
                result[article_id]['scores'][query] = score
        else:
            daily_views = lookup_page_views(title)
            if daily_views is None:
                # These are often very odd pages returned from non-wiki search,
                # like Portal:London/Did you know/05 2009, that have no page views.
                failed_pageviews.append(title)
            else:
                result[article_id] = {
                    'title': title,
                    'daily_views': daily_views,
                    'scores': {query: score},
                }
        return

    return (result, failed_lookup, failed_pageviews, work)


if __name__ == "__main__":
    with open('discernatron_all_scores.json', 'r') as f:
        data = json.loads(f.read())

    print("Collecting")
    (result, failed_lookup, failed_pageviews, work) = gen_work()
    # multiprocessing.dummy gives a thread pool, so work() can mutate the
    # shared result dict; imap_unordered is lazy, so force it with list().
    with Pool(5) as pool:
        list(pool.imap_unordered(work, data['scores']))
    print("\nDone collecting.")

    # Retry failed lookups interactively, merging into the same result dict.
    (_, _, _, work_ask) = gen_work(ask=True, result=result)
    print("Failed article lookups:")
    for judgement in failed_lookup:
        work_ask(judgement)
    print("")

    print("Failed page view lookups:")
    for title in failed_pageviews:
        print("\t%s" % (title))

    wme_config = {}
    for article_id, config in result.items():
        weekly_views = config['daily_views'] * 7
        impressions = TARGET_IMPRESSIONS * len(config['scores'])
        sample_rate = min(1, round(weekly_views / impressions, 2))
        wme_config[article_id] = {
            'sampleRate': sample_rate,
            'queries': list(config['scores'].keys()),
        }

    with open('config.json', 'w') as f:
        f.write(json.dumps(result))
    with open('wme_config.json', 'w') as f:
        f.write(json.dumps(wme_config))
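
# Assumed input shape, inferred from the fields work() reads; the record below
# is a hypothetical example, not real Discernatron data:
#
#     discernatron_all_scores.json:
#     {"scores": [{"query": "example query", "title": "Example Article",
#                  "score": "2.5"}, ...]}
#
# Worked example of the sampleRate math, with illustrative numbers: an article
# averaging 100 daily views with 3 judged queries has weekly_views = 700 and
# impressions = 3 * TARGET_IMPRESSIONS = 3000, so
# sampleRate = min(1, round(700 / 3000, 2)) = 0.23. At or above
# TARGET_IMPRESSIONS weekly views per judged query the rate caps at 1.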