collect human relevance configuration
ActivePublic
Actions

Authored by EBernhardson on Sep 5 2017, 4:02 PM.

Tags

None

Referenced Files

	F9342155: collect human relevance configuration
	Sep 5 2017, 4:02 PM

Subscribers

None

	import json
	import math
	from multiprocessing.dummy import Pool
	import requests
	import urllib.parse

	# Impressions wanted per query of an article; the __main__ section divides an
	# article's weekly page views by TARGET_IMPRESSIONS * (number of queries) to
	# derive its sample rate.
	TARGET_IMPRESSIONS = 1000

	# Queries that are dropped entirely: rewrite_query() returns None for them.
	# Matching is exact (after stripping surrounding whitespace).
	REMOVE_QUERIES = {
	    '298005 b.c.',
	    'antonio parent',
	    'brook valley division monroe north carolina carolina',
	    'LAW IN LONDON',
	    'Leix\u00f5es S.C.',
	    'what if lamba is lower than required',
	    'aa rochester ny meetings',
	    'Antiochus Epiphanes was another: after his detention at Rome and attempts',
	    'compare roads built Marcos and Aquino',
	    'examples of confidential email message',
	    'highliting text and calculations in word documents',
	    'how would you test for starch?explain your answer',
	    'Pets parallelism',
	    'red room tor sites',
	    'SANTA CLAUS PRINT OUT 3D PAPERTOYS',
	    'tayps of wlding difats',
	    'top out at',
	    'treewards',
	    'Unblock.request reason isis',
	    'when was it found',
	}

	# Exact-match replacements applied by rewrite_query(): raw query -> cleaned
	# query (stray punctuation removed, shouting case normalised).
	REWRITE_QUERIES = {
	    ', enzymes concentration': 'enzymes concentration',
	    'Death of a Red Heroine by Qiu Xialong:': 'Death of a Red Heroine by Qiu Xialong',
	    'ORDERS OF IRDA ABOUT SURRENDER': 'orders of IRDA about surrender',
	    'PURI TEMPLE': 'Puri temple',
	    'rape of nanking]': 'rape of nanking',
	    # NOTE(review): the replacement has no space before '(' -- confirm that
	    # 'Trust(property)' rather than 'Trust (property)' is intended.
	    'Trust (property': 'Trust(property)',
	}

	def take(n, iterable):
	    """Yield at most the first ``n`` items of ``iterable``.

	    Args:
	        n: Maximum number of items to yield; zero or less yields nothing.
	        iterable: Any iterable. No item beyond the ``n``-th is consumed.

	    Yields:
	        The leading items of ``iterable`` in order.
	    """
	    if n <= 0:
	        return
	    for count, item in enumerate(iterable, 1):
	        yield item
	        # Stop right after the n-th item so the source iterator is not
	        # advanced any further than necessary.
	        if count >= n:
	            break

	def simplify_title(title):
	    """Return ``title`` with every comma and full stop removed."""
	    # One pass that deletes both characters.
	    return title.translate(str.maketrans('', '', ',.'))

	def lookup_article(orig_title, ask=False):
	    """Resolve a title to an English Wikipedia page via the prefix search API.

	    Args:
	        orig_title: Title text to look up.
	        ask: When true and no candidate matches automatically, print the
	            candidates and read the user's choice from stdin.

	    Returns:
	        A ``(pageid, title)`` tuple. ``pageid`` is ``None`` when the search
	        returned nothing or nothing was chosen (``title`` is then
	        ``orig_title``), or when the matched result has no ``pageid``
	        (``title`` is then that result's title).

	    Raises:
	        Exception: If a redirect reported by the API does not point at any
	            returned page, or two redirects point at the same page.
	    """
	    # Example request:
	    # /w/api.php?action=query&format=json&titles=&generator=prefixsearch&redirects=1&gpssearch=Old+english+personal+pronouns
	    url = 'https://en.wikipedia.org/w/api.php'
	    params = {
	        'action': 'query',
	        'format': 'json',
	        'formatversion': 2,
	        'generator': 'prefixsearch',
	        'redirects': 1,
	        'gpssearch': orig_title,
	        'gpslimit': 10,
	        'gpsprofile': 'strict',
	    }
	    response = requests.get(url, params=params)
	    # Decode the body once rather than re-parsing it on every access.
	    payload = response.json()
	    if 'query' not in payload:
	        # No results
	        return None, orig_title
	    query_result = payload['query']

	    results = sorted(query_result['pages'], key=lambda page: page['index'])

	    # Record, for every page reached through a redirect, the title the
	    # redirect came from; that is the title the search actually matched.
	    for redirect in query_result.get('redirects', ()):
	        for result in results:
	            if redirect['to'] == result['title']:
	                if 'orig_title' in result:
	                    raise Exception('duplicate redirect')
	                result['orig_title'] = redirect['from']
	                break
	        else:
	            raise Exception('redirect not found in results')
	    for result in results:
	        if 'orig_title' not in result:
	            result['orig_title'] = result['title']

	    # Progressively looser comparisons, each applied on its own (they are not
	    # cumulative): exact, commas removed, colons removed, lower-cased.
	    transformers = [
	        lambda x: x,
	        lambda x: x.replace(',', ''),
	        lambda x: x.replace(':', ''),
	        lambda x: x.lower(),
	    ]
	    for transformer in transformers:
	        title = transformer(orig_title)
	        for result in results:
	            if transformer(result['orig_title']) == title:
	                if 'pageid' in result:
	                    return result['pageid'], result['title']
	                print("No pageid: %s" % (json.dumps(result)))
	                return None, result['title']

	    if ask and results:
	        none_index = len(results)
	        while True:
	            print("Requested title: %s" % (orig_title))
	            for i, result in enumerate(results):
	                print("\t%d - %s" % (i, result['orig_title']))
	            print("\t%d - NONE OF THE ABOVE" % (none_index))
	            try:
	                chosen_text = input("Choose a title: ")
	            except EOFError:
	                # stdin is exhausted: re-prompting would loop forever, so
	                # treat it like "none of the above".
	                return None, orig_title
	            # Only a malformed number is caught here, so KeyboardInterrupt
	            # still aborts the prompt.
	            try:
	                index = int(chosen_text)
	            except ValueError:
	                print("Invalid input\n")
	                continue
	            if index == none_index:
	                return None, orig_title
	            # Reject negative numbers explicitly; Python would otherwise
	            # index from the end of the list and pick an unintended entry.
	            if 0 <= index < none_index and 'pageid' in results[index]:
	                return results[index]['pageid'], results[index]['title']
	            print("Invalid input\n")
	    return None, orig_title

	def lookup_page_views(title):
	    """Return the mean daily page views of an en.wikipedia article.

	    Queries the Wikimedia pageviews REST API (all access methods, all agents)
	    for the fixed range 2017-08-20 .. 2017-08-26.

	    Args:
	        title: Article title.

	    Returns:
	        The average of the ``views`` values over the returned items, ``1``
	        when the response contains no items, or ``None`` when the API does
	        not answer with HTTP 200.
	    """
	    # The API expects underscores in place of spaces and a fully
	    # percent-encoded title; safe='' makes quote() encode '/' as well.
	    quoted = urllib.parse.quote(title.replace(' ', '_'), safe='')
	    url = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/%s/daily/2017082000/2017082600' % (quoted)
	    response = requests.get(url)
	    if response.status_code != 200:
	        return None
	    items = response.json()['items']
	    if not items:
	        # NOTE(review): presumably a floor so the article is not treated as
	        # having zero traffic -- confirm the intent of this constant.
	        return 1
	    return sum(item['views'] for item in items) / len(items)


	def rewrite_query(query):
	    """Normalise a raw query string.

	    Args:
	        query: Query text; surrounding whitespace is stripped first.

	    Returns:
	        ``None`` if the stripped query is listed in ``REMOVE_QUERIES``, its
	        replacement if it is a key of ``REWRITE_QUERIES``, otherwise the
	        stripped query itself.
	    """
	    query = query.strip()
	    if query in REMOVE_QUERIES:
	        return None
	    return REWRITE_QUERIES.get(query, query)

	def gen_work(ask=False, result=None):
	    """Create a judgement-processing callback and the containers it fills.

	    Args:
	        ask: Passed through to ``lookup_article``. When false, a progress dot
	            is printed for every judgement processed.
	        result: Existing ``{article_id: {'title', 'daily_views', 'scores'}}``
	            mapping to extend; a fresh dict is created when omitted.

	    Returns:
	        ``(result, failed_lookup, failed_pageviews, work)`` where
	        ``failed_lookup`` collects judgements whose article could not be
	        resolved, ``failed_pageviews`` collects titles without page view
	        data, and ``work(judgement)`` processes a single judgement dict
	        (keys ``query``, ``title`` and ``score``) and returns ``None``.
	    """
	    import threading  # local import: only this factory needs it

	    if result is None:
	        result = {}
	    failed_lookup = []
	    failed_pageviews = []
	    # work() is mapped over a thread pool by the script section of this file,
	    # so the check-then-update on ``result`` must be atomic; otherwise two
	    # judgements for the same new article can overwrite each other's scores.
	    lock = threading.Lock()

	    def add_score(scores, query, score, title):
	        """Store ``score`` for ``query``; a duplicate keeps the higher score.

	        Must be called with ``lock`` held.
	        """
	        if query in scores:
	            print("\nDuplicate query `%s` for article `%s` with scores %.2f, %.2f" % (query, title, score, scores[query]))
	            scores[query] = max(score, scores[query])
	        else:
	            scores[query] = score

	    def work(judgement):
	        query = rewrite_query(judgement['query'])
	        if query is None or judgement['score'] is None:
	            return

	        if not ask:
	            print('.', end='', flush=True)
	        article_id, title = lookup_article(judgement['title'], ask)
	        if article_id is None:
	            failed_lookup.append(judgement)
	            return

	        score = float(judgement['score'])
	        with lock:
	            entry = result.get(article_id)
	            if entry is not None:
	                add_score(entry['scores'], query, score, title)
	                return

	        # First sighting of this article: fetch its page views without
	        # holding the lock so other workers are not blocked on the network.
	        daily_views = lookup_page_views(title)
	        if daily_views is None:
	            # These are often very odd pages returned from non-wiki search,
	            # like Portal:London/Did you know/05 2009 that have no page views
	            failed_pageviews.append(title)
	            return

	        with lock:
	            # Re-check: another worker may have added the article meanwhile.
	            entry = result.get(article_id)
	            if entry is None:
	                result[article_id] = {
	                    'title': title,
	                    'daily_views': daily_views,
	                    'scores': {query: score},
	                }
	            else:
	                add_score(entry['scores'], query, score, title)

	    return (result, failed_lookup, failed_pageviews, work)

	def main():
	    """Build the survey configuration from ``discernatron_all_scores.json``.

	    Resolves every judgement to an article in parallel, retries the failed
	    article lookups interactively, then writes the collected data to
	    ``config.json`` and the derived per-article sample rates and query lists
	    to ``wme_config.json``.
	    """
	    with open('discernatron_all_scores.json', 'r', encoding='utf-8') as f:
	        data = json.load(f)

	    print("Collecting")
	    (result, failed_lookup, failed_pageviews, work) = gen_work()
	    # The context manager releases the worker threads once every judgement
	    # has been consumed.
	    with Pool(5) as pool:
	        for _ in pool.imap_unordered(work, data['scores']):
	            pass
	    print("\nDone collecting.")

	    # Second, sequential pass: let the user pick an article for each
	    # judgement that could not be resolved automatically.
	    (_, _, failed_pageviews_ask, work_ask) = gen_work(ask=True, result=result)
	    print("Failed article lookups:")
	    for judgement in failed_lookup:
	        work_ask(judgement)
	    print("")
	    print("Failed page view lookups:")
	    # Include failures from the interactive pass, which has its own list.
	    for title in failed_pageviews + failed_pageviews_ask:
	        print("\t%s" % (title))

	    wme_config = {}
	    for article_id, config in result.items():
	        weekly_views = config['daily_views'] * 7
	        impressions = TARGET_IMPRESSIONS * len(config['scores'])
	        # Rounded to two decimals and capped at 1.
	        sample_rate = min(1, round(weekly_views / impressions, 2))
	        wme_config[article_id] = {
	            'sampleRate': sample_rate,
	            'queries': list(config['scores']),
	        }

	    with open('config.json', 'w', encoding='utf-8') as f:
	        json.dump(result, f)
	    with open('wme_config.json', 'w', encoding='utf-8') as f:
	        json.dump(wme_config, f)


	if __name__ == "__main__":
	    main()

Event Timeline

EBernhardson created this paste. Sep 5 2017, 4:02 PM

EBernhardson mentioned this in T174106: Search Relevance Survey test #3: action items.

collect human relevance configurationActivePublicActions

Event Timeline

collect human relevance configuration
ActivePublic
Actions