Page Menu · Home · Phabricator
Paste P8829

structured data aggregations on commons in elasticsearch
Active · Public

Authored by EBernhardson on Jul 30 2019, 4:24 PM.
Tags
None
Referenced Files
F29907227: raw.txt
Jul 30 2019, 4:24 PM
Subscribers
None
import json
import requests
def get_source_search(term):
    """Fetch the raw CirrusSearch/Elasticsearch query for *term* from Commons.

    Uses the ``cirrusDumpQuery`` debug flag so Special:Search returns the
    Elasticsearch query it would run (as JSON) instead of rendered results.

    Raises ValueError if the response body is not JSON (e.g. an HTML error
    page), after printing the body for debugging.
    """
    res = requests.get('https://commons.wikimedia.org/wiki/Special:Search', params={
        'cirrusDumpQuery': 1,
        'fulltext': 1,
        'search': term,
    })
    try:
        return res.json()
    # Narrow catch: res.json() raises a ValueError subclass on bad JSON.
    # A bare except here would also swallow KeyboardInterrupt/SystemExit.
    except ValueError:
        print(res.text)
        raise
def augment_search(search):
    """Turn a dumped CirrusSearch query into an aggregation-only query.

    Strips highlighting, sets ``size`` to 0 (we only want aggregations, not
    hits), and adds a terms aggregation over ``statement_keywords`` — the
    field holding structured-data statements such as depicts (P180).

    Mutates and returns the inner query dict from *search*.
    """
    query = search['__main__']['query']
    # pop() instead of del: the dump may not always include a highlight
    # section, and its absence is not an error for our purposes.
    query.pop('highlight', None)
    query['size'] = 0
    query['aggs'] = {
        'depicts': {
            'terms': {
                'field': 'statement_keywords'
            }
        }
    }
    return query
def run_elastic_query(search_query):
    """POST *search_query* to the local Elasticsearch cluster and return the parsed JSON response.

    NOTE(review): the ``/page`` doc-type segment in the URL matches older
    Elasticsearch versions (pre-7.x typed APIs) — confirm against the
    cluster actually deployed.
    """
    url = 'http://localhost:9200/commonswiki_file/page/_search'
    # requests' json= kwarg serializes the body and sets the
    # Content-Type: application/json header itself.
    res = requests.post(url, json=search_query)
    return res.json()
def extract_qids(results):
    """Yield ``(qid, doc_count)`` pairs from the 'depicts' terms aggregation.

    Bucket keys look like ``P180=Q42``; the QID is everything after the
    first ``=``.
    """
    for entry in results['aggregations']['depicts']['buckets']:
        yield entry['key'].split('=', 1)[1], entry['doc_count']
def augment_qids(qids):
    """Resolve ``(qid, count)`` pairs to human-readable labels via Wikidata.

    Returns a dict mapping qid -> label string. Prefers the English label,
    falls back to any available language, and uses the bare qid when the
    entity has no labels at all (e.g. a missing or deleted item).

    NOTE(review): wbgetentities caps the number of ids per request (50 for
    anonymous clients) — large aggregations would need batching; verify
    against expected bucket counts.
    """
    ids = [qid for qid, _count in qids]
    res = requests.get('https://www.wikidata.org/w/api.php', params={
        'action': 'wbgetentities',
        'ids': '|'.join(ids),
        'props': 'labels',
        'format': 'json',
        'formatversion': 2,
    })
    out = {}
    for qid, data in res.json()['entities'].items():
        # Missing entities carry no 'labels' key; empty dicts are possible too.
        labels = data.get('labels', {})
        if 'en' in labels:
            out[qid] = labels['en']['value']
        elif labels:
            # No English label: take an arbitrary available language.
            out[qid] = next(iter(labels.values()))['value']
        else:
            out[qid] = qid
    return out
def doit(term):
    """Generate ``(qid, label, count)`` rows for entities depicted in results for *term*.

    Pipeline: dump the CirrusSearch query from Commons, rewrite it into an
    aggregation-only query, run it against the local cluster, then resolve
    the aggregated QIDs to labels via Wikidata.
    """
    es_query = augment_search(get_source_search(term))
    hits = run_elastic_query(es_query)
    pairs = list(extract_qids(hits))
    labels = augment_qids(pairs)
    for qid, count in pairs:
        yield qid, labels[qid], count
if __name__ == "__main__":
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument('term')
    args = parser.parse_args()

    # Print a MediaWiki-style table: one row per aggregated entity.
    print('|count|qid|label')
    for qid, label, count in doit(args.term):
        print(f'|{count: 5d}|{qid:10s}|{label}')