Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Paste
P8829
structured data aggregations on commons in elasticsearch
Active
Public
Actions
Authored by
EBernhardson
on Jul 30 2019, 4:24 PM.
Edit Paste
Archive Paste
View Raw File
Subscribe
Mute Notifications
Award Token
Flag For Later
Tags
None
Referenced Files
F29907227: raw.txt
Jul 30 2019, 4:24 PM
2019-07-30 16:24:12 (UTC+0)
Subscribers
None
import json
import requests
def get_source_search(term):
    """Fetch the JSON that Commons' Special:Search returns for *term*.

    The request is sent with ``cirrusDumpQuery=1`` and ``fulltext=1``;
    presumably this makes CirrusSearch dump the Elasticsearch query it would
    run instead of rendering results (confirm against CirrusSearch docs).

    Args:
        term: Search string sent as the ``search`` parameter.

    Returns:
        The decoded JSON body of the response.

    Raises:
        ValueError: If the body is not valid JSON. The raw response text is
            printed before the exception is re-raised.
    """
    res = requests.get('https://commons.wikimedia.org/wiki/Special:Search', params={
        'cirrusDumpQuery': 1,
        'fulltext': 1,
        'search': term,
    })
    try:
        return res.json()
    except ValueError:
        # Response.json() reports undecodable bodies with a ValueError
        # subclass. Catching only that (rather than a bare except) keeps
        # KeyboardInterrupt/SystemExit from being intercepted here.
        print(res.text)
        raise
def augment_search(search):
    """Turn a dumped search into an aggregation-only query.

    Takes the query stored at ``search['__main__']['query']``, drops its
    ``highlight`` section if there is one, sets ``size`` to 0 and installs a
    ``depicts`` terms aggregation over the ``statement_keywords`` field.

    Args:
        search: Mapping with a ``'__main__'`` entry that holds a ``'query'``
            dict.

    Returns:
        The same query dict that was found in *search*, modified in place.

    Raises:
        KeyError: If ``search['__main__']['query']`` does not exist.
    """
    query = search['__main__']['query']
    # pop() instead of del: a query that carries no highlight section is
    # still perfectly usable, so its absence must not raise KeyError.
    query.pop('highlight', None)
    query['size'] = 0
    query['aggs'] = {
        'depicts': {
            'terms': {
                'field': 'statement_keywords',
            },
        },
    }
    return query
def run_elastic_query(search_query):
    """POST *search_query* to the local ``commonswiki_file`` search endpoint.

    The query is serialized with ``json.dumps`` and sent with a JSON
    content type; the decoded JSON response body is returned.
    """
    endpoint = 'http://localhost:9200/commonswiki_file/page/_search'
    payload = json.dumps(search_query)
    json_headers = {'Content-Type': 'application/json'}
    response = requests.post(endpoint, data=payload, headers=json_headers)
    return response.json()
def extract_qids(results):
    """Yield ``(value, doc_count)`` for each bucket of the ``depicts`` aggregation.

    Every bucket key is split on its first ``=``; the part after it is
    yielded together with the bucket's ``doc_count``. A key without ``=``
    raises IndexError.
    """
    depicts = results['aggregations']['depicts']
    for entry in depicts['buckets']:
        key_parts = entry['key'].split('=', 1)
        yield (key_parts[1], entry['doc_count'])
def augment_qids(qids):
    """Look up a display label for each id via Wikidata's ``wbgetentities``.

    Args:
        qids: Iterable of ``(qid, count)`` pairs; only the id is used.

    Returns:
        Dict mapping every entity id in the API response to a label string:
        the English label when present, otherwise the first label the
        entity has, otherwise the id itself. An empty input yields ``{}``
        without contacting the API.

    Raises:
        KeyError: If the API response has no ``entities`` member (for
            example when the API reports an error).
    """
    titles = [qid for qid, _count in qids]
    if not titles:
        # Nothing to look up; avoid sending a request with an empty ``ids``.
        return {}
    res = requests.get('https://www.wikidata.org/w/api.php', params={
        'action': 'wbgetentities',
        'ids': '|'.join(titles),
        'props': 'labels',
        'format': 'json',
        'formatversion': 2,
    })
    out = {}
    for qid, data in res.json()['entities'].items():
        # ``labels`` may be absent or empty; previously that surfaced as a
        # KeyError or StopIteration, so treat both as "no labels".
        labels = data.get('labels') or {}
        if 'en' in labels:
            out[qid] = labels['en']['value']
        elif labels:
            # No English label: use whichever label comes first.
            out[qid] = next(iter(labels.values()))['value']
        else:
            # No label at all: the id is the only usable display text.
            out[qid] = qid
    return out
def doit(term):
    """Yield ``(qid, label, count)`` rows for a Commons search *term*.

    Pipeline: fetch the dumped search, rewrite it into an aggregation
    query, run it against Elasticsearch, pull the ids and counts out of the
    buckets, then attach a label to each id.
    """
    elastic_query = augment_search(get_source_search(term))
    response = run_elastic_query(elastic_query)
    # Materialized because the pairs are iterated twice: once for the label
    # lookup and once to emit the rows.
    counted = list(extract_qids(response))
    labels = augment_qids(counted)
    for qid, count in counted:
        yield (qid, labels[qid], count)
if __name__ == "__main__":
    # argparse is only needed when the file is run as a script.
    from argparse import ArgumentParser

    cli = ArgumentParser()
    cli.add_argument('term')
    search_term = cli.parse_args().term

    # Emit a pipe-delimited table: header first, then one row per result.
    print('|count|qid|label')
    row_template = '|{: 5d}|{:10s}|{}'
    for qid, label, count in doit(search_term):
        print(row_template.format(count, qid, label))
Event Timeline
EBernhardson
created this paste.
Jul 30 2019, 4:24 PM
2019-07-30 16:24:12 (UTC+0)
EBernhardson
mentioned this in
T229027: search of related images on wikidata (for structured data on commons)
.
Jul 30 2019, 4:30 PM
2019-07-30 16:30:19 (UTC+0)
Log In to Comment