Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F29907227
raw.txt
No One
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Authored By
EBernhardson
Jul 30 2019, 4:24 PM
2019-07-30 16:24:12 (UTC+0)
Size
2 KB
Referenced Files
None
Subscribers
None
raw.txt
View Options
import json
import requests
def get_source_search(term):
    """Fetch the query dump for a Commons full-text search of ``term``.

    Issues a GET to Special:Search on commons.wikimedia.org with the
    ``cirrusDumpQuery`` and ``fulltext`` flags set and returns the decoded
    JSON body. (Presumably ``cirrusDumpQuery`` makes the wiki answer with
    the Elasticsearch request it would have sent instead of rendering
    results; other code in this file reads ``['__main__']['query']`` from
    the returned value.)

    Raises:
        ValueError: if the response body is not valid JSON. The raw body is
            printed first so the unexpected reply can be inspected.
    """
    res = requests.get(
        'https://commons.wikimedia.org/wiki/Special:Search',
        params={
            'cirrusDumpQuery': 1,
            'fulltext': 1,
            'search': term,
        },
    )
    try:
        return res.json()
    except ValueError:
        # JSON decode failures are ValueError subclasses. Catching only those
        # (rather than a bare ``except:``) keeps KeyboardInterrupt/SystemExit
        # from triggering the body dump.
        print(res.text)
        raise
def augment_search(search):
    """Convert a dumped search into an aggregation-only query.

    Takes ``search['__main__']['query']``, drops its ``highlight`` section
    when present, sets ``size`` to 0 and replaces ``aggs`` with a single
    ``terms`` aggregation named ``depicts`` over the ``statement_keywords``
    field.

    The query dict is modified in place (so ``search`` is mutated) and the
    same dict object is returned.

    Raises:
        KeyError: if ``search`` has no ``['__main__']['query']`` entry.
    """
    query = search['__main__']['query']
    # pop() instead of del: a dump without a highlight section is fine and
    # must not raise KeyError.
    query.pop('highlight', None)
    query['size'] = 0
    query['aggs'] = {
        'depicts': {
            'terms': {
                'field': 'statement_keywords',
            },
        },
    }
    return query
def run_elastic_query(search_query):
    """POST ``search_query`` to the local Elasticsearch and return its reply.

    The query is serialized with ``json.dumps`` and sent to the
    ``commonswiki_file/page/_search`` endpoint on ``localhost:9200`` with a
    JSON content type. The decoded JSON response body is returned.
    """
    endpoint = 'http://localhost:9200/commonswiki_file/page/_search'
    payload = json.dumps(search_query)
    json_headers = {'Content-Type': 'application/json'}
    response = requests.post(endpoint, data=payload, headers=json_headers)
    return response.json()
def extract_qids(results):
    """Yield ``(value, doc_count)`` for each bucket of the ``depicts`` aggregation.

    Each bucket key is split once on ``'='`` and only the part after the
    first ``'='`` is kept (for a key such as ``'P180=Q146'`` that is
    ``'Q146'``). Buckets are yielded in the order they appear in
    ``results['aggregations']['depicts']['buckets']``.
    """
    for entry in results['aggregations']['depicts']['buckets']:
        pieces = entry['key'].split('=', 1)
        value = pieces[1]
        hits = entry['doc_count']
        yield value, hits
def augment_qids(qids):
    """Look up a display label for each entity id via Wikidata's API.

    Args:
        qids: iterable of ``(qid, count)`` pairs; only the ids are used.

    Returns:
        A dict mapping entity id -> label string. The English label is
        preferred, otherwise the first label present in the response. An
        id that has no label at all, or that does not appear as a key in
        the response, maps to the id itself, so every requested id is
        guaranteed to be a key of the result.

    Raises:
        KeyError: if an API response has no ``entities`` member (for
            example when the API answers with an error object).
    """
    titles = [qid for qid, _count in qids]
    out = {}

    # wbgetentities caps the number of ids per request (50 for regular
    # clients), so ask in batches. An empty input makes no request at all.
    batch_size = 50
    for start in range(0, len(titles), batch_size):
        batch = titles[start:start + batch_size]
        res = requests.get('https://www.wikidata.org/w/api.php', params={
            'action': 'wbgetentities',
            'ids': '|'.join(batch),
            'props': 'labels',
            'format': 'json',
            'formatversion': 2,
        })
        for qid, data in res.json()['entities'].items():
            # ``labels`` can be absent or empty; treat both the same way.
            labels = data.get('labels') or {}
            if 'en' in labels:
                out[qid] = labels['en']['value']
            elif labels:
                # No English label: take whichever language comes first.
                out[qid] = next(iter(labels.values()))['value']
            else:
                # Nothing to show; fall back to the bare id rather than
                # letting next() raise StopIteration on an empty iterator.
                out[qid] = qid

    # Callers index the result by the ids they passed in; make sure each of
    # them resolves even if the response was keyed differently.
    for qid in titles:
        out.setdefault(qid, qid)
    return out
def doit(term):
    """Yield ``(qid, label, count)`` rows for a Commons search of ``term``.

    Pipeline: dump the search query for ``term``, rewrite it into an
    aggregation query, run that against Elasticsearch, pull the bucket
    values and counts out of the response, then look up a label for each
    value. Rows come out in bucket order.
    """
    agg_query = augment_search(get_source_search(term))
    response = run_elastic_query(agg_query)
    # Materialize the buckets: they are needed both for the label lookup
    # and for the final iteration.
    counted = list(extract_qids(response))
    labels_by_qid = augment_qids(counted)
    for item_id, hits in counted:
        yield item_id, labels_by_qid[item_id], hits
if __name__ == "__main__":
    from argparse import ArgumentParser

    # Command line: a single positional argument, the search term.
    cli = ArgumentParser()
    cli.add_argument('term')
    options = cli.parse_args()

    # Emit a pipe-delimited table: header first, then one row per bucket.
    row_format = '|{: 5d}|{:10s}|{}'
    print('|count|qid|label')
    for qid, label, count in doit(options.term):
        print(row_format.format(count, qid, label))
File Metadata
Details
Attached
Mime Type
text/plain; charset=utf-8
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
7795634
Default Alt Text
raw.txt (2 KB)
Attached To
Mode
P8829 structured data aggregations on commons in elasticsearch
Attached
Detach File
Event Timeline
Log In to Comment