Page MenuHomePhabricator
Paste P10507

fetch ORES articletopic scores for all wiki pages
ActivePublic

Authored by Tgr on Feb 25 2020, 2:18 AM.
Tags
None
Referenced Files
F31629974: raw.txt
Feb 25 2020, 2:18 AM
Subscribers
None
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import itertools
import mwapi
import oresapi
def make_batch(iterable, size):
    """Lazily split ``iterable`` into consecutive tuples of ``size`` items.

    Items are pulled from the input only as each batch is requested. The
    last batch may hold fewer than ``size`` items; an exhausted (or empty)
    input yields nothing further.
    """
    source = iter(iterable)

    def take():
        # Pull the next chunk from the shared iterator.
        return tuple(itertools.islice(source, size))

    # iter(callable, sentinel) keeps calling take() until it returns the
    # empty tuple, i.e. until the source has run dry.
    yield from iter(take, ())
def ores_row(revid, predictions):
    """Return the output record for one scored revision.

    The record is a dict with the fixed wiki id ``'enwiki'``, the given
    revision id, and the given predictions object (stored as-is).
    """
    return dict(wiki='enwiki', revid=revid, predictions=predictions)
# Client sessions for the MediaWiki action API and for ORES; both send the
# same user agent string.
api = mwapi.Session('https://en.wikipedia.org',
                    user_agent='Test / gtisza@wikimedia.org')
ores = oresapi.Session('https://ores.wikimedia.org',
                       user_agent='Test / gtisza@wikimedia.org')

# Walk every page via the allpages generator, asking only for revision ids.
# With continuation=True the call is consumed below as an iterable of
# response documents, one per API request.
page_batches = api.get(
    formatversion=2,
    action='query',
    generator='allpages',
    gaplimit=500,
    prop='revisions',
    rvprop='ids',
    continuation=True,
)

# Flatten the per-response page lists into one lazy stream of page objects.
pages = (page
         for batch in page_batches
         for page in batch['query']['pages'])

# First listed revision id of each page. Pages that arrive without a
# 'revisions' entry are skipped instead of raising KeyError.
# NOTE(review): a continued generator+prop query can presumably return a page
# before its revision data is filled in -- confirm against the API docs.
revisions = (page['revisions'][0]['revid']
             for page in pages
             if page.get('revisions'))

# ORES is queried in batches of 1000 revision ids.
revision_batches = make_batch(revisions, 1000)

# Pair each revision id with its articletopic probabilities. Results that
# carry no 'score' entry (presumably an 'error' entry instead -- confirm
# against the oresapi docs) are skipped so that one unscorable revision
# does not abort the whole run.
scores = ((revid, data['articletopic']['score']['probability'])
          for revids in revision_batches
          for (revid, data) in zip(revids,
                                   ores.score('enwiki', ['articletopic'], revids))
          if 'score' in data.get('articletopic', {}))

# test: only take the first 1500 scores instead of walking the whole wiki
scores = itertools.islice(scores, 1500)

# Explicit encoding so the output does not depend on the platform default.
with open('test.txt', 'w', encoding='utf-8') as out:
    for (revid, prediction) in scores:
        print(ores_row(revid, prediction), file=out)