Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Paste
P10507
fetch ORES articletopic scores for all wiki pages
Active
Public
Actions
Authored by
Tgr
on Feb 25 2020, 2:18 AM.
Edit Paste
Archive Paste
View Raw File
Subscribe
Mute Notifications
Award Token
Flag For Later
Tags
None
Referenced Files
F31629974: raw.txt
Feb 25 2020, 2:18 AM
2020-02-25 02:18:10 (UTC+0)
Subscribers
None
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import
itertools
import
mwapi
import
oresapi
def make_batch(iterable, size):
    """Yield consecutive tuples of at most *size* items from *iterable*.

    The last tuple may be shorter than *size*; an exhausted or empty
    input yields nothing.
    """
    source = iter(iterable)
    # Two-argument iter() keeps calling the lambda until it returns the
    # sentinel (the empty tuple), i.e. until *source* is exhausted.
    yield from iter(lambda: tuple(itertools.islice(source, size)), ())
def ores_row(revid, predictions, wiki='enwiki'):
    """Build one output record for a scored revision.

    Args:
        revid: revision ID the predictions belong to.
        predictions: ORES articletopic probability mapping for the revision.
        wiki: wiki database name; defaults to 'enwiki', matching the
            previously hard-coded value, so existing callers are unchanged.

    Returns:
        dict with 'wiki', 'revid' and 'predictions' keys.
    """
    return {'wiki': wiki, 'revid': revid, 'predictions': predictions}
# MediaWiki API session for English Wikipedia; used to enumerate all pages
# together with their revision IDs.
api = mwapi.Session('https://en.wikipedia.org', user_agent='Test / gtisza@wikimedia.org')
# ORES scoring-service session used to fetch articletopic scores.
ores = oresapi.Session('https://ores.wikimedia.org', user_agent='Test / gtisza@wikimedia.org')

# Lazy stream of API responses covering every page (500 per request).
# continuation=True makes the client follow API continuation automatically,
# so iterating page_batches walks the whole wiki.
page_batches = api.get(
    formatversion=2,
    action='query',
    generator='allpages',
    gaplimit=500,
    prop='revisions',
    rvprop='ids',
    continuation=True,
)
# Flatten the response batches into a stream of page objects.
pages = (page for batch in page_batches for page in batch['query']['pages'])
# One revision ID per page, taken from revisions[0] — presumably the latest
# revision, given the default API ordering; TODO confirm.
revisions = (page['revisions'][0]['revid'] for page in pages)
# Group revision IDs into tuples of up to 1000 for bulk ORES scoring.
revision_batches = make_batch(revisions, 1000)
# (revid, probability-mapping) pairs: each batch of revids is zipped against
# the corresponding stream of ORES responses for the articletopic model.
scores = ((revid, data['articletopic']['score']['probability'])
          for revids in revision_batches
          for (revid, data) in zip(revids, ores.score('enwiki', ['articletopic'], revids)))
# Test run: cap the pipeline at the first 1500 scores.
scores = itertools.islice(scores, 1500)
# Write one dict-formatted row per scored revision.
with open('test.txt', 'w') as out:
    for (revid, prediction) in scores:
        print(ores_row(revid, prediction), file=out)
Event Timeline
Tgr
created this paste.
Feb 25 2020, 2:18 AM
2020-02-25 02:18:10 (UTC+0)
Tgr
mentioned this in
T243357: Once the ORES articletopic - ElasticSearch pipeline is set up, update data about all articles
.
Log In to Comment