Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F34740900
vandalized_articles_checker.py
No One
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Authored By
kevinbazira
Nov 10 2021, 3:56 PM
2021-11-10 15:56:10 (UTC+0)
Size
4 KB
Referenced Files
None
Subscribers
None
vandalized_articles_checker.py
View Options
"""
``vandalized_articles_checker -h``
::
"vandalized_articles_checker.py" is a python utility that takes
article_titles as command-line arguments then uses the ORES editquality
damaging model to return a percentage of damaging edits per article.
Usage:
vandalized_articles_checker -h | --help
vandalized_articles_checker <input>...
Options:
-h --help Print this documentation.
<input> List of input article titles
e.g: "Michael Jordan" "Barack Obama"
"""
import functools
import logging.config

import mwapi
from docopt import docopt
from revscoring import Model
from revscoring.extractors import api
from tabulate import tabulate
from tqdm import tqdm
def main(argv=None):
    """
    Command-line entry point.

    Reads the article titles given as ``<input>`` arguments (parsed by
    docopt from the module docstring), computes the percentage of damaging
    edits for each of them through an English Wikipedia API session and
    prints the result as a table.

    ``argv`` is handed to docopt unchanged; it defaults to ``None``.
    """
    titles = docopt(__doc__, argv=argv)["<input>"]
    session = mwapi.Session(
        host="https://en.wikipedia.org",
        user_agent="vandalized_articles_checker",
    )
    rows = articles_vandalism_percentages(titles, session)
    vandalized_articles_table(rows)
def vandalized_articles_table(articles_and_vandalism_percentages):
    """
    Print the article/percentage rows as a grid-formatted table.

    The rows are rendered with ``tabulate`` under the column headers
    "Article" and "Percentage" and written to standard output.
    """
    column_headers = ["Article", "Percentage"]
    print(
        tabulate(
            articles_and_vandalism_percentages,
            headers=column_headers,
            tablefmt="grid",
        )
    )
def articles_vandalism_percentages(article_titles, mwapi_session):
    """
    Build one ``[article_title, damaging_edits_percentage]`` row per title.

    For every title the edit ids are fetched, each edit is scored, and the
    scores are reduced to a percentage. Two tqdm progress bars are shown:
    an outer one over the articles and an inner one over the edits of the
    article currently being analysed.
    """
    rows = []
    for title in tqdm(article_titles, desc="progress analysing all articles"):
        revision_ids = edit_ids(title, mwapi_session)
        edits_progress = tqdm(
            revision_ids, desc="analysing edits for '" + title + "'"
        )
        scores = []
        for revision_id in edits_progress:
            scores.append(damaging_score(revision_id, mwapi_session))
        rows.append([title, percentage_of_damaging_edits(scores)])
    return rows
def percentage_of_damaging_edits(damaging_edits_scores):
    """
    Return the percentage of edits that were scored as damaging.

    Args:
        damaging_edits_scores: list of per-edit scores; every entry equal
            to ``True`` is counted as a damaging edit.

    Returns:
        The share of damaging edits as a number between 0 and 100.
        An empty list yields ``0.0`` instead of raising
        ``ZeroDivisionError``, so an article without any scored edit is
        reported with no damaging edits.
    """
    number_of_edits = len(damaging_edits_scores)
    if number_of_edits == 0:
        # Nothing was scored: avoid dividing by zero.
        return 0.0
    number_of_damaging_edits = damaging_edits_scores.count(True)
    return (number_of_damaging_edits / number_of_edits) * 100
def edit_ids(article_title, mwapi_session, number_of_edits_limit="100"):
    """
    Return edit ids based on article title and number_of_edits_limit.

    Args:
        article_title: title of the article whose revisions are queried.
        mwapi_session: object whose ``get(**params)`` runs the MediaWiki
            API query and returns the decoded response.
        number_of_edits_limit: value sent as ``rvlimit``. Defaults to
            "100", the value this function used to hard-code.

    Returns:
        List with the ``revid`` of every revision in the response, in the
        order the API returned them.

    Raises:
        KeyError, IndexError: if the response does not contain
            ``query -> pages -> [0] -> revisions`` (presumably the case
            for a title that does not exist -- TODO confirm).
    """
    mwapi_response = mwapi_session.get(
        action="query",
        prop="revisions",
        titles=article_title,
        # MediaWiki API has an rvlimit: 1 - 500
        rvlimit=number_of_edits_limit,
        rvprop="ids|timestamp|user",
        rvslots="main",
        formatversion="2",
        format="json",
    )
    # Only the first page of the response is inspected.
    revisions = mwapi_response["query"]["pages"][0]["revisions"]
    return [revision["revid"] for revision in revisions]
@functools.lru_cache(maxsize=None)
def _load_damaging_model(model_path="enwiki.damaging.gradient_boosting.model"):
    """
    Load the damaging model from ``model_path`` once and cache it.

    Existing loggers are disabled while the model is being loaded and are
    re-enabled afterwards, even when loading fails.
    """
    disable_logging(True)
    try:
        with open(model_path) as f:
            return Model.load(f)
    finally:
        disable_logging(False)


def damaging_score(edit_id, mwapi_session):
    """
    Return damaging score for an edit id.

    Args:
        edit_id: revision id to score.
        mwapi_session: session handed to the revscoring API extractor.

    Returns:
        The model's ``"prediction"`` for the revision. When feature
        extraction or scoring raises (e.g. revscoring on deleted
        revisions), the edit is counted as damaging and ``True`` is
        returned.

    Raises:
        Whatever opening or loading the model file raises; only
        extraction and scoring errors are swallowed.
    """
    # The model file does not change between edits, so it is read from
    # disk only on the first call instead of once per scored edit.
    model = _load_damaging_model()
    extractor = api.Extractor(mwapi_session)
    try:
        values = extractor.extract(edit_id, model.features)
        return model.score(values)["prediction"]
    except Exception:
        # handling revscoring deleted revisions: keep the best-effort
        # fallback, but leave a trace for anyone debugging the scores.
        logging.getLogger(__name__).debug(
            "could not score revision %s; counting it as damaging",
            edit_id,
            exc_info=True,
        )
        return True
def disable_logging(config):
    """
    Enable or disable logging.

    ``config`` is used as the ``disable_existing_loggers`` entry of a
    minimal (version 1) ``logging.config.dictConfig`` dictionary: pass
    ``True`` to switch the existing loggers off and ``False`` to switch
    them back on.
    """
    logging_settings = dict(version=1, disable_existing_loggers=config)
    logging.config.dictConfig(logging_settings)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
File Metadata
Details
Attached
Mime Type
text/plain; charset=utf-8
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
9235154
Default Alt Text
vandalized_articles_checker.py (4 KB)
Attached To
Mode
P17709 vandalized_articles_checker.py
Attached
Detach File
Event Timeline
Log In to Comment