Page MenuHomePhabricator
Paste P17709

vandalized_articles_checker.py
ActivePublic

Authored by kevinbazira on Nov 8 2021, 4:39 PM.
Referenced Files
F34741004: vandalized_articles_checker.py
Nov 10 2021, 5:22 PM
F34740900: vandalized_articles_checker.py
Nov 10 2021, 3:56 PM
F34738193: vandalized_articles_checker.py
Nov 8 2021, 4:39 PM
Subscribers
Tokens
"Like" token, awarded by ACraze.
"""
``vandalized_articles_checker -h``
::
"vandalized_articles_checker.py" is a python utility that takes
article_titles as command-line arguments then uses the ORES editquality
damaging model to return a percentage of damaging edits per article.
Usage:
vandalized_articles_checker -h | --help
vandalized_articles_checker <input>...
Options:
-h --help Print this documentation.
<input> List of input article titles
e.g: "Michael Jordan" "Barack Obama"
"""
import functools
import logging.config

import mwapi
from docopt import docopt
from revscoring import Model
from revscoring.extractors import api
from tabulate import tabulate
from tqdm import tqdm
def main(argv=None):
    """
    Command-line entry point.

    Parses ``argv`` against the module docstring with docopt, opens an API
    session against en.wikipedia.org, computes the percentage of damaging
    edits for every requested article title and prints the result as a table.
    """
    parsed_args = docopt(__doc__, argv=argv)
    titles = parsed_args["<input>"]

    session = mwapi.Session(
        host="https://en.wikipedia.org",
        user_agent="vandalized_articles_checker",
    )

    rows = articles_vandalism_percentages(titles, session)
    vandalized_articles_table(rows)
def vandalized_articles_table(articles_and_vandalism_percentages):
    """
    Print a grid-formatted table of articles and their vandalism percentages.

    ``articles_and_vandalism_percentages`` is handed to ``tabulate`` as the
    table rows, displayed under the "Article" and "Percentage" headers.
    """
    column_headers = ["Article", "Percentage"]
    rendered = tabulate(
        articles_and_vandalism_percentages,
        headers=column_headers,
        tablefmt="grid",
    )
    print(rendered)
def articles_vandalism_percentages(article_titles, mwapi_session):
    """
    Build ``[article_title, percentage]`` rows for the given article titles.

    For each title the edit ids are fetched, every edit is scored, and the
    percentage of damaging edits is computed. Two tqdm progress bars are
    shown: one across all articles and one across the edits of the article
    currently being analysed.
    """
    rows = []
    for article_title in tqdm(
        article_titles, desc="progress analysing all articles"
    ):
        revision_ids = edit_ids(article_title, mwapi_session)
        edits_bar_label = "analysing edits for '" + article_title + "'"

        scores = []
        for revision_id in tqdm(revision_ids, desc=edits_bar_label):
            scores.append(damaging_score(revision_id, mwapi_session))

        rows.append([article_title, percentage_of_damaging_edits(scores)])
    return rows
def percentage_of_damaging_edits(damaging_edits_scores):
    """
    Return the percentage of damaging edits in a list of edit scores.

    ``damaging_edits_scores`` is a list with one entry per edit; entries
    equal to ``True`` are counted as damaging.

    Returns a float between 0 and 100. An empty list yields ``0.0`` instead
    of raising ``ZeroDivisionError``: with no edits there is nothing damaging
    to report.
    """
    number_of_edits = len(damaging_edits_scores)
    if number_of_edits == 0:
        return 0.0
    number_of_damaging_edits = damaging_edits_scores.count(True)
    return (number_of_damaging_edits / number_of_edits) * 100
def edit_ids(article_title, mwapi_session):
    """
    Return the revision ids of an article's edits.

    Issues a single ``prop=revisions`` query for ``article_title`` through
    ``mwapi_session`` and collects the ``revid`` of every revision listed
    for the first page of the response. At most 500 revisions are requested,
    which is the upper bound of the MediaWiki API's ``rvlimit`` (1 - 500).
    """
    query_params = {
        "action": "query",
        "prop": "revisions",
        "titles": article_title,
        "rvlimit": "500",
        "rvprop": "ids|timestamp|user",
        "rvslots": "main",
        "formatversion": "2",
        "format": "json",
    }
    response = mwapi_session.get(**query_params)
    first_page = response["query"]["pages"][0]

    revision_ids = []
    for revision in first_page["revisions"]:
        revision_ids.append(revision["revid"])
    return revision_ids
@functools.lru_cache(maxsize=1)
def _load_damaging_model():
    """Load the enwiki damaging model from disk once and reuse it afterwards."""
    # Logging is switched off only for the duration of the load; the
    # ``finally`` guarantees it is switched back on even if loading fails.
    disable_logging(True)
    try:
        with open("enwiki.damaging.gradient_boosting.model") as f:
            return Model.load(f)
    finally:
        disable_logging(False)


def damaging_score(edit_id, mwapi_session):
    """
    Return the damaging prediction for an edit id.

    The model is loaded on the first call and cached, so scoring many edits
    does not re-read the model file for each one. Feature values for
    ``edit_id`` are extracted through ``mwapi_session`` and the model's
    ``"prediction"`` is returned.

    If extraction or scoring fails (e.g. revscoring cannot handle a deleted
    revision) the edit is counted as damaging and ``True`` is returned; the
    failure is logged at debug level rather than silently discarded.
    """
    model = _load_damaging_model()
    extractor = api.Extractor(mwapi_session)
    try:
        values = extractor.extract(edit_id, model.features)
        return model.score(values)["prediction"]
    except Exception:
        # Deliberately best-effort: one unscorable revision must not abort
        # the whole run, so fall back to the original default of True.
        logging.getLogger(__name__).debug(
            "could not score revision %s; counting it as damaging",
            edit_id,
            exc_info=True,
        )
        return True
def disable_logging(config):
    """
    Enable or disable logging.

    Applies a minimal ``logging.config.dictConfig`` configuration whose
    ``disable_existing_loggers`` entry is set to ``config``: pass ``True``
    to disable logging, ``False`` to enable it again.
    """
    logging_settings = dict(version=1, disable_existing_loggers=config)
    logging.config.dictConfig(logging_settings)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()