Page MenuHomePhabricator
Paste P10884

English Wikipedia Drafttopic thresholds
ActivePublic

Authored by Halfak on Apr 3 2020, 3:55 PM.
$ python get_thresholds.py enwiki drafttopic
------------------------------------------- -------- --------- --------- ------
label pop rate threshold precision recall
Culture.Biography.Biography* 0.123 0.335 0.7 0.893
Culture.Biography.Women 0.015 0.731 0.5 0.531
Culture.Food and drink 0.002 0.377 0.5 0.702
Culture.Internet culture 0.004 0.884 0.7 0.597
Culture.Linguistics 0.007 0.275 0.7 0.725
Culture.Literature 0.016 0.499 0.5 0.678
Culture.Media.Books 0.004 0.684 0.5 0.541
Culture.Media.Entertainment 0.004 0.191 0.15 0.62
Culture.Media.Films 0.011 0.467 0.7 0.774
Culture.Media.Media* 0.059 0.698 0.7 0.726
Culture.Media.Music 0.024 0.235 0.7 0.843
Culture.Media.Radio 0.002 0.54 0.7 0.754
Culture.Media.Software 0.001 0.9 < 0.15
Culture.Media.Television 0.009 0.726 0.7 0.531
Culture.Media.Video games 0.003 0.579 0.7 0.846
Culture.Performing arts 0.003 0.579 0.5 0.576
Culture.Philosophy and religion 0.011 0.187 0.3 0.688
Culture.Sports 0.071 0.041 0.7 0.963
Culture.Visual arts.Architecture 0.011 0.736 0.7 0.583
Culture.Visual arts.Comics and Anime 0.002 0.646 0.5 0.69
Culture.Visual arts.Fashion 0.001 0.506 0.3 0.658
Culture.Visual arts.Visual arts* 0.018 0.77 0.7 0.57
Geography.Geographical 0.024 0.428 0.7 0.682
Geography.Regions.Africa.Africa* 0.008 0.809 0.7 0.705
Geography.Regions.Africa.Central Africa 0.0 0.9 < 0.15
Geography.Regions.Africa.Eastern Africa 0.0 0.89 0.5 0.643
Geography.Regions.Africa.Northern Africa 0.001 0.831 0.5 0.576
Geography.Regions.Africa.Southern Africa 0.001 0.583 0.5 0.682
Geography.Regions.Africa.Western Africa 0.001 0.482 0.3 0.778
Geography.Regions.Americas.Central America 0.003 0.764 0.7 0.504
Geography.Regions.Americas.North America 0.064 0.55 0.7 0.603
Geography.Regions.Americas.South America 0.006 0.644 0.7 0.631
Geography.Regions.Asia.Asia* 0.046 0.531 0.7 0.813
Geography.Regions.Asia.Central Asia 0.001 0.812 0.5 0.647
Geography.Regions.Asia.East Asia 0.011 0.672 0.7 0.655
Geography.Regions.Asia.North Asia 0.001 0.48 0.15 0.616
Geography.Regions.Asia.South Asia 0.015 0.105 0.7 0.9
Geography.Regions.Asia.Southeast Asia 0.006 0.47 0.7 0.734
Geography.Regions.Asia.West Asia 0.011 0.298 0.7 0.802
Geography.Regions.Europe.Eastern Europe 0.013 0.556 0.7 0.712
Geography.Regions.Europe.Europe* 0.076 0.662 0.7 0.623
Geography.Regions.Europe.Northern Europe 0.031 0.654 0.7 0.542
Geography.Regions.Europe.Southern Europe 0.013 0.643 0.7 0.576
Geography.Regions.Europe.Western Europe 0.019 0.7 0.7 0.508
Geography.Regions.Oceania 0.015 0.211 0.7 0.828
History and Society.Business and economics 0.01 0.14 0.15 0.721
History and Society.Education 0.007 0.27 0.3 0.584
History and Society.History 0.011 0.149 0.15 0.683
History and Society.Military and warfare 0.014 0.773 0.7 0.531
History and Society.Politics and government 0.028 0.571 0.7 0.539
History and Society.Society 0.013 0.151 0.15 0.643
History and Society.Transportation 0.015 0.416 0.7 0.843
STEM.Biology 0.034 0.088 0.7 0.892
STEM.Chemistry 0.002 0.557 0.3 0.622
STEM.Computing 0.003 0.309 0.15 0.759
STEM.Earth and environment 0.005 0.645 0.7 0.623
STEM.Engineering 0.005 0.842 0.7 0.562
STEM.Libraries & Information 0.001 0.396 0.15 0.576
STEM.Mathematics 0.0 0.458 0.15 0.676
STEM.Medicine & Health 0.006 0.785 0.7 0.508
STEM.Physics 0.001 0.436 0.15 0.64
STEM.STEM* 0.069 0.469 0.7 0.853
STEM.Space 0.006 0.173 0.7 0.893
STEM.Technology 0.005 0.289 0.15 0.711
------------------------------------------- -------- --------- --------- ------

Event Timeline

Here's the content of the get_thresholds.py script:

"""
Queries for optimal thresholds from ORES.


Usage:
    get_thresholds (-h|--help)
    get_thresholds <wiki> <model>

Options:
    -h --help  Prints this documentation
    <wiki>     The DBname of the wiki to query thresholds for.
    <model>    The name of the model to get thresholds for.
"""
import docopt
import requests
from tabulate import tabulate

# Base URL of the ORES service that every request in this script is sent to.
ORES_HOST = "https://ores.wikimedia.org"
# Path of the scores endpoint; "/<wiki>/" is appended to it for each request.
PATH = "/v3/scores"
# Precision targets tried in list order by get_best_threshold(); the first
# target whose threshold reaches a recall of at least 0.5 is used for a label.
PRECISION_TARGETS = [0.7, 0.5, 0.3, 0.15]


def main(argv=None):
    """
    Command-line entry point: print a threshold table for one wiki/model.

    Parses `<wiki>` and `<model>` from `argv` (docopt falls back to the
    process arguments when `argv` is None), looks up every label of the model
    with its population rate, picks a threshold per label and prints all of it
    with tabulate.
    """
    parsed = docopt.docopt(__doc__, argv=argv)
    wiki, model = parsed['<wiki>'], parsed['<model>']

    # The column names are passed to tabulate as an ordinary first row, so
    # they are rendered like data rather than as a separated header.
    rows = [['label', 'pop rate', 'threshold', 'precision', 'recall']]
    rows.extend(
        [label, pop_rate, *get_best_threshold(wiki, model, label)]
        for label, pop_rate in get_labels(wiki, model)
    )

    print(tabulate(rows))


def get_labels(wiki, model, timeout=30):
    """
    Fetch the labels of a model together with their population rates.

    Requests the `params` and `statistics.rates` model information from ORES
    and pairs each label listed in the model's params with the population rate
    reported for it.

    Args:
        wiki: DBname of the wiki (e.g. "enwiki").
        model: Name of the model (e.g. "drafttopic").
        timeout: Seconds to wait for ORES before giving up. Without a timeout
            `requests` may block indefinitely on an unresponsive server.

    Returns:
        A list of `(label, population_rate)` tuples, in the order the labels
        appear in the model's params.

    Raises:
        requests.HTTPError: If ORES answers with a 4xx/5xx status.
        requests.Timeout: If ORES does not answer within `timeout` seconds.
        KeyError: If the response lacks the expected fields or a label has no
            population rate.
    """
    response = requests.get(
        ORES_HOST + PATH + "/" + wiki + "/",
        params={
            'models': model,
            'model_info': "params|statistics.rates"
        },
        timeout=timeout
    )
    # Surface a failed request as a clear HTTP error instead of an opaque
    # KeyError / JSON decoding error further down.
    response.raise_for_status()
    doc = response.json()

    model_doc = doc[wiki]['models'][model]
    labels = model_doc['params']['labels']
    pop_rates = model_doc['statistics']['rates']['population']
    return [(label, pop_rates[label]) for label in labels]


def get_threshold(wiki, model, label, target, timeout=30):
    """
    Ask ORES for the threshold that maximizes recall at a precision target.

    Sends the threshold optimization
    `maximum recall @ precision >= <target>` for `label` to ORES.

    Args:
        wiki: DBname of the wiki (e.g. "enwiki").
        model: Name of the model (e.g. "drafttopic").
        label: The model label to optimize the threshold for.
        target: Minimum precision the threshold must achieve.
        timeout: Seconds to wait for ORES before giving up. Without a timeout
            `requests` may block indefinitely on an unresponsive server.

    Returns:
        A `(threshold, recall)` tuple, or `(None, None)` when the response
        does not contain exactly one non-null result for the label.

    Raises:
        requests.HTTPError: If ORES answers with a 4xx/5xx status.
        requests.Timeout: If ORES does not answer within `timeout` seconds.
        KeyError: If the response lacks the expected fields.
    """
    # repr() quotes the label so that dots and spaces inside it are not read
    # as path separators of the model_info expression.
    optimization = (
        "statistics.thresholds.{0}.'maximum recall @ precision >= {1}'"
        .format(repr(label), target)
    )
    response = requests.get(
        ORES_HOST + PATH + "/" + wiki + "/",
        params={
            'models': model,
            'model_info': optimization
        },
        timeout=timeout
    )
    # Surface a failed request as a clear HTTP error instead of an opaque
    # KeyError / JSON decoding error further down.
    response.raise_for_status()
    doc = response.json()

    thresholds = doc[wiki]['models'][model]['statistics']['thresholds'][label]
    # NOTE(review): a null entry presumably means no threshold satisfies the
    # precision target -- confirm against the ORES API documentation.
    if len(thresholds) == 1 and thresholds[0] is not None:
        return thresholds[0]['threshold'], thresholds[0]['recall']
    return None, None


def get_best_threshold(wiki, model, label, min_recall=0.5):
    """
    Pick the threshold of the first precision target with enough recall.

    Walks PRECISION_TARGETS in list order and returns the first target whose
    recall-maximizing threshold reaches at least `min_recall`.

    Args:
        wiki: DBname of the wiki (e.g. "enwiki").
        model: Name of the model (e.g. "drafttopic").
        label: The model label to find a threshold for.
        min_recall: Lowest recall a threshold must reach to be accepted.

    Returns:
        A `(threshold, precision, recall)` tuple. When no target qualifies,
        the fallback `(0.9, "< <lowest target>", None)` is returned, where the
        precision entry is a string naming the lowest target that was tried.
    """
    for target in PRECISION_TARGETS:
        threshold, recall = get_threshold(wiki, model, label, target)
        if recall is not None and recall >= min_recall:
            return threshold, target, recall

    # Derive the fallback label from the configured targets so it cannot go
    # stale when PRECISION_TARGETS is edited (yields "< 0.15" for the default).
    if PRECISION_TARGETS:
        fallback_precision = "< {0}".format(min(PRECISION_TARGETS))
    else:
        fallback_precision = None
    return 0.9, fallback_precision, None


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()