import json
import urllib.parse
from multiprocessing.dummy import Pool

import requests

TARGET_IMPRESSIONS = 1000

REMOVE_QUERIES = set([
    '298005 b.c.',
    'antonio parent',
    'brook valley division monroe north carolina carolina',
    'LAW IN LONDON',
    'Leix\u00f5es S.C.',
    'what if lamba is lower than required',
    'aa rochester ny meetings',
    'Antiochus Epiphanes was another: after his detention at Rome and attempts',
    'compare roads built Marcos and Aquino',
    'examples of confidential email message',
    'highliting text and calculations in word documents',
    'how would you test for starch?explain your answer',
    'Pets parallelism',
    'red room tor sites',
    'SANTA CLAUS PRINT OUT 3D PAPERTOYS',
    'tayps of wlding difats',
    'top out at',
    'treewards',
    'Unblock.request reason isis',
    'when was it found',
])

REWRITE_QUERIES = {
    ', enzymes concentration': 'enzymes concentration',
    'Death of a Red Heroine by Qiu Xialong:': 'Death of a Red Heroine by Qiu Xialong',
    'ORDERS OF IRDA ABOUT SURRENDER': 'orders of IRDA about surrender',
    'PURI TEMPLE': 'Puri temple',
    'rape of nanking]': 'rape of nanking',
    'Trust (property': 'Trust(property)',
}


def take(n, iterable):
    # Yield at most the first n items of iterable.
    i = 0
    for x in iterable:
        if i >= n:
            break
        yield x
        i += 1


def simplify_title(title):
    return title.replace(',', '').replace('.', '')


def lookup_article(orig_title, ask=False):
    # /w/api.php?action=query&format=json&titles=&generator=prefixsearch&redirects=1&gpssearch=Old+english+personal+pronouns
    url = 'https://en.wikipedia.org/w/api.php'
    query = {
        'action': 'query',
        'format': 'json',
        'formatversion': 2,
        'generator': 'prefixsearch',
        'redirects': 1,
        'gpssearch': orig_title,
        'gpslimit': 10,
        'gpsprofile': 'strict',
    }
    response = requests.get(url, params=query)
    payload = response.json()
    if 'query' not in payload:
        # No results
        return None, orig_title
    results = sorted(payload['query']['pages'], key=lambda x: x['index'])
    # Attach the pre-redirect title to each result so we can match against
    # what prefixsearch actually saw.
    if 'redirects' in payload['query']:
        for redirect in payload['query']['redirects']:
            for result in results:
                if redirect['to'] == result['title']:
                    if 'orig_title' in result:
                        raise Exception('duplicate redirect')
                    result['orig_title'] = redirect['from']
                    break
            else:
                raise Exception('redirect not found in results')
    for result in results:
        if 'orig_title' not in result:
            result['orig_title'] = result['title']
    # Try progressively looser matches between the requested title and the
    # pre-redirect titles of the results.
    transformers = [
        lambda x: x,
        lambda x: x.replace(',', ''),
        lambda x: x.replace(':', ''),
        lambda x: x.lower(),
    ]
    for transformer in transformers:
        title = transformer(orig_title)
        for result in results:
            if transformer(result['orig_title']) == title:
                if 'pageid' in result:
                    return result['pageid'], result['title']
                else:
                    print("No pageid: %s" % (json.dumps(result)))
                    return None, result['title']
    if ask and len(results) > 0:
        # No automatic match; ask a human to disambiguate.
        while True:
            print("Requested title: %s" % (orig_title))
            for i, result in enumerate(results):
                print("\t%d - %s" % (i, result['orig_title']))
            print("\t%d - NONE OF THE ABOVE" % (i + 1))
            chosen_text = input("Choose a title: ")
            try:
                index = int(chosen_text)
                if index == len(results):
                    return None, orig_title
                else:
                    return results[index]['pageid'], results[index]['title']
            except (ValueError, IndexError):
                print("Invalid input\n")
    return None, orig_title


def lookup_page_views(title):
    # Mean daily page views for the week of 2017-08-20 through 2017-08-26.
    quoted = urllib.parse.quote(title).replace('/', '%2F')
    url = ('https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/'
           'en.wikipedia/all-access/all-agents/%s/daily/2017082000/2017082600'
           % (quoted))
    response = requests.get(url)
    if response.status_code != 200:
        return None
    items = response.json()['items']
    if len(items) > 0:
        return sum([item['views'] for item in items]) / len(items)
    else:
        return 1
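
# Usage sketch for the two lookup helpers above. The title and return values
# shown are illustrative assumptions, not real API output:
#
#     pageid, title = lookup_article('old english personal pronouns')
#     # prefixsearch plus redirect resolution yields the canonical title,
#     # e.g. (some_pageid, 'Old English personal pronouns'), or (None, ...)
#     # when nothing matches
#     views = lookup_page_views(title)
#     # mean daily views over 2017-08-20..2017-08-26; None on HTTP error,
#     # 1 if the API returns an empty item list
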
def rewrite_query(query):
    query = query.strip()
    if query in REMOVE_QUERIES:
        return None
    if query in REWRITE_QUERIES:
        return REWRITE_QUERIES[query]
    return query


def gen_work(ask=False, result=None):
    if result is None:
        result = {}
    failed_lookup = list()
    failed_pageviews = list()

    def work(judgement):
        query = rewrite_query(judgement['query'])
        if query is None:
            return
        if judgement['score'] is None:
            return
        if not ask:
            print('.', end='', flush=True)
        article_id, title = lookup_article(judgement['title'], ask)
        if article_id is None:
            failed_lookup.append(judgement)
            return
        score = float(judgement['score'])
        if article_id in result:
            if query in result[article_id]['scores']:
                # Keep the higher of the two scores when the same query was
                # judged twice for one article.
                print("\nDuplicate query `%s` for article `%s` with scores %.2f, %.2f" % (
                    query, title, score, result[article_id]['scores'][query]))
                result[article_id]['scores'][query] = max(
                    score, result[article_id]['scores'][query])
            else:
                result[article_id]['scores'][query] = score
        else:
            daily_views = lookup_page_views(title)
            if daily_views is None:
                # These are often very odd pages returned from non-wiki search,
                # like Portal:London/Did you know/05 2009, that have no page views.
                failed_pageviews.append(title)
            else:
                result[article_id] = {
                    'title': title,
                    'daily_views': daily_views,
                    'scores': {query: score},
                }
        return

    return (result, failed_lookup, failed_pageviews, work)


if __name__ == "__main__":
    with open('discernatron_all_scores.json', 'r') as f:
        data = json.loads(f.read())

    print("Collecting")
    (result, failed_lookup, failed_pageviews, work) = gen_work()
    # multiprocessing.dummy gives a thread pool, so work() can mutate the
    # shared result dict; imap_unordered is lazy, so force it with list().
    with Pool(5) as pool:
        list(pool.imap_unordered(work, data['scores']))
    print("\nDone collecting.")

    # Retry failed lookups interactively, merging into the same result dict.
    (_, _, _, work_ask) = gen_work(ask=True, result=result)
    print("Failed article lookups:")
    for judgement in failed_lookup:
        work_ask(judgement)
    print("")

    print("Failed page view lookups:")
    for title in failed_pageviews:
        print("\t%s" % (title))

    wme_config = {}
    for article_id, config in result.items():
        weekly_views = config['daily_views'] * 7
        impressions = TARGET_IMPRESSIONS * len(config['scores'])
        sample_rate = min(1, round(weekly_views / impressions, 2))
        wme_config[article_id] = {
            'sampleRate': sample_rate,
            'queries': list(config['scores'].keys()),
        }

    with open('config.json', 'w') as f:
        f.write(json.dumps(result))
    with open('wme_config.json', 'w') as f:
        f.write(json.dumps(wme_config))
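
# Assumed input shape, inferred from the fields work() reads; the record below
# is a hypothetical example, not real Discernatron data:
#
#     discernatron_all_scores.json:
#     {"scores": [{"query": "example query", "title": "Example Article",
#                  "score": "2.5"}, ...]}
#
# Worked example of the sampleRate math, with illustrative numbers: an article
# averaging 100 daily views with 3 judged queries has weekly_views = 700 and
# impressions = 3 * TARGET_IMPRESSIONS = 3000, so
# sampleRate = min(1, round(700 / 3000, 2)) = 0.23. At or above
# TARGET_IMPRESSIONS weekly views per judged query the rate caps at 1.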