def _initialize_tracer():
    """Build and initialise the Jaeger tracer used by the web application.

    The agent location is read from the ``JAEGER_AGENT_HOST`` and
    ``JAEGER_AGENT_PORT`` environment variables. A variable that is not set
    is left out of the configuration instead of being forwarded as ``None``
    (NOTE(review): jaeger_client is then expected to apply its own agent
    defaults -- confirm against the installed version).

    :return: the value returned by ``Config.initialize_tracer()``
    :raises ValueError: if ``JAEGER_AGENT_PORT`` is set but not an integer
    """
    local_agent = {}
    agent_host = os.environ.get('JAEGER_AGENT_HOST')
    if agent_host:
        local_agent['reporting_host'] = agent_host
    agent_port = os.environ.get('JAEGER_AGENT_PORT')
    if agent_port:
        # Environment values are always strings; pass the port on as a number.
        local_agent['reporting_port'] = int(agent_port)

    config = Config(
        config={
            # Constant sampler with param 1: every request is sampled.
            'sampler': {'type': 'const', 'param': 1},
            'logging': True,
            'local_agent': local_agent,
        },
        # Service name can be arbitrary string describing this particular web service.
        service_name="integraality",
        validate=True,
    )
    tracer = config.initialize_tracer()
    # Announce only once the tracer has actually been set up.
    print("Tracing enabled!")
    return tracer
if __name__ == '__main__':
    # Running this file directly (as opposed to ``flask run``) has to start a
    # server itself; the tracer is already set up at import time.
    if os.uname()[1].startswith('tools-webgrid'):
        # Hosts whose name starts with 'tools-webgrid' serve the app over FastCGI.
        from flup.server.fcgi_fork import WSGIServer
        WSGIServer(app).run()
    elif os.environ.get('LOCAL_ENVIRONMENT', False):
        # Bind to every interface so the development server can be reached
        # from outside the machine/container it runs in.
        app.run(host='0.0.0.0')
    else:
        app.run()
@staticmethod
def extract_elements_from_template_param(template_param):
    """Split one parsed template parameter into a (name, value) pair.

    The text before the first ``=`` is the name, returned without
    surrounding whitespace. The text after it is the value, returned with
    every ``{{!}}`` replaced by a literal pipe character. A parameter
    without ``=`` yields an empty value.
    """
    name, _separator, raw_value = template_param.partition(u'=')
    cleaned_value = raw_value.replace('{{!}}', '|')
    return (name.strip(), cleaned_value)
+ with opentracing.tracer.start_active_span('pages_processor_make_stats_objects_start_templates_with_params') as scope: + start_templates_with_params = [ + (template, params) for (template, params) in all_templates_with_params if + template.title(with_ns=False) == self.template_name + ] + scope.span.log_kv({'event': 'start_templates_with_params retrieved', 'result': start_templates_with_params}) if not start_templates_with_params: msg = ( "No start template '%s' found, which is an impossible situation. " "This is potentially an upstream pywikibot issue." % self.template_name ) raise ConfigException(msg) if len(start_templates_with_params) > 1: pywikibot.warn("More than one template on the page %s" % page.title()) (template, params) = start_templates_with_params[0] - parsed_config = self.parse_config_from_params(params) - config = self.parse_config(parsed_config) + with opentracing.tracer.start_active_span('pages_processor_make_stats_objects_parse_config_from_params') as scope: + parsed_config = self.parse_config_from_params(params) + scope.span.log_kv({'event': 'config parsed', 'result': parsed_config}) + with opentracing.tracer.start_active_span('pages_processor_make_stats_objects_parse_config') as scope: + config = self.parse_config(parsed_config) + scope.span.log_kv({'event': 'config created', 'result': config}) try: - return PropertyStatistics(**config) + with opentracing.tracer.start_active_span('pages_processor_make_stats_objects_create_property_statistics') as scope: + property_statistics = PropertyStatistics(**config) + scope.span.log_kv({'event': 'PropertyStatistics object created', 'result': property_statistics}) + return property_statistics except TypeError: raise ConfigException("The template parameters are incorrect.") def process_page(self, page): stats = self.make_stats_object_for_page(page) try: output = stats.retrieve_and_process_data() except QueryException as e: raise ConfigException(e) new_text = self.replace_in_page(output, page.get()) 
@staticmethod
def parse_config_properties(properties_string):
    """Parse the ``properties`` template parameter into PropertyConfig entries.

    The parameter is a comma-separated list of ``key`` or ``key:title``
    items, where ``key`` is ``property``, ``property/qualifier`` or
    ``property/value/qualifier``. Whitespace around an item and around its
    key is ignored, only the first colon separates the key from the title
    (so a title may itself contain colons), and items with an empty key
    are skipped.

    :param properties_string: raw value of the ``properties`` parameter
    :return: list of PropertyConfig objects, in the order given
    """
    properties_data = []
    for prop in properties_string.split(','):
        (key, separator, title) = prop.strip().partition(':')
        key = key.strip()
        if not key:
            continue
        if not separator:
            # No colon at all: the item carries no title.
            title = None
        splitted = key.split('/')
        if len(splitted) == 3:
            (property_name, value, qualifier) = splitted
        elif len(splitted) == 2:
            (property_name, value, qualifier) = (splitted[0], None, splitted[1])
        else:
            # Any other shape is taken verbatim as a plain property.
            (property_name, value, qualifier) = (key, None, None)
        entry = PropertyConfig(property=property_name, title=title, qualifier=qualifier, value=value)
        properties_data.append(entry)
    return properties_data
}} GROUP BY ?grouping HAVING (?count >= {self.grouping_threshold}) ORDER BY DESC(?count) LIMIT 1000 """) grouping_counts = collections.OrderedDict() grouping_groupings = collections.OrderedDict() sq = pywikibot.data.sparql.SparqlQuery() queryresult = sq.select(query) if not queryresult: raise QueryException("No result when querying groupings.") for resultitem in queryresult: qid = resultitem.get('grouping').replace(u'http://www.wikidata.org/entity/', u'') grouping_counts[qid] = int(resultitem.get('count')) if self.higher_grouping: value = resultitem.get('higher_grouping') if value: value = value.replace(u'http://www.wikidata.org/entity/', u'') grouping_groupings[qid] = value return (grouping_counts, grouping_groupings) def get_query_for_items_for_property_positive(self, property, grouping): query = f(""" SELECT DISTINCT ?entity ?entityLabel ?value ?valueLabel WHERE {{ ?entity {self.selector_sparql} .""") if grouping: query += f(""" ?entity wdt:{self.grouping_property} wd:{grouping} .""") else: query += f(""" MINUS {{ ?entity wdt:{self.grouping_property} [] . }}""") query += f(""" ?entity p:{property} ?prop . OPTIONAL {{ ?prop ps:{property} ?value }} SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }} }} """) return query def get_query_for_items_for_property_negative(self, property, grouping): query = f(""" SELECT DISTINCT ?entity ?entityLabel WHERE {{ ?entity {self.selector_sparql} .""") if grouping: query += f(""" ?entity wdt:{self.grouping_property} wd:{grouping} . MINUS {{""") else: query += f(""" MINUS {{ {{?entity wdt:{self.grouping_property} [] .}} UNION""") query += f(""" {{?entity a wdno:{property} .}} UNION {{?entity wdt:{property} ?prop .}} }} SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". 
}} }} """) return query def get_property_info(self, property): """ Get the usage counts for a property for the groupings :param prop: Wikidata Pid of the property :return: (Ordered) dictionary with the counts per grouping """ query = f(""" SELECT ?grouping (COUNT(DISTINCT ?entity) as ?count) WHERE {{ ?entity {self.selector_sparql} . ?entity wdt:{self.grouping_property} ?grouping . FILTER EXISTS {{ ?entity p:{property} [] }} . }} GROUP BY ?grouping HAVING (?count >= {self.property_threshold}) ORDER BY DESC(?count) LIMIT 1000 """) result = collections.OrderedDict() sq = pywikibot.data.sparql.SparqlQuery() queryresult = sq.select(query) if not queryresult: return None for resultitem in queryresult: qid = resultitem.get('grouping').replace(u'http://www.wikidata.org/entity/', u'') result[qid] = int(resultitem.get('count')) return result def get_qualifier_info(self, property, qualifier, value="[]"): """ Get the usage counts for a qulifier for the groupings :param property: Wikidata Pid of the property :param qualifier: Wikidata Pid of the qualifier :return: (Ordered) dictionary with the counts per grouping """ query = f(""" SELECT ?grouping (COUNT(DISTINCT ?entity) as ?count) WHERE {{ ?entity {self.selector_sparql} . ?entity wdt:{self.grouping_property} ?grouping . FILTER EXISTS {{ ?entity p:{property} [ ps:{property} {value} ; pq:{qualifier} [] ] }} . 
}} GROUP BY ?grouping HAVING (?count >= {self.property_threshold}) ORDER BY DESC(?count) LIMIT 1000 """) result = collections.OrderedDict() sq = pywikibot.data.sparql.SparqlQuery() queryresult = sq.select(query) if not queryresult: return None for resultitem in queryresult: qid = resultitem.get('grouping').replace(u'http://www.wikidata.org/entity/', u'') result[qid] = int(resultitem.get('count')) return result def get_property_info_no_grouping(self, property): """ Get the usage counts for a property without a grouping :param property: Wikidata Pid of the property :return: (Ordered) dictionary with the counts per grouping """ query = f(""" SELECT (COUNT(?entity) AS ?count) WHERE {{ ?entity {self.selector_sparql} . MINUS {{ ?entity wdt:{self.grouping_property} _:b28. }} FILTER(EXISTS {{ ?entity p:{property} _:b29. }}) }} GROUP BY ?grouping ORDER BY DESC (?count) LIMIT 10 """) return self._get_count_from_sparql(query) def get_qualifier_info_no_grouping(self, property, qualifier, value='[]'): """ Get the usage counts for a qualifier without a grouping :param property: Wikidata Pid of the property :param qualifier: Wikidata Pid of the qualifier :return: (Ordered) dictionary with the counts per grouping """ query = f(""" SELECT (COUNT(?entity) AS ?count) WHERE {{ ?entity {self.selector_sparql} . MINUS {{ ?entity wdt:{self.grouping_property} _:b28. }} FILTER EXISTS {{ ?entity p:{property} [ ps:{property} {value} ; pq:{qualifier} [] ] }} . }} GROUP BY ?grouping ORDER BY DESC (?count) LIMIT 10 """) return self._get_count_from_sparql(query) def get_totals_for_property(self, property): """ Get the totals of entities with that property :param prop: Wikidata Pid of the property. :return: number of games found """ query = f(""" SELECT (COUNT(?item) as ?count) WHERE {{ ?item {self.selector_sparql} FILTER EXISTS {{ ?item p:{property}[] }} . 
@staticmethod
def _get_percentage(count, total):
    """Return ``count`` as a percentage of ``total``, rounded to two decimals.

    :param count: number of matching items; a falsy value (0 or None) gives 0
    :param total: size of the whole set; None, 0 and negative values are
        treated as 1 so the division is always defined
    :return: 0 when there is no count, otherwise the percentage as a float
    """
    if not count:
        return 0
    # ``total`` can be None when the count query behind it produced no result;
    # comparing None with an int would raise TypeError on Python 3.
    denominator = max(total or 0, 1)
    return round(1.0 * count / denominator * 100, 2)
def format_higher_grouping_text(self, higher_grouping_value):
    """Render the wikitext table cell for one higher-grouping value.

    The value is rendered, in order of precedence, as:

    * ``{{Q|...}}`` when it is exactly ``Q`` followed by digits;
    * a centred 100px ``[[File:...]]`` when it is a Commons
      ``Special:FilePath`` URL (the file name then also becomes the sort
      value);
    * ``{{Flag|...}}`` when ``self.higher_grouping_type`` is ``"country"``;
    * the raw value otherwise.

    :param higher_grouping_value: value of the higher grouping, as a string
    :return: one wikitext cell line carrying a ``data-sort-value`` attribute
    """
    # Templates keyed by higher_grouping_type; filled in only when used.
    type_templates = {
        "country": u"{{Flag|%s}}",
    }
    file_path_pattern = r"http://commons\.wikimedia\.org/wiki/Special:FilePath/(.*?)$"

    sort_value = higher_grouping_value
    # Anchored at the end so that values merely starting with "Q<digits>"
    # (e.g. "Q1 2020") are not mistaken for item ids.
    if re.match(r"Q\d+\Z", higher_grouping_value):
        cell_text = u'{{Q|%s}}' % (higher_grouping_value,)
    else:
        file_match = re.match(file_path_pattern, higher_grouping_value)
        if file_match:
            image_name = file_match.group(1)
            cell_text = u'[[File:%s|center|100px]]' % (image_name,)
            sort_value = image_name
        elif self.higher_grouping_type in type_templates:
            cell_text = type_templates[self.higher_grouping_type] % (higher_grouping_value,)
        else:
            cell_text = higher_grouping_value
    return u'| data-sort-value="%s"| %s\n' % (sort_value, cell_text)
def make_footer(self):
    """Build the closing "Totals" row of the statistics table.

    Queries the overall item total, then one total per configured
    property (or per property/qualifier pair), and renders each through
    ``self.cell_template``.

    :return: wikitext for the totals row followed by the table end marker
    """
    total_items = self.get_totals()
    text = u'|- class="sortbottom"\n|'
    if self.higher_grouping:
        # One extra empty cell to stay aligned with the higher-grouping column.
        text += u"|\n|"
    text += u"'''Totals''' (all items):\n| %s\n" % (total_items,)
    for prop_entry in self.properties:
        property_name = prop_entry.property
        if prop_entry.qualifier:
            # Restrict to the configured value, as the per-grouping and
            # no-grouping counts do, so the total covers the same statements.
            value = prop_entry.value or '[]'
            totalprop = self.get_totals_for_qualifier(
                property=property_name,
                qualifier=prop_entry.qualifier,
                value=value,
            )
        else:
            totalprop = self.get_totals_for_property(property=property_name)
        percentage = self._get_percentage(totalprop, total_items)
        text += u'| {{%s|%s|%s}}\n' % (self.cell_template, percentage, totalprop)
    text += u'|}\n'
    return text
def main(*args):
    """Print the statistics wikitext for a fixed sample configuration.

    Positional arguments are accepted but not used.
    """
    sample_properties = [PropertyConfig(pid) for pid in ('P21', 'P19')]
    logging.info("Main function...")
    settings = {
        'properties': sample_properties,
        'selector_sparql': u'wdt:P31 wd:Q41960',
        'grouping_property': u'P551',
        'stats_for_no_group': True,
        'grouping_threshold': 5,
        'property_threshold': 1,
    }
    stats = PropertyStatistics(**settings)
    wikitext = stats.retrieve_and_process_data()
    print(wikitext)