diff --git a/integraality/app.py b/integraality/app.py
index ce1dbd4..aae481c 100755
--- a/integraality/app.py
+++ b/integraality/app.py
@@ -1,122 +1,122 @@
 #!/usr/bin/python
 # -*- coding: utf-8 -*-
 
 import os
 import traceback
 from time import perf_counter
 
 from flask import Flask, render_template, request
 from pages_processor import PagesProcessor, ProcessingException
 from sparql_utils import QueryException
 
 app = Flask(__name__)
 app.debug = True
 
 
 @app.route("/")
 def index():
     return render_template("index.html")
 
 
 @app.route("/update")
 def update():
     start_time = perf_counter()
     page_url = request.args.get("url")
     page_title = request.args.get("page")
     processor = PagesProcessor(page_url)
     try:
         processor.process_one_page(page_title)
         elapsed_time = perf_counter() - start_time
         return render_template(
             "update.html",
             page_title=page_title,
             page_url=page_url,
             elapsed_time=elapsed_time,
         )
     except QueryException as e:
         return render_template(
             "update_query_error.html",
             page_title=page_title,
             page_url=page_url,
             error_message=e,
             query=e.query,
         )
     except ProcessingException as e:
         return render_template(
             "update_error.html",
             page_title=page_title,
             page_url=page_url,
             error_message=e,
         )
     except Exception as e:
         return render_template(
             "update_unknown_error.html",
             page_title=page_title,
             page_url=page_url,
             error_message=traceback.format_exception(type(e), e, e.__traceback__),
         )
 
 
 @app.route("/queries")
 def queries():
     page_url = request.args.get("url")
     page_title = request.args.get("page")
     column_key = request.args.get("column") or request.args.get("property")
     processor = PagesProcessor(page_url)
     try:
         stats = processor.make_stats_object_for_page_title(page_title)
         grouping_arg = request.args.get("grouping")
         try:
             grouping = stats.GROUP_MAPPING(grouping_arg)
         except ValueError:
             grouping = stats.GROUP_MAPPING.__members__.get(grouping_arg, grouping_arg)
 
         column = stats.columns.get(column_key)
         positive_query = stats.get_query_for_items_for_property_positive(
             column, grouping
         )
         negative_query = stats.get_query_for_items_for_property_negative(
             column, grouping
         )
         formatted_predicate = stats.grouping_configuration.format_predicate_html()
         return render_template(
             "queries.html",
             page_title=page_title,
             page_url=page_url,
-            column=column_key,
+            column=column,
             grouping=request.args.get("grouping"),
             formatted_predicate=formatted_predicate,
             positive_query=positive_query,
             negative_query=negative_query,
         )
     except ProcessingException as e:
         return render_template(
             "queries_error.html",
             page_title=page_title,
             page_url=page_url,
             error_message=e,
         )
     except Exception as e:
         return render_template(
             "queries_unknown_error.html",
             page_title=page_title,
             page_url=page_url,
             error_message=traceback.format_exception(type(e), e, e.__traceback__),
         )
 
 
 @app.errorhandler(404)
 def page_not_found(error):
     return render_template("page_not_found.html", title="Page not found"), 404
 
 
 if __name__ == "__main__":
     if os.uname()[1].startswith("tools-webgrid"):
         from flup.server.fcgi_fork import WSGIServer
 
         WSGIServer(app).run()
     else:
         if os.environ.get("LOCAL_ENVIRONMENT", False):
             app.run(host="0.0.0.0")
         else:
             app.run()
diff --git a/integraality/column.py b/integraality/column.py
index e3d3861..0dd7c62 100644
--- a/integraality/column.py
+++ b/integraality/column.py
@@ -1,202 +1,231 @@
 #!/usr/bin/python
 # -*- coding: utf-8 -*-
 """
 Column configuration classes
 """
 
+import html
 import json
 import os
 from enum import Enum
 
 
 class GroupingType(Enum):
     YEAR = "year"
 
 
 class ColumnSyntaxException(Exception):
     pass
 
 
 class ColumnMaker:
     @staticmethod
     def make(key, title):
         current_dir = os.path.dirname(__file__)
         wikiprojects_path = os.path.join(current_dir, "wikiprojects.json")
         wikiprojects = json.load(open(wikiprojects_path, "r"))
         if key.startswith("P"):
             splitted = key.split("/")
             if len(splitted) == 3:
                 (property_name, value, qualifier) = splitted
             elif len(splitted) == 2:
                 (property_name, value, qualifier) = (splitted[0], None, splitted[1])
             else:
                 (property_name, value, qualifier) = (key, None, None)
             return PropertyColumn(
                 property=property_name, title=title, qualifier=qualifier, value=value
             )
         elif key.startswith("L"):
             return LabelColumn(language=key[1:])
         elif key.startswith("D"):
             return DescriptionColumn(language=key[1:])
         elif key in wikiprojects:
             wikiproject = wikiprojects.get(key)
             return SitelinkColumn(project=key, title=title)
         else:
             raise ColumnSyntaxException("Unknown column syntax %s" % key)
 
 
 class AbstractColumn:
     def get_info_query(self, property_statistics):
         """
         Get the usage counts for a column for the groupings
 
         :return: (str) SPARQL query
         """
         grouping_selector = "\n".join(property_statistics.grouping_configuration.get_grouping_selector())
         query = f""" SELECT ?grouping (COUNT(DISTINCT ?entity) as ?count) WHERE {{ ?entity {property_statistics.selector_sparql} . {grouping_selector} FILTER(EXISTS {{{self.get_filter_for_info()} }}) }} GROUP BY ?grouping HAVING (?count >= {property_statistics.property_threshold}) ORDER BY DESC(?count) LIMIT 1000 """
         return query
 
     def get_totals_query(self, property_statistics):
         """
         Get the totals of entities with the column set.
 
         :return: (str) SPARQL query
         """
         query = f""" SELECT (COUNT(*) as ?count) WHERE {{ ?entity {property_statistics.selector_sparql} FILTER(EXISTS {{{self.get_filter_for_info()} }}) }} """
         return query
 
     def get_info_no_grouping_query(self, property_statistics):
         """
         Get the usage counts for a column without a grouping
 
         :return: (str) SPARQL query
         """
         query = f""" SELECT (COUNT(*) AS ?count) WHERE {{ ?entity {property_statistics.selector_sparql} . MINUS {{ ?entity wdt:{property_statistics.grouping_configuration.property} _:b28. }} FILTER(EXISTS {{{self.get_filter_for_info()} }}) }} """
         return query
 
 
 class PropertyColumn(AbstractColumn):
     def __init__(self, property, title=None, value=None, qualifier=None):
         self.property = property
         self.title = title
         self.value = value
         self.qualifier = qualifier
 
     def __eq__(self, other):
         return (
             self.property == other.property
             and self.title == other.title
             and self.value == other.value
             and self.qualifier == other.qualifier
         )
 
     def get_title(self):
         return "/".join([x for x in [self.property, self.value, self.qualifier] if x])
 
     def get_key(self):
         return "".join([x for x in [self.property, self.value, self.qualifier] if x])
 
+    def get_type_name(self):
+        """Return the column type name that queries.html prints ("property")."""
+        return "property"
+
+    def format_html_snippet(self):
+        """Return the HTML-escaped property; queries.html renders it with `| safe`."""
+        return f'{html.escape(self.property)}'
+
     def make_column_header(self):
         if self.qualifier:
             property_link = self.qualifier
         else:
             property_link = self.property
 
         if self.title:
             label = f"[[Property:{property_link}|{self.title}]]"
         else:
             label = f"{{{{Property|{property_link}}}}}"
         return f'! data-sort-type="number"|{label}\n'
 
     def get_filter_for_info(self):
         if self.qualifier:
             property_value = f"wd:{self.value}" if self.value else "[]"
             return f""" ?entity p:{self.property} [ ps:{self.property} {property_value} ; pq:{self.qualifier} [] ]"""
         else:
             return f""" ?entity p:{self.property}[]"""
 
     def get_filter_for_positive_query(self):
         return f""" ?entity p:{self.property} ?prop . OPTIONAL {{ ?prop ps:{self.property} ?value }} SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }} """
 
     def get_filter_for_negative_query(self):
         return f""" {{?entity a wdno:{self.property} .}} UNION {{?entity wdt:{self.property} ?prop .}} }} SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }} """
 
 
 class TextColumn(AbstractColumn):
     def __init__(self, language, title=None):
         self.language = language
         self.title = title
 
     def __eq__(self, other):
         return self.language == other.language and self.title == other.title
 
     def get_title(self):
         return self.get_key()
 
+    def format_html_snippet(self):
+        """Return "<escaped language> <type name>"; queries.html renders it with `| safe`."""
+        return f"{html.escape(self.language)} {self.get_type_name()}"
+
     def make_column_header(self):
         if self.title:
             text = f"{self.title}"
         else:
             text = f"{{{{#language:{self.language}}}}}"
         return f'! data-sort-type="number"|{text}\n'
 
     def get_filter_for_info(self):
         return f""" ?entity {self.get_selector()} ?lang_label. FILTER((LANG(?lang_label)) = '{self.language}')."""
 
     def get_filter_for_positive_query(self):
         return f""" FILTER(EXISTS {{ ?entity {self.get_selector()} ?lang_label. FILTER((LANG(?lang_label)) = "{self.language}"). }}) SERVICE wikibase:label {{ bd:serviceParam wikibase:language "{self.language}". }} """
 
     def get_filter_for_negative_query(self):
         return f""" {{ ?entity {self.get_selector()} ?lang_label. FILTER((LANG(?lang_label)) = "{self.language}") }} }} SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }} """
 
 
 class LabelColumn(TextColumn):
     def get_key(self):
         return "L%s" % self.language
 
     def get_selector(self):
         return "rdfs:label"
 
+    def get_type_name(self):
+        """Return the column type name that queries.html prints ("label")."""
+        return "label"
+
 
 class DescriptionColumn(TextColumn):
     def get_key(self):
         return "D%s" % self.language
 
     def get_selector(self):
         return "schema:description"
 
+    def get_type_name(self):
+        """Return the column type name that queries.html prints ("description")."""
+        return "description"
+
 
 class SitelinkColumn(AbstractColumn):
     def __init__(self, project, title=None):
         current_dir = os.path.dirname(__file__)
         wikiprojects_path = os.path.join(current_dir, "wikiprojects.json")
         wikiprojects = json.load(open(wikiprojects_path, "r"))
         self.project = project
         self.url = wikiprojects[project]["url"]
         self.item = wikiprojects[project]["item"]
         self.title = title
 
     def __eq__(self, other):
         return self.url == other.url and self.title == other.title
 
     def get_key(self):
         return self.project
 
     def get_title(self):
         return self.get_key()
 
+    def get_type_name(self):
+        """Return the column type name that queries.html prints ("sitelink")."""
+        return "sitelink"
+
+    def format_html_snippet(self):
+        """Return "<escaped project key> <type name>"; queries.html renders it with `| safe`."""
+        return f'{html.escape(self.get_title())} {self.get_type_name()}'
+
     def make_column_header(self):
         if self.title:
             label = f"[[{self.item}|{self.title}]]"
         else:
             label = f"{{{{Q|{self.item}}}}}"
         return f'! data-sort-type="number"|{label}\n'
 
     def get_filter_for_info(self):
         return f""" ?sitelink schema:about ?entity; schema:isPartOf <{self.url}>."""
 
     def get_filter_for_positive_query(self):
         return f""" ?sitelink schema:about ?entity; schema:isPartOf <{self.url}>; schema:name ?value. SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }} """
 
     def get_filter_for_negative_query(self):
         return f""" ?sitelink schema:about ?entity; schema:isPartOf <{self.url}>. }} SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }} """
diff --git a/integraality/templates/queries.html b/integraality/templates/queries.html
index 0d60bff..6347f75 100644
--- a/integraality/templates/queries.html
+++ b/integraality/templates/queries.html
@@ -1,31 +1,16 @@
 {% extends "base.html" %}
-{% if column.startswith("P") -%}
-  {% set name = 'property' %}
-{%- elif column.startswith("L") -%}
-  {% set name = 'label' %}
-{%- elif column.startswith("D") -%}
-  {% set name = 'description' %}
-{%- else -%}
-  {% set name = 'sitelink' %}
-{%- endif -%}
 {% block content %}
-

From page {{ page_title }}, {% if column.startswith("P") -%} - {{ column }} - {%- elif column.startswith("L") -%} - {{ column[1:] }} label - {%- elif column.startswith("D") -%} - {{ column[1:] }} description - {%- endif %}, {% if grouping == 'None' -%} +

From page {{ page_title }}, {{ column.format_html_snippet() | safe }}, {% if grouping == 'None' -%} without {{ formatted_predicate | safe }} grouping {%- elif grouping == 'UNKNOWN_VALUE' -%} with unknown value as {{ formatted_predicate | safe }} {%- elif grouping -%} with {{ grouping }} as {{ formatted_predicate | safe }} {%- else -%} for the totals {%- endif %}.

- All items with the {{ name }} set - All items without the {{ name }} set + All items with the {{ column.get_type_name() }} set + All items without the {{ column.get_type_name() }} set
{% endblock %}