diff --git a/app.py b/app.py
index ade4e17..ea1996b 100644
--- a/app.py
+++ b/app.py
@@ -1,661 +1,666 @@
# -*- coding: utf-8 -*-
import bs4 # type: ignore
import cachetools
import datetime
import flask
import humanize
import mwapi # type: ignore
import mwoauth # type: ignore
import os
import random
import re
import requests_oauthlib # type: ignore
import string
import threading
import toolforge
import traceback
from typing import Any, List, Optional, Tuple, Type, Union
import werkzeug.wsgi
import yaml
from batch import StoredBatch, OpenBatch
-from command import Command, CommandRecord, CommandPlan, CommandPending, CommandEdit, CommandNoop, CommandFailure, CommandPageMissing, CommandPageProtected, CommandEditConflict, CommandMaxlagExceeded, CommandBlocked, CommandWikiReadOnly
+from command import Command, CommandRecord, CommandPlan, CommandPending, CommandEdit, CommandNoop, CommandFailure, CommandPageMissing, CommandTitleInvalid, CommandPageProtected, CommandEditConflict, CommandMaxlagExceeded, CommandBlocked, CommandWikiReadOnly
from localuser import LocalUser
import parse_wikitext
import parse_tpsv
from querytime import QueryTimingCursor, flush_querytime, slow_queries, query_summary
from runner import Runner
from store import BatchStore
from timestamp import now
app = flask.Flask(__name__)
user_agent = toolforge.set_user_agent('quickcategories', email='mail@lucaswerkmeister.de')
__dir__ = os.path.dirname(__file__)
try:
with open(os.path.join(__dir__, 'config.yaml')) as config_file:
app.config.update(yaml.safe_load(config_file))
except FileNotFoundError:
print('config.yaml file not found, assuming local development setup')
app.secret_key = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(64))
if 'oauth' in app.config:
consumer_token = mwoauth.ConsumerToken(app.config['oauth']['consumer_key'], app.config['oauth']['consumer_secret'])
if 'database' in app.config:
from database import DatabaseStore
app.config['database']['cursorclass'] = QueryTimingCursor
batch_store = DatabaseStore(app.config['database']) # type: BatchStore
def sometimes_flush_querytime():
if random.randrange(128) == 0:
with batch_store.connect() as connection:
flush_querytime(connection)
class SometimesFlushQuerytimeMiddleware:
def __init__(self, app):
self.app = app
def __call__(self, environ, start_response):
return werkzeug.wsgi.ClosingIterator(self.app(environ, start_response), sometimes_flush_querytime)
app.wsgi_app = SometimesFlushQuerytimeMiddleware(app.wsgi_app) # type: ignore # “cannot assign to a method”
else:
from in_memory import InMemoryStore
print('No database configuration, using in-memory store (batches will be lost on every restart)')
batch_store = InMemoryStore()
stewards_global_user_ids_cache = cachetools.TTLCache(maxsize=1, ttl=24*60*60) # type: cachetools.TTLCache[Any, List[int]]
stewards_global_user_ids_cache_lock = threading.RLock()
def log(type, message):
if app.config.get('DEBUG_' + type, False):
print('[%s] %s' % (type, message))
@app.template_global()
def csrf_token() -> str:
if 'csrf_token' not in flask.session:
log('CSRF', 'allocating a new token')
flask.session['csrf_token'] = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(64))
else:
log('CSRF', 'reusing token from session')
return flask.session['csrf_token']
@app.template_global()
def form_value(name: str) -> flask.Markup:
if 'repeat_form' in flask.g and name in flask.request.form:
return (flask.Markup(r' value="') +
flask.Markup.escape(flask.request.form[name]) +
flask.Markup(r'" '))
else:
return flask.Markup()
@app.template_global()
def form_attributes(name: str) -> flask.Markup:
return (flask.Markup(r' id="') +
flask.Markup.escape(name) +
flask.Markup(r'" name="') +
flask.Markup.escape(name) +
flask.Markup(r'" ') +
form_value(name))
@app.template_filter()
def user_link(user_name: str) -> flask.Markup:
return (flask.Markup(r'') +
flask.Markup(r'') +
flask.Markup.escape(user_name) +
flask.Markup(r'') +
flask.Markup(r''))
@app.template_global()
def user_logged_in() -> bool:
return authenticated_session() is not None
@app.template_global()
def authentication_area() -> flask.Markup:
if 'oauth' not in app.config:
return flask.Markup()
if 'oauth_access_token' not in flask.session:
return (flask.Markup(r'Log in'))
access_token = mwoauth.AccessToken(**flask.session['oauth_access_token'])
identity = mwoauth.identify('https://meta.wikimedia.org/w/index.php',
consumer_token,
access_token)
return (flask.Markup(r'Logged in as ') +
user_link(identity['username']) +
flask.Markup(r''))
@app.template_global()
def can_run_commands(command_records: List[CommandRecord]) -> bool:
return flask.g.can_run_commands and any(filter(lambda command_record: isinstance(command_record, CommandPlan), command_records))
@app.template_global()
def can_start_background() -> bool:
return flask.g.can_start_background
@app.template_global()
def can_stop_background() -> bool:
return flask.g.can_stop_background
@app.template_global() # TODO make domain part of Command and turn this into a template filter?
def render_command(command: Command, domain: str) -> flask.Markup:
return flask.Markup(flask.render_template('command.html',
domain=domain,
command=command))
@app.template_global() # TODO also turn into a template filter?
def render_command_record(command_record: CommandRecord, domain: str) -> flask.Markup:
if isinstance(command_record, CommandPlan):
command_record_markup = flask.render_template('command_plan.html',
domain=domain,
command_plan=command_record)
elif isinstance(command_record, CommandPending):
command_record_markup = flask.render_template('command_pending.html',
domain=domain,
command_pending=command_record)
elif isinstance(command_record, CommandEdit):
command_record_markup = flask.render_template('command_edit.html',
domain=domain,
command_edit=command_record)
elif isinstance(command_record, CommandNoop):
command_record_markup = flask.render_template('command_noop.html',
domain=domain,
command_noop=command_record)
elif isinstance(command_record, CommandPageMissing):
command_record_markup = flask.render_template('command_page_missing.html',
domain=domain,
command_page_missing=command_record)
+ elif isinstance(command_record, CommandTitleInvalid):
+ command_record_markup = flask.render_template('command_title_invalid.html',
+ domain=domain,
+ command_title_invalid=command_record)
elif isinstance(command_record, CommandPageProtected):
command_record_markup = flask.render_template('command_page_protected.html',
domain=domain,
command_page_protected=command_record)
elif isinstance(command_record, CommandEditConflict):
command_record_markup = flask.render_template('command_edit_conflict.html',
domain=domain,
command_edit_conflict=command_record)
elif isinstance(command_record, CommandMaxlagExceeded):
command_record_markup = flask.render_template('command_maxlag_exceeded.html',
domain=domain,
command_maxlag_exceeded=command_record)
elif isinstance(command_record, CommandBlocked):
command_record_markup = flask.render_template('command_blocked.html',
domain=domain,
command_blocked=command_record)
elif isinstance(command_record, CommandWikiReadOnly):
command_record_markup = flask.render_template('command_wiki_read_only.html',
domain=domain,
command_blocked=command_record)
else:
raise ValueError('Unknown command record type')
return flask.Markup(command_record_markup)
@app.template_filter()
def render_command_record_type(command_record_type: Type[CommandRecord]) -> flask.Markup:
template_names = {
CommandPlan: 'command_plan_badge.html',
CommandPending: 'command_pending_badge.html',
CommandEdit: 'command_edit_badge.html',
CommandNoop: 'command_noop_badge.html',
CommandPageMissing: 'command_page_missing_badge.html',
+ CommandTitleInvalid: 'command_title_invalid_badge.html',
CommandPageProtected: 'command_page_protected_badge.html',
CommandEditConflict: 'command_edit_conflict_badge.html',
CommandMaxlagExceeded: 'command_maxlag_exceeded_badge.html',
CommandBlocked: 'command_blocked_badge.html',
CommandWikiReadOnly: 'command_wiki_read_only_badge.html',
}
template_name = template_names[command_record_type]
return flask.Markup(flask.render_template(template_name))
@app.template_filter()
def render_datetime(dt: datetime.datetime) -> flask.Markup:
naive_dt = dt.astimezone().replace(tzinfo=None) # humanize doesn’t support timezones :(
return (flask.Markup(r''))
@app.template_global()
def render_local_user(local_user: LocalUser) -> flask.Markup:
return (flask.Markup(r'') +
flask.Markup.escape(local_user.user_name) +
flask.Markup(r''))
@app.template_global()
def render_batch_title(batch: StoredBatch) -> Optional[flask.Markup]:
if not batch.title:
return None
return parse_wikitext.parse_summary(anonymous_session(batch.domain), batch.title)
@app.template_filter()
def html_text(html: Union[str, flask.Markup]) -> flask.Markup:
soup = bs4.BeautifulSoup(html, 'html.parser')
text = soup.text
return flask.Markup.escape(text)
@app.template_global()
def render_batch_title_text(batch: StoredBatch) -> Optional[flask.Markup]:
title_html = render_batch_title(batch)
if not title_html:
return None
return html_text(title_html)
def authenticated_session(domain: str = 'meta.wikimedia.org') -> Optional[mwapi.Session]:
if 'oauth_access_token' in flask.session:
access_token = mwoauth.AccessToken(**flask.session['oauth_access_token'])
auth = requests_oauthlib.OAuth1(client_key=consumer_token.key, client_secret=consumer_token.secret,
resource_owner_key=access_token.key, resource_owner_secret=access_token.secret)
return mwapi.Session(host='https://'+domain, auth=auth, user_agent=user_agent)
else:
return None
def anonymous_session(domain: str = 'meta.wikimedia.org') -> mwapi.Session:
return mwapi.Session(host='https://'+domain, user_agent=user_agent)
def any_session(domain: str = 'meta.wikimedia.org') -> mwapi.Session:
return authenticated_session(domain) or anonymous_session(domain)
@app.route('/')
def index():
return flask.render_template('index.html', batches=batch_store.get_batches_slice(offset=0, limit=10))
@app.route('/batch', methods=['POST'])
def new_batch():
if not submitted_request_valid():
return 'CSRF error', 400
domain = flask.request.form.get('domain', '(not provided)')
if not is_wikimedia_domain(domain):
return flask.Markup.escape(domain) + flask.Markup(' is not recognized as a Wikimedia domain'), 400
title = flask.request.form.get('title')
if title is not None and len(title) > 800:
return flask.Markup('Title "') + flask.Markup.escape(title) + flask.Markup('" is too long for a summary'), 400
session = authenticated_session(domain)
if not session:
return 'not logged in', 403 # Forbidden; 401 Unauthorized would be inappropriate because we don’t send WWW-Authenticate
try:
response = session.get(action='query',
meta='siteinfo')
servername = response['query']['general']['servername']
if servername != domain:
message = (flask.Markup(r'The server at ') +
flask.Markup.escape(domain) +
flask.Markup(r'
indicates its actual name is ') +
flask.Markup.escape(servername) +
flask.Markup(r'
.'))
return flask.render_template('new_batch_error.html',
message=message), 400
except Exception:
traceback.print_exc() # for possible later manual inspection
message = (flask.Markup(r'Could not connect to the server at ') +
flask.Markup.escape(domain) +
flask.Markup(r'
.'))
return flask.render_template('new_batch_error.html',
message=message), 400
try:
batch = parse_tpsv.parse_batch(flask.request.form.get('commands', ''), title=title)
except parse_tpsv.ParseBatchError as e:
return str(e)
batch.cleanup()
id = batch_store.store_batch(batch, session).id
return flask.redirect(flask.url_for('batch', id=id))
@app.route('/batch/')
def batches():
offset, limit = slice_from_args(flask.request.args)
return flask.render_template('batches.html',
batches=batch_store.get_batches_slice(offset=offset, limit=limit),
offset=offset,
limit=limit,
count=batch_store.get_batches_count())
@app.route('/batch//')
def batch(id: int):
batch = batch_store.get_batch(id)
if batch is None:
return flask.render_template('batch_not_found.html',
id=id), 404
session = authenticated_session(batch.domain)
if session:
try:
userinfo = session.get(action='query',
meta='userinfo',
uiprop=['groups', 'centralids'])['query']['userinfo']
except mwapi.errors.APIError as e:
if e.code == 'mwoauth-invalid-authorization-invalid-user':
# user is viewing a batch for a wiki where they do not have a local user account
# treat as anonymous on the local wiki, but query Meta to find out if they’re a steward
local_user_id = None # type: Optional[int]
groups = [] # type: List[str]
meta_session = authenticated_session('meta.wikimedia.org') # type: mwapi.Session
meta_userinfo = meta_session.get(action='query',
meta='userinfo',
uiprop=['centralids'])['query']['userinfo']
global_user_id = meta_userinfo['centralids']['CentralAuth']
else:
raise e
else:
local_user_id = userinfo['id']
groups = userinfo['groups']
global_user_id = userinfo['centralids']['CentralAuth']
flask.g.can_run_commands = local_user_id == batch.local_user.local_user_id
flask.g.can_start_background = flask.g.can_run_commands and \
'autoconfirmed' in groups and \
isinstance(batch, OpenBatch)
flask.g.can_stop_background = flask.g.can_start_background or \
'sysop' in groups or \
global_user_id in steward_global_user_ids()
else:
flask.g.can_run_commands = False
flask.g.can_start_background = False
flask.g.can_stop_background = False
offset, limit = slice_from_args(flask.request.args)
return flask.render_template('batch.html',
batch=batch,
offset=offset,
limit=limit)
@app.route('/batch//background_history')
def batch_background_history(id: int):
batch = batch_store.get_batch(id)
if batch is None:
return flask.render_template('batch_not_found.html',
id=id), 404
return flask.render_template('background_history.html',
batch=batch)
@app.route('/batch//run_slice', methods=['POST'])
def run_batch_slice(id: int):
if not submitted_request_valid():
return 'CSRF error', 400
batch = batch_store.get_batch(id)
if batch is None:
return flask.render_template('batch_not_found.html',
id=id), 404
session = authenticated_session(batch.domain)
if not session:
return 'not logged in', 403
local_user_id = session.get(action='query',
meta='userinfo')['query']['userinfo']['id']
if local_user_id != batch.local_user.local_user_id:
return 'may not run this batch', 403
if 'summary_batch_link' in app.config:
summary_batch_link = app.config['summary_batch_link'].format(id)
else:
summary_batch_link = None
runner = Runner(session, batch.title, summary_batch_link)
offset, limit = slice_from_args(flask.request.form)
command_pendings = batch.command_records.make_plans_pending(offset, limit)
try:
runner.prepare_pages([command_pending.command.page for command_pending in command_pendings])
for command_pending in command_pendings:
for attempt in range(5):
command_finish = runner.run_command(command_pending)
if isinstance(command_finish, CommandFailure) and command_finish.can_retry_immediately():
continue
else:
break
batch.command_records.store_finish(command_finish)
if isinstance(command_finish, CommandFailure):
can_continue = command_finish.can_continue_batch()
if isinstance(can_continue, datetime.datetime):
batch_store.suspend_background(batch, until=can_continue)
break
elif not can_continue:
batch_store.stop_background(batch)
break
finally:
batch.command_records.make_pendings_planned([command_pending.id for command_pending in command_pendings])
return flask.redirect(flask.url_for('batch',
id=id,
offset=offset,
limit=limit))
@app.route('/batch//start_background', methods=['POST'])
def start_batch_background(id: int):
if not submitted_request_valid():
return 'CSRF error', 400
batch = batch_store.get_batch(id)
if batch is None:
return flask.render_template('batch_not_found.html',
id=id), 404
if not isinstance(batch, OpenBatch):
return 'not an open batch', 400
session = authenticated_session(batch.domain)
if not session:
return 'not logged in', 403
userinfo = session.get(action='query',
meta='userinfo',
uiprop=['groups'])['query']['userinfo']
local_user_id = userinfo['id']
if local_user_id != batch.local_user.local_user_id or \
'autoconfirmed' not in userinfo['groups']:
return 'may not start this batch in background', 403
batch_store.start_background(batch, session)
offset, limit = slice_from_args(flask.request.form)
return flask.redirect(flask.url_for('batch',
id=id,
offset=offset,
limit=limit))
@app.route('/batch//stop_background', methods=['POST'])
def stop_batch_background(id: int):
if not submitted_request_valid():
return 'CSRF error', 400
batch = batch_store.get_batch(id)
if batch is None:
return flask.render_template('batch_not_found.html',
id=id), 404
session = authenticated_session(batch.domain)
if not session:
return 'not logged in', 403
userinfo = session.get(action='query',
meta='userinfo',
uiprop=['groups', 'centralids'])['query']['userinfo']
local_user_id = userinfo['id']
global_user_id = userinfo['centralids']['CentralAuth']
if local_user_id != batch.local_user.local_user_id and \
'sysop' not in userinfo['groups'] and \
global_user_id not in steward_global_user_ids():
return 'may not stop this batch in background', 403
batch_store.stop_background(batch, session)
offset, limit = slice_from_args(flask.request.form)
return flask.redirect(flask.url_for('batch',
id=id,
offset=offset,
limit=limit))
@app.route('/login')
def login():
redirect, request_token = mwoauth.initiate('https://meta.wikimedia.org/w/index.php', consumer_token, user_agent=user_agent)
flask.session['oauth_request_token'] = dict(zip(request_token._fields, request_token))
return flask.redirect(redirect)
@app.route('/oauth/callback')
def oauth_callback():
request_token = mwoauth.RequestToken(**flask.session.pop('oauth_request_token'))
access_token = mwoauth.complete('https://meta.wikimedia.org/w/index.php', consumer_token, request_token, flask.request.query_string, user_agent=user_agent)
flask.session['oauth_access_token'] = dict(zip(access_token._fields, access_token))
return flask.redirect(flask.url_for('index'))
@app.route('/debug/query_times')
def query_times():
session = authenticated_session()
if not session:
return 'not logged in', 403
allowed_global_user_ids = [
46054761,
]
userinfo = session.get(action='query',
meta='userinfo',
uiprop=['centralids'])['query']['userinfo']
if userinfo['centralids']['CentralAuth'] not in allowed_global_user_ids:
return 'not allowed', 403
if not isinstance(batch_store, DatabaseStore):
return '', 204 # no content
until = now()
since = until - datetime.timedelta(days=7)
leading_spaces = re.compile(r'^\s+', re.MULTILINE)
with batch_store.connect() as connection:
flush_querytime(connection)
slowest_queries = [(t, duration, re.sub(leading_spaces, '', sql))
for t, duration, sql in slow_queries(connection, since, until)]
summary = query_summary(connection, since, until)
for index, (sql, stats) in enumerate(summary):
sql = re.sub(leading_spaces, '', sql)
summary[index] = (sql, stats)
return flask.render_template('query_times.html',
since=since,
until=until,
slowest_queries=slowest_queries,
summary=summary)
def is_wikimedia_domain(domain: str) -> bool:
return re.fullmatch(r'[a-z0-9-]+\.(?:wiki(?:pedia|media|books|data|news|quote|source|versity|voyage)|mediawiki|wiktionary)\.org', domain) is not None
def slice_from_args(args: dict) -> Tuple[int, int]:
try:
offset = int(args['offset'])
except (KeyError, ValueError):
offset = 0
offset = max(0, offset)
try:
limit = int(args['limit'])
except (KeyError, ValueError):
limit = 50
limit = max(1, min(500, limit))
return offset, limit
@cachetools.cached(cache=stewards_global_user_ids_cache,
key=lambda: '#stewards',
lock=stewards_global_user_ids_cache_lock)
def steward_global_user_ids() -> List[int]:
session = mwapi.Session(host='https://meta.wikimedia.org', user_agent=user_agent)
ids = []
for result in session.get(action='query',
list='allusers',
augroup=['steward'],
auprop=['centralids'],
aulimit='max',
continuation=True):
for user in result['query']['allusers']:
ids.append(user['centralids']['CentralAuth'])
return ids
def full_url(endpoint: str, **kwargs) -> str:
scheme = flask.request.headers.get('X-Forwarded-Proto', 'http')
return flask.url_for(endpoint, _external=True, _scheme=scheme, **kwargs)
@app.template_global()
def current_url(external: bool = False) -> str:
all_args = {}
for args in [flask.request.args, flask.request.view_args]:
all_args.update(args)
# in Python 3.5+, replace that with **flask.request.args, **flask.request.view_args
if external:
return flask.url_for(flask.request.endpoint,
_external=True,
_scheme=flask.request.headers.get('X-Forwarded-Proto', 'http'),
**all_args)
else:
return flask.url_for(flask.request.endpoint,
**all_args)
def submitted_request_valid() -> bool:
"""Check whether a submitted POST request is valid.
If this method returns False, the request might have been issued
by an attacker as part of a Cross-Site Request Forgery attack;
callers MUST NOT process the request in that case.
"""
real_token = flask.session.pop('csrf_token', None)
log('CSRF', 'invalidated token from session')
submitted_token = flask.request.form.get('csrf_token', None)
if not real_token:
# we never expected a POST
log('CSRF', 'no real token')
return False
if not submitted_token:
# token got lost or attacker did not supply it
log('CSRF', 'no submitted token')
return False
if submitted_token != real_token:
# incorrect token (could be outdated or incorrectly forged)
log('CSRF', 'token mismatch')
return False
if not (flask.request.referrer or '').startswith(full_url('index')):
# correct token but not coming from the correct page; for
# example, JS running on https://tools.wmflabs.org/tool-a is
# allowed to access https://tools.wmflabs.org/tool-b and
# extract CSRF tokens from it (since both of these pages are
# hosted on the https://tools.wmflabs.org domain), so checking
# the Referer header is our only protection against attackers
# from other Toolforge tools
log('CSRF', 'referrer mismatch: should start with %s, got %s' % (full_url('index'), flask.request.referrer))
return False
return True
@app.after_request
def deny_frame(response: flask.Response) -> flask.Response:
"""Disallow embedding the tool’s pages in other websites.
If other websites can embed this tool’s pages, e. g. in