Paste P8537: mwgrep for cloudelastic
Authored by EBernhardson on May 17 2019, 2:30 PM.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
usage: mwgrep [-h] [--max-results N] [--timeout N] [--user | --module]
[--title TITLE | --etitle REGEX] regex
Grep for Lua, CSS, JS and JSON code fragments on (per default) MediaWiki wiki pages
positional arguments:
regex regex to search for
optional arguments:
-h, --help show this help message and exit
--max-results N show at most this many results (default: 100)
--timeout N abort search after this many seconds (default: 30)
--user search NS_USER rather than NS_MEDIAWIKI
--module search NS_MODULE rather than NS_MEDIAWIKI
--title TITLE restrict search to pages with this exact title
--etitle REGEX restrict search to pages with this title pattern
mwgrep will grep the MediaWiki namespace across Wikimedia wikis. specify
--user to search the user namespace instead. See the lucene documentation
for org.apache.lucene.util.automaton.RegExp for supported syntax. The current
lucene version is available from `curl search.svc.eqiad.wmnet:9200`.
""" # noqa: E501
import sys
# Python 2 hack: make UTF-8 the default string encoding so that non-ASCII
# page titles can be formatted and printed without UnicodeEncodeError.
reload(sys)
sys.setdefaultencoding('utf-8')

import argparse
import bisect
import json

import requests
TIMEOUT = 30
# Port 8243 queries the chi cluster. The omega and psi clusters are reached
# through chi using elasticsearch cross-cluster search URI syntax (the
# `omega:*` and `psi:*` index patterns below).
BASE_URI = 'https://cloudelastic1001.wikimedia.org:8243/*,omega:*,psi:*/page/_search'
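#
# A hit returned through cross-cluster search carries the remote cluster name
# in its `_index` field, e.g. (hypothetical index names):
#
#   local chi hit:   {'_index': 'enwiki_content_1234567', ...}
#   remote psi hit:  {'_index': 'psi:commonswiki_file_1234567', ...}
#
# which is why the result loop below strips an optional `cluster:` prefix
# before deriving the wiki's db name.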
NS_MEDIAWIKI = 8
NS_USER = 2
NS_MODULE = 828
PREFIX_NS = {
    NS_MEDIAWIKI: 'MediaWiki:',
    NS_USER: 'User:',
    NS_MODULE: 'Module:',
}
ap = argparse.ArgumentParser(
    prog='mwgrep',
    description='Grep for Lua, CSS, JS and JSON code fragments in MediaWiki wiki pages',
    epilog='mwgrep will grep the MediaWiki namespace across Wikimedia wikis. '
           'Specify --user to search the user namespace instead.'
)
ap.add_argument('term', help='regex to search for')
ap.add_argument(
    '--max-results',
    metavar='N',
    type=int, default=100,
    help='show at most this many results (default: 100)'
)
ap.add_argument(
    '--timeout',
    metavar='N',
    # Appends the 's' (seconds) unit suffix elasticsearch expects; argparse
    # applies the conversion to the string default too, yielding '30s'.
    type='{0}s'.format,
    default=str(TIMEOUT),
    help='abort search after this many seconds (default: 30)'
)
ns_group = ap.add_mutually_exclusive_group()
ns_group.add_argument(
    '--user',
    action='store_const',
    const=NS_USER,
    default=NS_MEDIAWIKI,
    dest='ns',
    help='search NS_USER rather than NS_MEDIAWIKI'
)
ns_group.add_argument(
    '--module',
    action='store_const',
    const=NS_MODULE,
    default=NS_MEDIAWIKI,
    dest='ns',
    help='search NS_MODULE rather than NS_MEDIAWIKI'
)
title_group = ap.add_mutually_exclusive_group()
title_group.add_argument(
    '--title',
    help='restrict search to pages with this exact title (sans namespace)'
)
title_group.add_argument(
    '--etitle',
    help='restrict search to pages with this title pattern (sans namespace)'
)
args = ap.parse_args()
filters = [
    {'term': {'namespace': str(args.ns)}},
    # source_regex comes from the wikimedia search-extra elasticsearch
    # plugin; the trigram ngram_field lets it prefilter candidate documents
    # before running the regex, and max_determinized_states bounds the size
    # of the compiled automaton.
    {'source_regex': {
        'regex': args.term,
        'field': 'source_text',
        'ngram_field': 'source_text.trigram',
        'max_determinized_states': 20000,
        'max_expand': 10,
        'case_sensitive': True,
        'locale': 'en',
    }},
]
if args.title is not None:
    filters.append({'term': {'title.keyword': args.title}})
elif args.etitle is not None:
    filters.append({'regexp': {'title.keyword': args.etitle}})
elif args.ns == NS_USER or args.ns == NS_MEDIAWIKI:
    # Without an explicit title filter, restrict NS_USER/NS_MEDIAWIKI
    # searches to gadget definitions and .js/.css/.json pages.
    filters.append({'regexp': {'title.keyword': '(Gadgets-definition|.*\\.(js|css|json))'}})
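#
# For example (hypothetical titles), the default title filter above keeps
# 'Common.js', 'Gadgets-definition' and 'Vector.css', but drops plain
# message pages such as 'Mainpage'.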
search = {
    'size': args.max_results,
    '_source': ['namespace', 'title'],
    'sort': ['_doc'],  # index order; the cheapest sort for a filter-only query
    'query': {'bool': {'filter': filters}},
    'stats': ['mwgrep'],
}
query = {
    'timeout': args.timeout,
}
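#
# For debugging by hand, the request below is roughly equivalent to (with
# $SEARCH_BODY standing in for the `search` dict serialized as JSON):
#
#   curl -XGET 'https://cloudelastic1001.wikimedia.org:8243/*,omega:*,psi:*/page/_search?timeout=30s' \
#       -H 'Content-Type: application/json' -d "$SEARCH_BODY"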
matches = {'public': []}
try:
    resp = requests.get(BASE_URI, params=query, json=search)
    try:
        full_result = resp.json()
        if resp.status_code >= 400:
            error_body = full_result
            if 'error' in error_body and 'root_cause' in error_body['error']:
                for root_cause in error_body['error']['root_cause']:
                    if root_cause['type'] == 'invalid_regex_exception':
                        sys.stderr.write(
                            'Error while parsing regular expression: {0}\n{1}\n'.format(
                                args.term, root_cause['reason']))
                        exit(1)
                sys.stderr.write('Unknown error: {0}\n'.format(
                    json.dumps(error_body, indent=4)))
                exit(1)
            else:
                sys.stderr.write(
                    'Received unexpected json body from elasticsearch:\n{0}\n'.format(
                        json.dumps(error_body, indent=4, separators=(',', ': '))))
                exit(1)
    except ValueError as e:
        sys.stderr.write(
            "Error '{0}' while parsing elasticsearch response '{1}'.\n".format(
                e.message, resp.text))
        exit(1)
    result = full_result['hits']
    for hit in result['hits']:
        index_name = hit['_index']
        if ':' in index_name:
            # strip the cross-cluster identifier, e.g. 'omega:' or 'psi:'
            _, index_name = index_name.split(':', 1)
        # e.g. 'enwiki_content_1234567' -> 'enwiki'
        db_name = index_name.rsplit('_', 2)[0]
        title = hit['_source']['title']
        page_name = '%s%s' % (PREFIX_NS[args.ns], title)
        bisect.insort(matches['public'], (db_name, page_name))
    if matches['public']:
        for db_name, page_name in matches['public']:
            print('{:<20}{}'.format(db_name, page_name))
    total = result['total']
    hits = len(result['hits'])
    print('')
    print('(total: %s, shown: %s)' % (total, hits))
    if full_result['timed_out']:
        print("""
The query was unable to complete within the allotted time. Only partial results
are shown here, and the reported total hits is <= the true value. To speed up
the query:
* Ensure the regular expression contains one or more sets of 3 contiguous
  characters. A character range ([a-z]) won't be expanded to count as
  contiguous if it matches more than 10 characters.
* Use a simpler regular expression. Consider breaking the query up into
  multiple queries where possible.
""")
except requests.exceptions.RequestException as error:
    sys.stderr.write('Failed to connect to elasticsearch: {0}\n'.format(error))
    exit(1)
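
# Sample output (hypothetical results), per the '{:<20}{}' template above:
# a left-aligned db name padded to 20 columns, then the page name.
#
#   enwiki              MediaWiki:Common.js
#   dewiki              MediaWiki:Gadgets-definition
#
#   (total: 2, shown: 2)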