harvest_template.py
JAnD (Jan Dudík)
Actions

Authored By

	JAnD
	Mar 15 2016, 8:00 AM

Size

10 KB

Referenced Files

None

Subscribers

None

harvest_template.py
View Options

	#!/usr/bin/python
	# -- coding: utf-8 --
	r"""
	Template harvesting script.

	Usage:

	* python pwb.py harvest_template -transcludes:"..." \
	template_parameter PID [template_parameter PID]
	* python pwb.py harvest_template [generators] -template:"..." \
	template_parameter PID [template_parameter PID]

	This will work on all pages that transclude the template in the article
	namespace

	These command line parameters can be used to specify which pages to work on:

	&params;

	Examples:

	python pwb.py harvest_template -lang:nl -cat:Sisoridae -namespace:0 \
	-template:"Taxobox straalvinnige" orde P70 familie P71 geslacht P74

	"""
	#
	# (C) Multichill, Amir, 2013
	# (C) Pywikibot team, 2013-2014
	#
	# Distributed under the terms of MIT License.
	#
	from __future__ import absolute_import, unicode_literals

	__version__ = '$Id: f6b5dcd584a6de3ef17df5a681d4706d97293b54 $'
	#

	import re
	import signal

	willstop = False


	def _signal_handler(signal, frame):
	global willstop
	if not willstop:
	willstop = True
	print('Received ctrl-c. Finishing current item; '
	'press ctrl-c again to abort.') # noqa
	else:
	raise KeyboardInterrupt

	signal.signal(signal.SIGINT, _signal_handler)

	import pywikibot
	from pywikibot import pagegenerators as pg, textlib, WikidataBot

	docuReplacements = {'&params;': pywikibot.pagegenerators.parameterHelp}


	class HarvestRobot(WikidataBot):

	"""A bot to add Wikidata claims."""

	def __init__(self, generator, templateTitle, fields):
	"""
	Constructor.

	Arguments:
	* generator - A generator that yields Page objects.
	* templateTitle - The template to work on
	* fields - A dictionary of fields that are of use to us

	"""
	super(HarvestRobot, self).__init__()
	self.generator = pg.PreloadingGenerator(generator)
	self.templateTitle = templateTitle.replace(u'_', u' ')
	# TODO: Make it a list which also includes the redirects to the template
	self.fields = fields
	self.cacheSources()
	self.templateTitles = self.getTemplateSynonyms(self.templateTitle)

	def getTemplateSynonyms(self, title):
	"""Fetch redirects of the title, so we can check against them."""
	temp = pywikibot.Page(pywikibot.Site(), title, ns=10)
	if not temp.exists():
	pywikibot.error(u'Template %s does not exist.' % temp.title())
	exit()

	# Put some output here since it can take a while
	pywikibot.output('Finding redirects...')
	if temp.isRedirectPage():
	temp = temp.getRedirectTarget()
	titles = [page.title(withNamespace=False)
	for page in temp.getReferences(redirectsOnly=True, namespaces=[10],
	follow_redirects=False)]
	titles.append(temp.title(withNamespace=False))
	return titles

	def _template_link_target(self, item, link_text):
	linked_page = None

	link = pywikibot.Link(link_text)
	linked_page = pywikibot.Page(link)

	if not linked_page.exists():
	pywikibot.output('%s does not exist so it cannot be linked. '
	'Skipping.' % (linked_page))
	return

	if linked_page.isRedirectPage():
	linked_page = linked_page.getRedirectTarget()

	try:
	linked_item = pywikibot.ItemPage.fromPage(linked_page)
	except pywikibot.NoPage:
	linked_item = None

	if not linked_item or not linked_item.exists():
	pywikibot.output('%s does not have a wikidata item to link with. '
	'Skipping.' % (linked_page))
	return

	if linked_item.title() == item.title():
	pywikibot.output('%s links to itself. Skipping.' % (linked_page))
	return

	return linked_item

	def treat(self, page, item):
	"""Process a single page/item."""
	if willstop:
	raise KeyboardInterrupt
	self.current_page = page
	item.get()
	if set(self.fields.values()) <= set(item.claims.keys()):
	pywikibot.output('%s item %s has claims for all properties. '
	'Skipping.' % (page, item.title()))
	return

	pagetext = page.get()
	templates = textlib.extract_templates_and_params(pagetext)
	for (template, fielddict) in templates:
	# Clean up template
	try:
	template = pywikibot.Page(page.site, template,
	ns=10).title(withNamespace=False)
	except pywikibot.exceptions.InvalidTitle:
	pywikibot.error(
	"Failed parsing template; '%s' should be the template name."
	% template)
	continue
	# We found the template we were looking for
	if template in self.templateTitles:
	for field, value in fielddict.items():
	field = field.strip()
	value = value.strip()
	if not field or not value:
	continue

	# This field contains something useful for us
	if field in self.fields:
	# Check if the property isn't already set
	claim = pywikibot.Claim(self.repo, self.fields[field])
	if claim.getID() in item.get().get('claims'):
	pywikibot.output(
	'A claim for %s already exists. Skipping.'
	% claim.getID())
	# TODO: Implement smarter approach to merging
	# harvested values with existing claims esp.
	# without overwriting humans unintentionally.
	else:
	if claim.type == 'wikibase-item':
	# Try to extract a valid page
	match = re.search(pywikibot.link_regex, value)
	"""if not match:
	pywikibot.output(
	'%s field %s value %s is not a '
	'wikilink. Skipping.'
	% (claim.getID(), field, value))
	continue

	link_text = match.group(1)
	linked_item = self._template_link_target(item, link_text)
	if not linked_item:
	continue

	claim.setTarget(linked_item)"""
	if match != None:
	value = match.group(1)
	try:
	link = pywikibot.Link(value)
	linkedPage = pywikibot.Page(link)
	if linkedPage.isRedirectPage():
	linkedPage = linkedPage.getRedirectTarget()
	linkedItem = pywikibot.ItemPage.fromPage(linkedPage)
	claim.setTarget(linkedItem)
	if not linkedItem.title():
	print " " # this is only to raise NoPage
	except pywikibot.exceptions.NoPage:
	pywikibot.output('%s doesn\'t exist so I can\'t link to it' % (linkedPage,))
	continue

	elif claim.type == 'string':
	claim.setTarget(value.strip())
	elif claim.type == 'external-id':
	claim.setTarget(value.strip())
	elif claim.type == 'commonsMedia':
	commonssite = pywikibot.Site("commons", "commons")
	imagelink = pywikibot.Link(value, source=commonssite,
	defaultNamespace=6)
	image = pywikibot.FilePage(imagelink)
	if image.isRedirectPage():
	image = pywikibot.FilePage(image.getRedirectTarget())
	if not image.exists():
	pywikibot.output(
	'[[%s]] doesn\'t exist so I can\'t link to it'
	% (image.title(),))
	continue
	claim.setTarget(image)
	else:
	pywikibot.output(
	'%s is not a supported datatype.'
	% claim.type)
	continue

	pywikibot.output('Adding %s --> %s'
	% (claim.getID(), claim.getTarget()))
	item.addClaim(claim)
	# A generator might yield pages from multiple sites
	source = self.getSource(page.site)
	if source:
	claim.addSource(source, bot=True)


	def main(*args):
	"""
	Process command line arguments and invoke bot.

	If args is an empty list, sys.argv is used.

	@param args: command line arguments
	@type args: list of unicode
	"""
	commandline_arguments = list()
	template_title = u''

	# Process global args and prepare generator args parser
	local_args = pywikibot.handle_args(args)
	gen = pg.GeneratorFactory()

	for arg in local_args:
	if arg.startswith('-template'):
	if len(arg) == 9:
	template_title = pywikibot.input(
	u'Please enter the template to work on:')
	else:
	template_title = arg[10:]
	elif gen.handleArg(arg):
	if arg.startswith(u'-transcludes:'):
	template_title = arg[13:]
	else:
	commandline_arguments.append(arg)

	if not template_title:
	pywikibot.error('Please specify either -template or -transcludes argument')
	return

	if len(commandline_arguments) % 2:
	raise ValueError # or something.
	fields = dict()

	for i in range(0, len(commandline_arguments), 2):
	fields[commandline_arguments[i]] = commandline_arguments[i + 1]

	generator = gen.getCombinedGenerator()
	if not generator:
	gen.handleArg(u'-transcludes:' + template_title)
	generator = gen.getCombinedGenerator()

	bot = HarvestRobot(generator, template_title, fields)
	bot.run()

	if __name__ == "__main__":
	main()

File Metadata

Mime Type: text/x-python
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 3516363
Default Alt Text: harvest_template.py (10 KB)

harvest_template.pyJAnD (Jan Dudík)Actions

harvest_template.pyView Options

File Metadata

Event Timeline

harvest_template.py
JAnD (Jan Dudík)
Actions

harvest_template.py
View Options