Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F3636160
harvest_template.py
JAnD (Jan Dudík)
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Authored By
JAnD
Mar 15 2016, 8:00 AM
2016-03-15 08:00:19 (UTC+0)
Size
10 KB
Referenced Files
None
Subscribers
None
harvest_template.py
View Options
#!/usr/bin/python
# -*- coding: utf-8 -*-
r"""
Template harvesting script.
Usage:
* python pwb.py harvest_template -transcludes:"..." \
template_parameter PID [template_parameter PID]
* python pwb.py harvest_template [generators] -template:"..." \
template_parameter PID [template_parameter PID]
This will work on all pages that transclude the template in the article
namespace
These command line parameters can be used to specify which pages to work on:
¶ms;
Examples:
python pwb.py harvest_template -lang:nl -cat:Sisoridae -namespace:0 \
-template:"Taxobox straalvinnige" orde P70 familie P71 geslacht P74
"""
#
# (C) Multichill, Amir, 2013
# (C) Pywikibot team, 2013-2014
#
# Distributed under the terms of MIT License.
#
from
__future__
import
absolute_import
,
unicode_literals
__version__
=
'$Id: f6b5dcd584a6de3ef17df5a681d4706d97293b54 $'
#
import
re
import
signal
willstop
=
False
def
_signal_handler
(
signal
,
frame
):
global
willstop
if
not
willstop
:
willstop
=
True
print
(
'Received ctrl-c. Finishing current item; '
'press ctrl-c again to abort.'
)
# noqa
else
:
raise
KeyboardInterrupt
signal
.
signal
(
signal
.
SIGINT
,
_signal_handler
)
import
pywikibot
from
pywikibot
import
pagegenerators
as
pg
,
textlib
,
WikidataBot
docuReplacements
=
{
'¶ms;'
:
pywikibot
.
pagegenerators
.
parameterHelp
}
class HarvestRobot(WikidataBot):

    """A bot to add Wikidata claims harvested from template parameters."""

    def __init__(self, generator, templateTitle, fields):
        """
        Constructor.

        Arguments:
            * generator     - A generator that yields Page objects.
            * templateTitle - The template to work on
            * fields        - A dictionary of fields that are of use to us
                              (template parameter name -> property ID)

        """
        super(HarvestRobot, self).__init__()
        self.generator = pg.PreloadingGenerator(generator)
        # Underscores and spaces are interchangeable in the given title.
        self.templateTitle = templateTitle.replace(u'_', u' ')
        # TODO: Make it a list which also includes the redirects to the
        # template
        self.fields = fields
        self.cacheSources()
        self.templateTitles = self.getTemplateSynonyms(self.templateTitle)

    def getTemplateSynonyms(self, title):
        """
        Fetch redirects of the title, so we can check against them.

        @param title: template title, without namespace prefix
        @return: titles (without namespace) of every template-namespace
            redirect to the template, followed by the template's own title
        @rtype: list
        @raise SystemExit: if the template page does not exist
        """
        temp = pywikibot.Page(pywikibot.Site(), title, ns=10)
        if not temp.exists():
            pywikibot.error(u'Template %s does not exist.' % temp.title())
            # Same effect as the former exit() call, but does not depend on
            # the builtin that the site module injects.
            raise SystemExit

        # Put some output here since it can take a while
        pywikibot.output('Finding redirects...')
        if temp.isRedirectPage():
            temp = temp.getRedirectTarget()
        titles = [page.title(withNamespace=False)
                  for page in temp.getReferences(redirectsOnly=True,
                                                 namespaces=[10],
                                                 follow_redirects=False)]
        titles.append(temp.title(withNamespace=False))
        return titles

    def _template_link_target(self, item, link_text):
        """
        Resolve link text to the Wikidata item of the linked page.

        Not called by treat() in this version of the script.

        @param item: item currently being processed; used to reject links
            that resolve back to the same item
        @param link_text: title text taken from a wikilink
        @return: the linked ItemPage, or None if the page does not exist,
            has no item, or its item is C{item} itself
        """
        link = pywikibot.Link(link_text)
        linked_page = pywikibot.Page(link)

        if not linked_page.exists():
            pywikibot.output('%s does not exist so it cannot be linked. '
                             'Skipping.' % (linked_page))
            return

        if linked_page.isRedirectPage():
            linked_page = linked_page.getRedirectTarget()

        try:
            linked_item = pywikibot.ItemPage.fromPage(linked_page)
        except pywikibot.NoPage:
            linked_item = None

        if not linked_item or not linked_item.exists():
            pywikibot.output('%s does not have a wikidata item to link with. '
                             'Skipping.' % (linked_page))
            return

        if linked_item.title() == item.title():
            pywikibot.output('%s links to itself. Skipping.' % (linked_page))
            return

        return linked_item

    def treat(self, page, item):
        """
        Process a single page/item.

        Every transclusion of the configured template (or one of its
        redirects) on the page is scanned; for each configured parameter
        whose property is not yet present on the item, a claim is built
        from the parameter value and added to the item.

        @param page: page whose wikitext is harvested
        @param item: Wikidata item that receives the claims
        @raise KeyboardInterrupt: if ctrl-c was pressed before this page
        """
        if willstop:
            raise KeyboardInterrupt
        self.current_page = page
        item.get()
        if set(self.fields.values()) <= set(item.claims.keys()):
            pywikibot.output('%s item %s has claims for all properties. '
                             'Skipping.' % (page, item.title()))
            return

        pagetext = page.get()
        templates = textlib.extract_templates_and_params(pagetext)
        for (template, fielddict) in templates:
            # Clean up template
            try:
                template = pywikibot.Page(page.site, template,
                                          ns=10).title(withNamespace=False)
            except pywikibot.exceptions.InvalidTitle:
                pywikibot.error(
                    "Failed parsing template; '%s' should be the template "
                    "name." % template)
                continue

            # Only the template we were asked for (or a redirect to it)
            if template not in self.templateTitles:
                continue

            for field, value in fielddict.items():
                field = field.strip()
                value = value.strip()
                if not field or not value:
                    continue

                # Only fields that were mapped to a property are useful
                if field not in self.fields:
                    continue

                # Check if the property isn't already set
                claim = pywikibot.Claim(self.repo, self.fields[field])
                if claim.getID() in item.get().get('claims'):
                    pywikibot.output(
                        'A claim for %s already exists. Skipping.'
                        % claim.getID())
                    # TODO: Implement smarter approach to merging
                    # harvested values with existing claims esp.
                    # without overwriting humans unintentionally.
                    continue

                if claim.type == 'wikibase-item':
                    # Prefer the target of a wikilink; a value without
                    # link markup is used as the page title as it stands.
                    match = re.search(pywikibot.link_regex, value)
                    if match is not None:
                        value = match.group(1)
                    try:
                        link = pywikibot.Link(value)
                        linkedPage = pywikibot.Page(link)
                        if linkedPage.isRedirectPage():
                            linkedPage = linkedPage.getRedirectTarget()
                        linkedItem = pywikibot.ItemPage.fromPage(linkedPage)
                        claim.setTarget(linkedItem)
                        # Touch the item only so that a missing item raises
                        # NoPage here; the result itself is not needed.
                        linkedItem.title()
                    except pywikibot.exceptions.NoPage:
                        pywikibot.output(
                            "%s doesn't exist so I can't link to it"
                            % (linkedPage,))
                        continue
                elif claim.type in ('string', 'external-id'):
                    claim.setTarget(value.strip())
                elif claim.type == 'commonsMedia':
                    commonssite = pywikibot.Site("commons", "commons")
                    imagelink = pywikibot.Link(value, source=commonssite,
                                               defaultNamespace=6)
                    image = pywikibot.FilePage(imagelink)
                    if image.isRedirectPage():
                        image = pywikibot.FilePage(image.getRedirectTarget())
                    if not image.exists():
                        pywikibot.output(
                            "[[%s]] doesn't exist so I can't link to it"
                            % (image.title(),))
                        continue
                    claim.setTarget(image)
                else:
                    pywikibot.output(
                        '%s is not a supported datatype.' % claim.type)
                    continue

                pywikibot.output('Adding %s --> %s'
                                 % (claim.getID(), claim.getTarget()))
                item.addClaim(claim)
                # A generator might yield pages from multiple sites
                source = self.getSource(page.site)
                if source:
                    claim.addSource(source, bot=True)
def main(*args):
    """
    Process command line arguments and invoke bot.

    If args is an empty list, sys.argv is used.

    Arguments that are neither -template nor accepted by the generator
    factory are read as alternating "template_parameter PID" pairs.

    @param args: command line arguments
    @type args: list of unicode
    @raise ValueError: if the template_parameter/PID arguments are not
        given in pairs
    """
    commandline_arguments = []
    template_title = u''

    # Process global args and prepare generator args parser
    local_args = pywikibot.handle_args(args)
    gen = pg.GeneratorFactory()

    for arg in local_args:
        if arg.startswith('-template'):
            if len(arg) == 9:
                # Bare "-template": ask for the title interactively.
                template_title = pywikibot.input(
                    u'Please enter the template to work on:')
            else:
                # "-template:<title>": drop the option name and separator.
                template_title = arg[10:]
        elif gen.handleArg(arg):
            # -transcludes both selects the pages and names the template.
            if arg.startswith(u'-transcludes:'):
                template_title = arg[13:]
        else:
            commandline_arguments.append(arg)

    if not template_title:
        pywikibot.error(
            'Please specify either -template or -transcludes argument')
        return

    if len(commandline_arguments) % 2:
        raise ValueError(
            'Template parameters and property IDs must be given in pairs, '
            'but an odd number of arguments was found: %s'
            % ' '.join(commandline_arguments))

    # Even positions are template parameters, odd positions are their PIDs;
    # a parameter named twice keeps its last PID.
    fields = dict(zip(commandline_arguments[::2],
                      commandline_arguments[1::2]))

    generator = gen.getCombinedGenerator()
    if not generator:
        # No generator given: fall back to the template's transclusions.
        gen.handleArg(u'-transcludes:' + template_title)
        generator = gen.getCombinedGenerator()

    bot = HarvestRobot(generator, template_title, fields)
    bot.run()
# Run the bot only when the file is executed as a script; main() is called
# without arguments.
if __name__ == "__main__":
    main()
File Metadata
Details
Attached
Mime Type
text/x-python
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3516363
Default Alt Text
harvest_template.py (10 KB)
Attached To
Mode
T64014: Add support for item without link to harvest_template
Attached
Detach File
Event Timeline
Log In to Comment