F9302542: reflinks.py
Authored by Dvorapa (Pavel Dvořák), 2017-09-03 10:55 (UTC), 29 KB

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Fetch and add titles for bare links in references.
This bot will search for references which are only made of a link without title
(i.e. <ref>[https://www.google.fr/]</ref> or <ref>https://www.google.fr/</ref>)
and will fetch the html title from the link to use it as the title of the wiki
link in the reference, i.e.
<ref>[https://www.google.fr/search?q=test test - Google Search]</ref>
The bot checks every 20 edits a special stop page. If the page has been edited,
it stops.
Warning: Running this script on German Wikipedia is not allowed anymore.
As it uses it, you need to configure noreferences.py for your wiki, or it will
not work.
pdfinfo is needed for parsing pdf titles.
¶ms;
-limit:n Stops after n edits
-xml:dump.xml Should be used instead of a simple page fetching method from
pagegenerators.py for performance and load issues
-xmlstart Page to start with when using an XML dump
-ignorepdf Do not handle PDF files (handy if you use Windows and can't
get pdfinfo)
-summary Use a custom edit summary. Otherwise it uses the default
one from i18n/reflinks.py
"""
# (C) Nicolas Dumazet (NicDumZ), 2008
# (C) Pywikibot team, 2008-2017
#
# Distributed under the terms of the MIT license.
#
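
# Example invocation (an illustrative sketch, not part of the original file;
# it assumes the standard pwb.py wrapper and a configured user-config.py;
# 'Example' is a hypothetical category name):
#
#     python pwb.py reflinks -lang:en -family:wikipedia -cat:Example -limit:5
#
# Any pagegenerators option (see &params; in the docstring above) can be used
# to select the pages to work on.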
from __future__ import absolute_import, division, unicode_literals

import codecs
import os
import re
import socket
import subprocess
import sys
import tempfile

from functools import partial

import pywikibot

from pywikibot import comms, i18n, pagegenerators, textlib, Bot
from pywikibot import config2 as config
from pywikibot.pagegenerators import (
    XMLDumpPageGenerator as _XMLDumpPageGenerator,
)
from pywikibot.tools.formatter import color_format

import requests

from scripts import noreferences

if sys.version_info[0] > 2:
    import http.client as httplib
    from urllib.error import URLError
else:
    import httplib
    from urllib2 import URLError
docuReplacements = {'&params;': pagegenerators.parameterHelp}

localized_msg = ('fr', 'it', 'pl')  # localized message at MediaWiki

# localized message at specific wikipedia site
# should be moved to MediaWiki Pywikibot manual

stop_page = {
    'fr': u'Utilisateur:DumZiBoT/EditezCettePagePourMeStopper',
    'da': u'Bruger:DumZiBoT/EditThisPageToStopMe',
    'de': u'Benutzer:DumZiBoT/EditThisPageToStopMe',
    'fa': u'کاربر:Amirobot/EditThisPageToStopMe',
    'it': u'Utente:Marco27Bot/EditThisPageToStopMe',
    'ko': u'사용자:GrassnBreadRefBot/EditThisPageToStopMe1',
    'he': u'User:Matanyabot/EditThisPageToStopMe',
    'hu': u'User:Damibot/EditThisPageToStopMe',
    'en': u'User:DumZiBoT/EditThisPageToStopMe',
    'pl': u'Wikipedysta:MastiBot/EditThisPageToStopMe',
    'ru': u'User:Rubinbot/EditThisPageToStopMe',
    'zh': u'User:Sz-iwbot',
}

deadLinkTag = {
    'fr': u'[%s] {{lien mort}}',
    'da': u'[%s] {{dødt link}}',
    'fa': u'[%s] {{پیوند مرده}}',
    'he': u'{{קישור שבור}}',
    'hu': u'[%s] {{halott link}}',
    'ko': u'[%s] {{죽은 바깥 고리}}',
    'es': u'{{enlace roto2|%s}}',
    'it': u'{{Collegamento interrotto|%s}}',
    'en': u'[%s] {{dead link}}',
    'pl': u'[%s] {{Martwy link}}',
    'ru': u'[%s] {{subst:dead}}',
}

soft404 = re.compile(
    r'\D404(\D|\Z)|error|errdoc|Not.{0,3}Found|sitedown|eventlog',
    re.IGNORECASE)

# matches an URL at the index of a website
dirIndex = re.compile(
    r'^\w+://[^/]+/((default|index)\.(asp|aspx|cgi|htm|html|phtml|mpx|mspx|php|shtml|var))?$',
    re.IGNORECASE)

# Extracts the domain name
domain = re.compile(r'^(\w+)://(?:www.|)([^/]+)')
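
# Illustration (not part of the original file): domain.findall() keeps only
# the scheme and the host, dropping any 'www.' prefix, so a redirect within
# the same site compares equal in ReferencesRobot.run():
#
#     domain.findall('http://www.example.org/a')  # [('http', 'example.org')]
#     domain.findall('http://example.org/b')      # [('http', 'example.org')]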
globalbadtitles = r"""
# is
(test|
# starts with
    ^\W*(
            register
            |registration
            |(sign|log)[ \-]?in
            |subscribe
            |sign[ \-]?up
            |log[ \-]?on
            |untitled[ ]?(document|page|\d+|$)
            |404[ ]
        ).*
# anywhere
    |.*(
            403[ ]forbidden
            |(404|page|file|information|resource).*not([ ]*be)?[ ]*(available|found)
            |site.*disabled
            |error[ ]404
            |error.+not[ ]found
            |not[ ]found.+error
            |404[ ]error
            |\D404\D
            |check[ ]browser[ ]settings
            |log[ \-]?(on|in)[ ]to
            |site[ ]redirection
        ).*
# ends with
    |.*(
            register
            |registration
            |(sign|log)[ \-]?in
            |subscribe|sign[ \-]?up
            |log[ \-]?on
        )\W*$
)
"""

# Language-specific bad titles
badtitles = {
    'en': '',
    'fr': '.*(404|page|site).*en +travaux.*',
    'es': '.*sitio.*no +disponible.*',
    'it': '((pagina|sito) (non trovat[ao]|inesistente)|accedi|errore)',
    'ru': u'.*(Страница|страница).*(не[ ]*найдена|осутствует).*',
}
# Regex that matches bare references
linksInRef = re.compile(
    # bracketed URLs
    r'(?i)<ref(?P<name>[^>]*)>\s*\[?(?P<url>(?:http|https)://(?:' +
    # unbracketed with()
    r'^\[\]\s<>"]+\([^\[\]\s<>"]+[^\[\]\s\.:;\\,<>\?"]+|' +
    # unbracketed without ()
    r'[^\[\]\s<>"]+[^\[\]\s\)\.:;\\,<>\?"]+|[^\[\]\s<>"]+))[!?,\s]*\]?\s*</ref>')
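
# Illustration (not part of the original file): linksInRef matches bare
# references such as
#     <ref>http://example.org/page</ref>
#     <ref name="foo">[http://example.org/page]</ref>
# but not a reference that already carries a title, e.g.
#     <ref>[http://example.org/page Example title]</ref>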
# Download this file :
# http://www.twoevils.org/files/wikipedia/404-links.txt.gz
# ( maintained by User:Dispenser )
listof404pages = '404-links.txt'

XmlDumpPageGenerator = partial(
    _XMLDumpPageGenerator, text_predicate=linksInRef.search)


class RefLink(object):

    """Container to handle a single bare reference."""

    def __init__(self, link, name):
        """Constructor."""
        self.refname = name
        self.link = link
        self.site = pywikibot.Site()
        self.linkComment = i18n.twtranslate(self.site, 'reflinks-comment')
        self.url = re.sub(u'#.*', '', self.link)
        self.title = None

    def refTitle(self):
        """Return the <ref> with its new title."""
        return '<ref%s>[%s %s<!-- %s -->]</ref>' % (self.refname, self.link,
                                                    self.title,
                                                    self.linkComment)

    def refLink(self):
        """No title has been found, return the unbracketed link."""
        return '<ref%s>%s</ref>' % (self.refname, self.link)

    def refDead(self):
        """Dead link, tag it with a {{dead link}}."""
        tag = i18n.translate(self.site, deadLinkTag)
        if not tag:
            tag = self.link
        elif '%s' in tag:
            tag = tag % self.link
        return '<ref%s>%s</ref>' % (self.refname, tag)

    def transform(self, ispdf=False):
        """Normalize the title."""
        # convert html entities
        if not ispdf:
            self.title = pywikibot.html2unicode(self.title)
        self.title = re.sub(r'-+', '-', self.title)
        # remove formatting, i.e long useless strings
        self.title = re.sub(r'[\.+\-=]{4,}', ' ', self.title)
        # remove \n and \r and Unicode spaces from titles
        self.title = re.sub(r'(?u)\s', ' ', self.title)
        self.title = re.sub(r'[\n\r\t]', ' ', self.title)
        # remove extra whitespaces
        # remove leading and trailing ./;/,/-/_/+/ /
        self.title = re.sub(r' +', ' ', self.title.strip(r'=.;,-+_ '))

        self.avoid_uppercase()
        # avoid closing the link before the end
        self.title = self.title.replace(']', '&#93;')
        # avoid multiple } being interpreted as a template inclusion
        self.title = self.title.replace('}}', '}&#125;')
        # prevent multiple quotes being interpreted as '' or '''
        self.title = self.title.replace('\'\'', '\'&#39;')
        self.title = pywikibot.unicode2html(self.title, self.site.encoding())
        # TODO : remove HTML when both opening and closing tags are included

    def avoid_uppercase(self):
        """
        Convert to title()-case if title is 70% uppercase characters.

        Skip titles of 6 characters or fewer.
        """
        if len(self.title) <= 6:
            return
        nb_upper = 0
        nb_letter = 0
        for letter in self.title:
            if letter.isupper():
                nb_upper += 1
            if letter.isalpha():
                nb_letter += 1
            if letter.isdigit():
                return
        if nb_upper / (nb_letter + 1) > .70:
            self.title = self.title.title()
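
# Illustration (not part of the original file): a hypothetical all-caps title
# such as 'EXAMPLE DOMAIN' (13 letters, all uppercase, no digits) gives
# nb_upper / (nb_letter + 1) = 13 / 14 > .70, so avoid_uppercase() rewrites it
# to 'Example Domain'; titles containing any digit are left untouched.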


class DuplicateReferences(object):

    """Helper to de-duplicate references in text.

    When some references are duplicated in an article,
    name the first, and remove the content of the others.
    """

    def __init__(self):
        """Constructor."""
        # Match references
        self.REFS = re.compile(
            r'(?i)<ref(?P<params>[^>/]*)>(?P<content>.*?)</ref>')
        self.NAMES = re.compile(
            r'(?i).*name\s*=\s*(?P<quote>"?)\s*(?P<name>.+)\s*(?P=quote).*')
        self.GROUPS = re.compile(
            r'(?i).*group\s*=\s*(?P<quote>"?)\s*(?P<group>.+)\s*(?P=quote).*')
        self.autogen = i18n.twtranslate(pywikibot.Site(), 'reflinks-autogen')

    def process(self, text):
        """Process the page."""
        # keys are ref groups
        # values are a dict where :
        #   keys are ref content
        #   values are [name, [list of full ref matches],
        #               quoted, need_to_change]
        foundRefs = {}
        foundRefNames = {}
        # Replace key by [value, quoted]
        namedRepl = {}

        for match in self.REFS.finditer(text):
            content = match.group('content')
            if not content.strip():
                continue
            params = match.group('params')
            group = self.GROUPS.match(params)
            if group not in foundRefs:
                foundRefs[group] = {}
            groupdict = foundRefs[group]
            if content in groupdict:
                v = groupdict[content]
                v[1].append(match.group())
            else:
                v = [None, [match.group()], False, False]
            name = self.NAMES.match(params)
            if name:
                quoted = name.group('quote') == '"'
                name = name.group('name')
                if v[0]:
                    if v[0] != name:
                        namedRepl[name] = [v[0], v[2]]
                else:
                    # First name associated with this content
                    if name == 'population':
                        pywikibot.output(content)
                    if name not in foundRefNames:
                        # first time ever we meet this name
                        if name == 'population':
                            pywikibot.output("in")
                        v[2] = quoted
                        v[0] = name
                    else:
                        # if has_key, means that this name is used
                        # with another content. We'll need to change it
                        v[3] = True
                foundRefNames[name] = 1
            groupdict[content] = v

        id = 1
        while self.autogen + str(id) in foundRefNames:
            id += 1
        for (g, d) in foundRefs.items():
            if g:
                group = u'group="%s" ' % g
            else:
                group = u''
            for (k, v) in d.items():
                if len(v[1]) == 1 and not v[3]:
                    continue
                name = v[0]
                if not name:
                    name = self.autogen + str(id)
                    id += 1
                elif v[2]:
                    name = u'"%s"' % name
                named = u'<ref %sname=%s>%s</ref>' % (group, name, k)
                text = text.replace(v[1][0], named, 1)

                # make sure that the first (named ref) is not
                # removed later :
                pos = text.index(named) + len(named)
                header = text[:pos]
                end = text[pos:]

                unnamed = u'<ref %sname=%s />' % (group, name)
                for ref in v[1][1:]:
                    end = end.replace(ref, unnamed)
                text = header + end

        for (k, v) in namedRepl.items():
            # TODO : Support ref groups
            name = v[0]
            if v[1]:
                name = u'"%s"' % name
            text = re.sub(
                u'<ref name\\s*=\\s*(?P<quote>"?)\\s*%s\\s*(?P=quote)\\s*/>' % k,
                u'<ref name=%s />' % name,
                text)
        return text
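
# Rough illustration (not part of the original file) of what process() does,
# assuming the translated 'reflinks-autogen' prefix is 'autogen':
#
#     in : A<ref>foo</ref> B<ref>foo</ref>
#     out: A<ref name=autogen1>foo</ref> B<ref name=autogen1 />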


class ReferencesRobot(Bot):

    """References bot."""

    def __init__(self, generator, **kwargs):
        """- generator : Page generator."""
        self.availableOptions.update({
            'ignorepdf': False,  # boolean
            'limit': None,  # int, stop after n modified pages
            'summary': None,
        })

        super(ReferencesRobot, self).__init__(**kwargs)
        self.generator = generator
        self.site = pywikibot.Site()
        self._use_fake_user_agent = config.fake_user_agent_default.get(
            'reflinks', False)
        # Check
        manual = 'mw:Manual:Pywikibot/refLinks'
        code = None
        for alt in [self.site.code] + i18n._altlang(self.site.code):
            if alt in localized_msg:
                code = alt
                break
        if code:
            manual += '/%s' % code

        if self.getOption('summary') is None:
            self.msg = i18n.twtranslate(self.site, 'reflinks-msg', locals())
        else:
            self.msg = self.getOption('summary')

        local = i18n.translate(self.site, badtitles)
        if local:
            bad = '(' + globalbadtitles + '|' + local + ')'
        else:
            bad = globalbadtitles
        self.titleBlackList = re.compile(bad, re.I | re.S | re.X)
        self.norefbot = noreferences.NoReferencesBot(None, verbose=False)
        self.deduplicator = DuplicateReferences()

        site_stop_page = i18n.translate(self.site, stop_page)
        if site_stop_page:
            self.stop_page = pywikibot.Page(self.site, site_stop_page)
            if self.stop_page.exists():
                self.stop_page_rev_id = self.stop_page.latest_revision_id
            else:
                pywikibot.output('Warning: The stop page %s does not exist'
                                 % self.stop_page.title(asLink=True))

        # Regex to grasp content-type meta HTML tag in HTML source
        self.META_CONTENT = re.compile(br'(?i)<meta[^>]*content\-type[^>]*>')
        # Extract the encoding from a charset property (from content-type !)
        self.CHARSET = re.compile(r'(?i)charset\s*=\s*(?P<enc>[^\'",;>/]*)')
        # Extract html title from page
        self.TITLE = re.compile(r'(?is)(?<=<title>).*?(?=</title>)')
        # Matches content inside <script>/<style>/HTML comments
        self.NON_HTML = re.compile(
            br'(?is)<script[^>]*>.*?</script>|<style[^>]*>.*?</style>|'
            br'<!--.*?-->|<!\[CDATA\[.*?\]\]>')

        # Authorized mime types for HTML pages
        self.MIME = re.compile(
            r'application/(?:xhtml\+xml|xml)|text/(?:ht|x)ml')

    def httpError(self, err_num, link, pagetitleaslink):
        """Log HTTP Error."""
        pywikibot.stdout('HTTP error ({0}) for {1} on {2}'
                         ''.format(err_num, link, pagetitleaslink))

    def getPDFTitle(self, ref, f):
        """Use pdfinfo to retrieve title from a PDF.

        FIXME: Unix-only, I'm afraid.
        """
        pywikibot.output(u'PDF file.')
        fd, infile = tempfile.mkstemp()
        urlobj = os.fdopen(fd, 'w+')
        urlobj.write(f.content)

        try:
            pdfinfo_out = subprocess.Popen([r"pdfinfo", "/dev/stdin"],
                                           stdin=urlobj,
                                           stdout=subprocess.PIPE,
                                           stderr=subprocess.PIPE,
                                           shell=False).communicate()[0]
            for aline in pdfinfo_out.splitlines():
                if aline.lower().startswith('title'):
                    ref.title = aline.split(None)[1:]
                    ref.title = ' '.join(ref.title)
                    if ref.title != '':
                        pywikibot.output(u'title: %s' % ref.title)
            pywikibot.output(u'PDF done.')
        except ValueError:
            pywikibot.output(u'pdfinfo value error.')
        except OSError:
            pywikibot.output(u'pdfinfo OS error.')
        except BaseException:
            # Ignore errors
            pywikibot.output(u'PDF processing error.')
            pass
        finally:
            urlobj.close()
            os.unlink(infile)

    def run(self):
        """Run the Bot."""
        try:
            deadLinks = codecs.open(listof404pages, 'r', 'latin_1').read()
        except IOError:
            pywikibot.output(
                'You need to download '
                'http://www.twoevils.org/files/wikipedia/404-links.txt.gz '
                'and to ungzip it in the same directory')
            raise
        editedpages = 0
        for page in self.generator:
            try:
                # Load the page's text from the wiki
                new_text = page.get()
                if not page.canBeEdited():
                    pywikibot.output(u"You can't edit page %s"
                                     % page.title(asLink=True))
                    continue
            except pywikibot.NoPage:
                pywikibot.output(u'Page %s not found'
                                 % page.title(asLink=True))
                continue
            except pywikibot.IsRedirectPage:
                pywikibot.output(u'Page %s is a redirect'
                                 % page.title(asLink=True))
                continue

            # for each link to change
            for match in linksInRef.finditer(
                    textlib.removeDisabledParts(page.get())):

                link = match.group(u'url')
                # debugging purpose
                # print link
                if u'jstor.org' in link:
                    # TODO: Clean URL blacklist
                    continue

                ref = RefLink(link, match.group('name'))
                f = None

                try:
                    f = comms.http.fetch(
                        ref.url,
                        use_fake_user_agent=self._use_fake_user_agent)

                    # Try to get Content-Type from server
                    contentType = f.response_headers.get('content-type')
                    if contentType and not self.MIME.search(contentType):
                        if ref.link.lower().endswith('.pdf') and \
                           not self.getOption('ignorepdf'):
                            # If file has a PDF suffix
                            self.getPDFTitle(ref, f)
                        else:
                            pywikibot.output(color_format(
                                '{lightyellow}WARNING{default} : '
                                'media : {0} ', ref.link))
                        if ref.title:
                            if not re.match(
                                    u'(?i) *microsoft (word|excel|visio)',
                                    ref.title):
                                ref.transform(ispdf=True)
                                repl = ref.refTitle()
                            else:
                                pywikibot.output(color_format(
                                    '{lightyellow}WARNING{default} : '
                                    'PDF title blacklisted : {0} ',
                                    ref.title))
                                repl = ref.refLink()
                        else:
                            repl = ref.refLink()
                        new_text = new_text.replace(match.group(), repl)
                        continue

                    # Get the real url where we end (http redirects !)
                    redir = f.data.url
                    if redir != ref.link and \
                       domain.findall(redir) == domain.findall(link):
                        if soft404.search(redir) and \
                           not soft404.search(ref.link):
                            pywikibot.output(color_format(
                                '{lightyellow}WARNING{default} : '
                                'Redirect 404 : {0} ', ref.link))
                            continue
                        if dirIndex.match(redir) and \
                           not dirIndex.match(ref.link):
                            pywikibot.output(color_format(
                                u'{lightyellow}WARNING{default} : '
                                u'Redirect to root : {0} ', ref.link))
                            continue

                    if f.status != requests.codes.ok:
                        pywikibot.output(u'HTTP error (%s) for %s on %s'
                                         % (f.status, ref.url,
                                            page.title(asLink=True)),
                                         toStdout=True)
                        # 410 Gone, indicates that the resource has been
                        # purposely removed
                        if f.status == 410 or \
                           (f.status == 404 and
                                (u'\t%s\t' % ref.url in deadLinks)):
                            repl = ref.refDead()
                            new_text = new_text.replace(match.group(), repl)
                        continue

                    linkedpagetext = f.raw
                except UnicodeError:
                    # example : http://www.adminet.com/jo/20010615¦/ECOC0100037D.html
                    # in [[fr:Cyanure]]
                    pywikibot.output(color_format(
                        '{lightred}Bad link{default} : {0} in {1}',
                        ref.url, page.title(asLink=True)))
                    continue
                except (URLError,
                        socket.error,
                        IOError,
                        httplib.error) as e:
                    pywikibot.output(u'Can\'t retrieve page %s : %s'
                                     % (ref.url, e))
                    continue

                # remove <script>/<style>/comments/CDATA tags
                linkedpagetext = self.NON_HTML.sub(b'', linkedpagetext)

                meta_content = self.META_CONTENT.search(linkedpagetext)
                enc = []
                s = None
                if contentType:
                    # use charset from http header
                    s = self.CHARSET.search(contentType)
                if meta_content:
                    tag = meta_content.group()
                    # Prefer the contentType from the HTTP header :
                    if not contentType:
                        contentType = tag
                    if not s:
                        # use charset from html
                        s = self.CHARSET.search(str(tag))
                if s:
                    tmp = s.group('enc').strip("\"' ").lower()
                    naked = re.sub(r'[ _\-]', '', tmp)
                    # Convert to python correct encoding names
                    if naked == "gb2312":
                        enc.append("gbk")
                    elif naked == "shiftjis":
                        enc.append("shift jis 2004")
                        enc.append("cp932")
                    elif naked == "xeucjp":
                        enc.append("euc-jp")
                    else:
                        enc.append(tmp)
                else:
                    pywikibot.output(u'No charset found for %s' % ref.link)
                if not contentType:
                    pywikibot.output(u'No content-type found for %s'
                                     % ref.link)
                    continue
                elif not self.MIME.search(contentType):
                    pywikibot.output(color_format(
                        '{lightyellow}WARNING{default} : media : {0} ',
                        ref.link))
                    repl = ref.refLink()
                    new_text = new_text.replace(match.group(), repl)
                    continue

                # Ugly hacks to try to survive when both server and page
                # return no encoding.
                # Uses most used encodings for each national suffix
                if u'.ru' in ref.link or u'.su' in ref.link:
                    # see http://www.sci.aha.ru/ATL/ra13a.htm : no server
                    # encoding, no page encoding
                    enc = enc + ['koi8-r', 'windows-1251']
                elif u'.jp' in ref.link:
                    enc.append("shift jis 2004")
                    enc.append("cp932")
                elif u'.kr' in ref.link:
                    enc.append("euc-kr")
                    enc.append("cp949")
                elif u'.zh' in ref.link:
                    enc.append("gbk")

                if 'utf-8' not in enc:
                    enc.append('utf-8')
                try:
                    u = linkedpagetext.decode(enc[0])   # Bug T69410
                except (UnicodeDecodeError, LookupError) as e:
                    pywikibot.output(u'%s : Decoding error - %s'
                                     % (ref.link, e))
                    continue

                # Retrieves the first non empty string inside <title> tags
                for m in self.TITLE.finditer(u):
                    t = m.group()
                    if t:
                        ref.title = t
                        ref.transform()
                        if ref.title:
                            break

                if not ref.title:
                    repl = ref.refLink()
                    new_text = new_text.replace(match.group(), repl)
                    pywikibot.output(u'%s : No title found...' % ref.link)
                    continue

                # XXX Ugly hack
                if u'Ã©' in ref.title:
                    repl = ref.refLink()
                    new_text = new_text.replace(match.group(), repl)
                    pywikibot.output(u'%s : Hybrid encoding...' % ref.link)
                    continue

                if self.titleBlackList.match(ref.title):
                    repl = ref.refLink()
                    new_text = new_text.replace(match.group(), repl)
                    pywikibot.output(color_format(
                        '{lightred}WARNING{default} {0} : '
                        'Blacklisted title ({1})', ref.link, ref.title))
                    continue

                # Truncate long titles. 175 is arbitrary
                if len(ref.title) > 175:
                    ref.title = ref.title[:175] + "..."

                repl = ref.refTitle()
                new_text = new_text.replace(match.group(), repl)

            # Add <references/> when needed, but ignore templates !
            if page.namespace != 10:
                if self.norefbot.lacksReferences(new_text):
                    new_text = self.norefbot.addReferences(new_text)

            new_text = self.deduplicator.process(new_text)

            self.userPut(page, page.text, new_text, summary=self.msg,
                         ignore_save_related_errors=True,
                         ignore_server_errors=True)

            if new_text == page.text:
                continue
            else:
                editedpages += 1

            if self.getOption('limit') and \
               editedpages >= self.getOption('limit'):
                pywikibot.output('Edited %s pages, stopping.'
                                 % self.getOption('limit'))
                return

            if editedpages % 20 == 0:
                pywikibot.output(color_format(
                    '{lightgreen}Checking stop page...{default}'))
                actual_rev = self.stop_page.latest_revision_id
                if actual_rev != self.stop_page_rev_id:
                    pywikibot.output(
                        u'[[%s]] has been edited : '
                        u'Someone wants us to stop.' % self.stop_page)
                    return


def main(*args):
    """
    Process command line arguments and invoke bot.

    If args is an empty list, sys.argv is used.

    @param args: command line arguments
    @type args: list of unicode
    """
    xmlFilename = None
    xmlStart = None
    options = {}
    generator = None

    # Process global args and prepare generator args parser
    local_args = pywikibot.handle_args(args)
    genFactory = pagegenerators.GeneratorFactory()

    for arg in local_args:
        if arg.startswith('-summary:'):
            options['summary'] = arg[9:]
        elif arg == '-always':
            options['always'] = True
        elif arg == '-ignorepdf':
            options['ignorepdf'] = True
        elif arg.startswith('-limit:'):
            options['limit'] = int(arg[7:])
        elif arg.startswith('-xmlstart'):
            if len(arg) == 9:
                xmlStart = pywikibot.input(
                    u'Please enter the dumped article to start with:')
            else:
                xmlStart = arg[10:]
        elif arg.startswith('-xml'):
            if len(arg) == 4:
                xmlFilename = pywikibot.input(
                    u'Please enter the XML dump\'s filename:')
            else:
                xmlFilename = arg[5:]
        else:
            genFactory.handleArg(arg)

    if xmlFilename:
        generator = XmlDumpPageGenerator(xmlFilename, xmlStart,
                                         genFactory.namespaces)
    if not generator:
        generator = genFactory.getCombinedGenerator()
    if not generator:
        pywikibot.bot.suggest_help(missing_generator=True)
        return False

    if not genFactory.nopreload:
        generator = pagegenerators.PreloadingGenerator(generator)
    generator = pagegenerators.RedirectFilterPageGenerator(generator)
    bot = ReferencesRobot(generator, **options)
    bot.run()
    return True


if __name__ == "__main__":
    main()

Attached to: T174883: reflinks.py: Title must be specified and not empty if a source is a Site.