The HTML data returned by the following wiki site on the InterWiki Map fails to decode:
http://wiki.genealogy.net/index.php/$1
The following is a non-wiki site:
https://lists.wikimedia.org/mailman/listinfo/$1
The following is the traceback produced by detect_site_type:
TRACEBACK:
>>> pywikibot.detect_site_type("http://wiki.genealogy.net/index.php/$1")
Traceback (most recent call last):
File "<console>", line 1, in <module>
File "C:\Users\Acer\Documents\GitHub\core\pywikibot\__init__.py", line 122, in detect_site_type
data = request.content
File "C:\Users\Acer\Documents\GitHub\core\pywikibot\comms\threadedhttp.py", line 496, in content
return self.decode(self.encoding)
File "C:\Users\Acer\Documents\GitHub\core\pywikibot\comms\threadedhttp.py", line 486, in encoding
raise self._encoding
UnicodeDecodeError: 'utf8' codec can't decode byte 0xfc in position 10989: invalid start byte
SAMPLE CODE:
def detect_site_type(url):
    """Return a short string describing the wiki engine found at *url*.

    The page is fetched and its ``<meta name="generator">`` tag is inspected.

    :param url: URL to probe; an empty scheme is treated as ``http``.
    :return: the generator string when it mentions "MediaWiki", otherwise
        one of the diagnostic messages below:

        - ``'Not a wikiengine site - ftp'`` / ``'... - irc'`` for those schemes
        - ``'No scheme satisfied'`` for any other unsupported scheme
        - ``'Detection Failed : <error>'`` when fetching or decoding fails
        - ``'Not a MediaWiki site.'`` when the generator is something else
        - ``'generator is empty'`` when no generator tag was found
    """
    up = urlparse(url)

    # Reject schemes that cannot serve a wiki page before doing any I/O.
    if up.scheme not in ('http', 'https', ''):
        if up.scheme == 'ftp':
            return 'Not a wikiengine site - ftp'
        if up.scheme == 'irc':
            return 'Not a wikiengine site - irc'
        return 'No scheme satisfied'

    if up.scheme == '':
        url = 'http:' + url

    try:
        request = pywikibot.comms.http.fetch(url)
        # Accessing ``content`` decodes the response body and can raise
        # UnicodeDecodeError when the body does not match the detected
        # encoding, so it must sit inside the guarded region as well.
        data = request.content
    except Exception as e:
        return 'Detection Failed : ' + str(e)

    wp = WikiHTMLPageParser()
    wp.feed(data)

    if not wp.generator:
        return 'generator is empty'
    if "MediaWiki" not in wp.generator:
        return 'Not a MediaWiki site.'
    return wp.generator
class WikiHTMLPageParser(HTMLParser):
    """Wiki HTML page parser.

    While a page is fed in, records the ``content`` of the
    ``<meta name="generator">`` tag and the ``href`` of the
    ``<link rel="EditURI">`` tag.
    """

    def __init__(self, *args, **kwargs):
        """Initialise the parser; all arguments are passed to HTMLParser."""
        HTMLParser.__init__(self, *args, **kwargs)
        # Both stay None until a matching tag is seen, so callers can
        # always read them without risking an AttributeError.
        self.generator = None
        self.edituri = None

    def handle_starttag(self, tag, attrs):
        """Record the generator / EditURI values from meta and link tags.

        :param tag: lower-cased tag name supplied by HTMLParser
        :param attrs: list of ``(name, value)`` attribute pairs
        """
        attrs = dict(attrs)
        if tag == "meta":
            if attrs.get("name") == "generator":
                content = attrs.get("content")
                # Ignore a malformed generator tag without a content
                # value rather than raising KeyError mid-parse.
                if content is not None:
                    self.generator = content
        elif tag == "link":
            if attrs.get("rel") == "EditURI":
                href = attrs.get("href")
                if href is not None:
                    self.edituri = href