The following wiki site, taken from the InterWiki Map, fails while decoding its HTML data:
http://wiki.genealogy.net/index.php/$1
The following is a non-wiki site, included for comparison:
https://lists.wikimedia.org/mailman/listinfo/$1
The following is the traceback produced by detect_site_type:
TRACEBACK:
>>> pywikibot.detect_site_type("http://wiki.genealogy.net/index.php/$1") Traceback (most recent call last): File "<console>", line 1, in <module> File "C:\Users\Acer\Documents\GitHub\core\pywikibot\__init__.py", line 122, in detect_site_type data = request.content File "C:\Users\Acer\Documents\GitHub\core\pywikibot\comms\threadedhttp.py", line 496, in content return self.decode(self.encoding) File "C:\Users\Acer\Documents\GitHub\core\pywikibot\comms\threadedhttp.py", line 486, in encoding raise self._encoding UnicodeDecodeError: 'utf8' codec can't decode byte 0xfc in position 10989: invalid start byte
SAMPLE CODE:
def detect_site_type(url):
    """Detect whether *url* points to a MediaWiki-powered site.

    @param url: the URL to probe (http/https, or scheme-less).
    @return: the site's generator string (e.g. 'MediaWiki 1.27') on
        success, otherwise a human-readable diagnostic message.
    """
    up = urlparse(url)
    if up.scheme in ('http', 'https', ''):
        if up.scheme == '':
            # Scheme-less URL: default to plain http.
            url = 'http:' + url
        try:
            request = pywikibot.comms.http.fetch(url)
        except Exception as e:
            return 'Detection Failed : ' + str(e)
        try:
            data = request.content
        except UnicodeDecodeError:
            # Some sites (e.g. wiki.genealogy.net) serve bytes that are
            # not valid UTF-8, which makes request.content re-raise the
            # stored decode error.  Fall back to a lossy decode of the
            # raw response bytes so detection can still proceed.
            # NOTE(review): assumes request.raw holds the undecoded
            # response body — confirm against threadedhttp.HttpRequest.
            data = request.raw.decode('utf-8', errors='replace')
    elif up.scheme == 'ftp':
        return 'Not a wikiengine site - ftp'
    elif up.scheme == 'irc':
        return 'Not a wikiengine site - irc'
    else:
        return 'No scheme satisfied'
    wp = WikiHTMLPageParser()
    wp.feed(data)
    if wp.generator:
        if "MediaWiki" not in wp.generator:
            return 'Not a MediaWiki site.'
        return wp.generator
    else:
        return 'generator is empty'


class WikiHTMLPageParser(HTMLParser):

    """Wiki HTML page parser.

    Collects the <meta name="generator"> content and the
    <link rel="EditURI"> href while feeding an HTML document.
    """

    def __init__(self, *args, **kwargs):
        """Initializer."""
        HTMLParser.__init__(self, *args, **kwargs)
        self.generator = None
        # Bug fix: edituri was previously only created inside
        # handle_starttag, so reading it on pages without an EditURI
        # link raised AttributeError.
        self.edituri = None

    def handle_starttag(self, tag, attrs):
        """Record generator meta tag and EditURI link, if present."""
        attrs = dict(attrs)
        if tag == "meta":
            if attrs.get("name") == "generator":
                # .get() instead of [] so a malformed <meta> without a
                # content attribute cannot abort the whole feed().
                self.generator = attrs.get("content")
        if tag == "link":
            if attrs.get("rel") == "EditURI":
                self.edituri = attrs.get("href")