See https://docs.python.org/3/library/xml.etree.elementtree.html#xml.etree.ElementTree.iterparse
Note
iterparse() only guarantees that it has seen the “>” character of a starting tag when it emits a “start” event, so the attributes are defined, but the contents of the text and tail attributes are undefined at that point. The same applies to the element children; they may or may not be present.
If you need a fully populated element, look for “end” events instead.
Steps to replicate the issue (include links if applicable):
- apply this patch
diff --git a/pywikibot/xmlreader.py b/pywikibot/xmlreader.py
index 5d3575e6b..98a420364 100644
--- a/pywikibot/xmlreader.py
+++ b/pywikibot/xmlreader.py
@@ -235,10 +235,23 @@ class XmlDump:
             elem.clear()
             self.root.clear()
 
+    def _assert(self, elem):
+        "assert."
+        assert self.title == elem.findtext(f'{{{self.uri}}}title')
+        assert self.ns == elem.findtext(f'{{{self.uri}}}ns')
+        assert self.pageid == elem.findtext(f'{{{self.uri}}}id')
+        assert self.restrictions == elem.findtext(f'{{{self.uri}}}restrictions')
+        assert self.isredirect == (elem.findtext(f'{{{self.uri}}}redirect') is not None)
+        edit, move = parseRestrictions(elem.findtext(f'{{{self.uri}}}restrictions'))
+        assert self.editRestriction == edit
+        assert self.moveRestriction == move
+
     def _parse_all(self, event, elem):
         """Parser that yields all revisions."""
         if event == 'start' and elem.tag == f'{{{self.uri}}}page':
             self._headers(elem)
+        if event == 'end' and elem.tag == f'{{{self.uri}}}page':
+            self._assert(elem)
         if event == 'end' and elem.tag == f'{{{self.uri}}}revision':
             yield self._create_revision(elem)
             elem.clear()
- run
from pywikibot import xmlreader

filename = 'enwikisource.xml'  # use a dump with full history per page
dump = xmlreader.XmlDump(filename, allrevisions=True)
gen = dump.parse()
for i, rev in enumerate(gen):
    rev.text = rev.text[0:10] + ' ...'
    print((rev.__dict__['revisionid'], vars(rev)))
What happens?:
('4054131', {'title': 'Presidential Radio Address - 3 March 2001', 'ns': '0', 'id': '', 'text': '{{header\n ...', 'username': 'Mpaa', 'ipedit': False, 'timestamp': '2012-09-06T23:31:28Z', 'editRestriction': None, 'moveRestriction': None, 'revisionid': '4054131', 'comment': 'year/noyearcat in header + changed cat + changed note', 'isredirect': False})
---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
File ~/Downloads/xmldump/test_xml.py:63
     42     yield next(dump.parse())
   (...)
---> 63 for i, rev in enumerate(gen):
     64     rev.text = rev.text[0:10] + ' ...'
     65     print((rev.__dict__['revisionid'], vars(rev)))

File ~/python/core/pywikibot/xmlreader.py:190, in XmlDump.parse(self)
    188     self.root = elem
    189     continue
--> 190 yield from self._parse(event, elem)

File ~/python/core/pywikibot/xmlreader.py:254, in XmlDump._parse_all(self, event, elem)
    252     self._headers(elem)
    253 if event == 'end' and elem.tag == f'{{{self.uri}}}page':
--> 254     self._assert(elem)
    255 if event == 'end' and elem.tag == f'{{{self.uri}}}revision':
    256     yield self._create_revision(elem)

File ~/python/core/pywikibot/xmlreader.py:242, in XmlDump._assert(self, elem)
    240 assert self.title == elem.findtext(f'{{{self.uri}}}title')
    241 assert self.ns == elem.findtext(f'{{{self.uri}}}ns')
--> 242 assert self.pageid == elem.findtext(f'{{{self.uri}}}id')
    243 assert self.restrictions == elem.findtext(f'{{{self.uri}}}restrictions')
    244 assert self.isredirect == (elem.findtext(f'{{{self.uri}}}redirect') is not None)

AssertionError:

> /home/pc/python/core/pywikibot/xmlreader.py(242)_assert()
    240 assert self.title == elem.findtext(f'{{{self.uri}}}title')
    241 assert self.ns == elem.findtext(f'{{{self.uri}}}ns')
--> 242 assert self.pageid == elem.findtext(f'{{{self.uri}}}id')
    243 assert self.restrictions == elem.findtext(f'{{{self.uri}}}restrictions')
    244 assert self.isredirect == (elem.findtext(f'{{{self.uri}}}redirect') is not None)

ipdb> elem.findtext(f'{{{self.uri}}}id')
'317'
What should have happened instead?:
no errors