Page MenuHomePhabricator
Paste P8201

splitdump.py
ActivePublic

Authored by Gilles on Mar 14 2019, 4:17 PM.
Referenced Files
F28388247: raw.txt
Mar 14 2019, 4:17 PM
Subscribers
None
import subprocess
# Split one bz2-compressed XML history dump into several output files, each
# covering a range of `step` ids, by building a single file spec for the
# external `writeuptopageid` tool and invoking it once.
input_file = '/mnt/data/xmldatadumps/public/svwiki/20190301/svwiki-20190301-pages-meta-history.xml.bz2'

# Ask the external tool for the last id in the dump. The parsing below expects
# its output to contain "<label>:<number>" and takes the text after the first
# colon.
output = subprocess.check_output(['getlastidinbz2xml', '-f', input_file, '--type', 'page'])

# check_output() returns bytes on Python 3, and bytes.split() rejects a str
# separator, so decode first. bytes.decode() also exists on Python 2, which
# keeps the script runnable on both. int() tolerates the trailing newline.
# NOTE(review): despite the name, the query uses `--type page`, so this is
# presumably the last *page* id rather than a revision id -- confirm.
revid = int(output.decode('utf-8').split(':')[1])

file_id = 0
step = 1000000
fspec = []
for range_start in range(0, revid, step):
    file_id += 1
    # The end bound is computed from the unadjusted start and clamped to the
    # last id, so the final chunk does not run past the end of the dump.
    range_end = min(range_start + step, revid)
    # The first chunk starts at id 1 instead of 0.
    range_start = max(range_start, 1)
    # One entry per output file: "<name>:<start>:<end>".
    # NOTE(review): consecutive entries share a boundary id (the end of one is
    # the start of the next), and the last entry ends at `revid`. Whether the
    # end bound is inclusive or exclusive is decided by writeuptopageid --
    # verify that the final id is not dropped.
    fspec.append('svwiki-%02d.bz2:%d:%d' % (file_id, range_start, range_end))

fspec = ','.join(fspec)
command = ['writeuptopageid', '-fspec', fspec, '-i', input_file, '-odir', '.']

# Echo the command for the operator before running it.
print(' '.join(command))
# NOTE(review): the exit status of the tool is not checked here.
subprocess.call(command)