Page MenuHomePhabricator
Paste P631

(An Untitled Masterwork)
ActivePublic

Authored by MZMcBride on May 10 2015, 2:16 AM.
#! /usr/bin/env python
# Public domain; MZMcBride; 2015
import bz2
import re
# Open both files as UTF-8 text (Python 3).  A BZ2File opened in plain 'r'
# mode yields bytes on Python 3, which breaks every str comparison made on the
# dump lines further down; bz2.open() in 'rt' mode decodes them instead.
# newline='\n' keeps the binary-mode line semantics: lines end only at '\n'.
input_file = bz2.open('/data/scratch/dumps/enwiki/20150403/'
                      'enwiki-20150403-pages-meta-current.xml.bz2',
                      'rt', encoding='utf-8', newline='\n')
# One template name is written per line; explicit UTF-8 keeps non-ASCII
# template names independent of the locale's default encoding.
log = open('templates-by-invocations-in-articles-enwiki-20150403.txt', 'w',
           encoding='utf-8')

# Captures the text after '{{' up to the first '|' or '}'.
template_re = re.compile(r'{{([^|}]+)')

# Parser state shared with the main loop below.
namespace_id = None   # value of the most recently seen <ns> element
text = []             # accumulated pieces of a multi-line <text> element
loop = False          # True while inside a multi-line <text> element
pages_processed = 0   # number of <ns> lines seen so far
def extract_templates(page_text, pattern=None):
    """Return the template names invoked in *page_text*, in order.

    The text is scanned one line at a time, so a captured name never spans
    a line break.  Each name is stripped of surrounding whitespace and its
    first character is upper-cased; the rest of the name is left untouched.
    Names that are empty after stripping (e.g. ``{{ |x}}``) are skipped.
    Duplicates are kept: one entry is returned per invocation.

    Args:
        page_text: Wikitext of a single page.
        pattern: Optional compiled regex whose ``findall`` returns the raw
            names.  Defaults to the module-level ``template_re``.

    Returns:
        A list of normalized template names (possibly empty).
    """
    if pattern is None:
        pattern = template_re
    templates = []
    for line in page_text.split('\n'):
        for raw_name in pattern.findall(line):
            name = raw_name.strip()
            if not name:
                # Nothing left to capitalize; an explicit check replaces
                # relying on IndexError from name[0].
                continue
            templates.append(name[0].upper() + name[1:])
    return templates
_TEXT_OPEN = '<text xml:space="preserve">'
_TEXT_CLOSE = '</text>'


def _log_templates(page_text):
    """Write one log line per template invocation found in page_text."""
    for template in extract_templates(page_text):
        log.write(template + '\n')


# Line-oriented scan of the dump: remember the namespace announced by each
# <ns> element, collect the contents of the following <text> element, and log
# the templates of pages in namespace 0.
try:
    for line in input_file:
        if loop:
            # Inside a multi-line <text> element: keep everything up to the
            # closing tag.  Lines are not inspected for <ns>/<text> here, so
            # page content can never be mistaken for dump structure.
            body, closing_tag, _ = line.partition(_TEXT_CLOSE)
            text.append(body)
            if closing_tag:
                loop = False
                if namespace_id == 0:
                    _log_templates(''.join(text))
                text = []
            continue

        # Match tags regardless of how deeply the dump indents them.
        stripped = line.lstrip()
        if stripped.startswith('<ns>'):
            isolated_ns = stripped.strip().replace('<ns>', '').replace('</ns>', '')
            namespace_id = int(isolated_ns)
            pages_processed += 1
            if pages_processed % 1000 == 0:
                # Progress indicator on stdout.
                print(pages_processed)
        elif stripped.startswith(_TEXT_OPEN):
            content = stripped[len(_TEXT_OPEN):]
            body, closing_tag, _ = content.partition(_TEXT_CLOSE)
            if closing_tag:
                # The whole page text sits on this single line.
                if namespace_id == 0:
                    _log_templates(body)
                text = []
            else:
                # First line of a multi-line text.  Its trailing newline is
                # kept so that the first two lines of wikitext are not glued
                # together when the pieces are joined.
                text.append(body)
                loop = True
finally:
    # Release both files even if the scan is interrupted by an error.
    input_file.close()
    log.close()