Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Paste
P631
(An Untitled Masterwork)
Active
Public
Actions
Authored by
•
MZMcBride
on May 10 2015, 2:16 AM.
Edit Paste
Archive Paste
View Raw File
Subscribe
Mute Notifications
Award Token
Flag For Later
Tags
None
Referenced Files
F163048:
May 10 2015, 2:16 AM
2015-05-10 02:16:46 (UTC+0)
Subscribers
None
#! /usr/bin/env python
# Public domain; MZMcBride; 2015
import
bz2
import
re
# Source: the bzip2-compressed enwiki dump, opened for reading and consumed
# line by line by the main loop.
_dump_directory = '/data/scratch/dumps/enwiki/20150403/'
_dump_filename = 'enwiki-20150403-pages-meta-current.xml.bz2'
input_file = bz2.BZ2File(_dump_directory + _dump_filename, 'r')

# Destination: a text file that receives one template name per line.
_report_filename = 'templates-by-invocations-in-articles-enwiki-20150403.txt'
log = open(_report_filename, 'w')
# Captures a template name: every character after '{{' up to (but not
# including) the first '|' or '}'.
template_re = re.compile(r'{{([^|}]+)')

# Parser state mutated by the main loop:
#   namespace_id    - integer taken from the most recent <ns> line, or None
#   text            - buffered lines of the <text> element being read
#   loop            - True while inside a multi-line <text> element
#   pages_processed - number of <ns> lines seen so far
namespace_id, text, loop, pages_processed = None, [], False, 0
def extract_templates(page_text):
    """Return the template names invoked in page_text, in order of appearance.

    The text is scanned one line at a time with the module-level
    ``template_re`` pattern. Each captured name has surrounding whitespace
    removed and its first character upper-cased; the rest of the name is
    left as written. Repeated invocations are returned repeatedly.

    A capture that is empty after stripping is not returned; it is echoed
    to stdout instead (which prints an empty line).
    """
    found = []
    for row in page_text.split('\n'):
        for captured in template_re.findall(row):
            name = captured.strip()
            if not name:
                # Nothing to capitalise: report it and move on.
                print(name)
                continue
            found.append(name[0].upper() + name[1:])
    return found
# Markers of the dump's page-text element. Tags are matched against the
# left-stripped line so detection does not depend on how many spaces of
# indentation the dump uses.
_NS_OPEN = '<ns>'
_TEXT_OPEN = '<text xml:space="preserve">'
_TEXT_CLOSE = '</text>'


def _log_templates(wikitext):
    """Write every template invoked in wikitext to the log, one per line."""
    for template in extract_templates(wikitext):
        log.write(template + '\n')


# Stream the dump line by line. For each page, remember its namespace from
# the <ns> line, collect the contents of its <text> element, and log the
# templates it invokes when the page is in namespace 0.
try:
    for line in input_file:
        if loop:
            # Inside a multi-line <text> element: buffer lines until the
            # one that carries the closing tag.
            if _TEXT_CLOSE in line:
                loop = False
                text.append(line.replace('</text>\n', ''))
                page_text = ''.join(text)
                if namespace_id == 0:
                    _log_templates(page_text)
                text = []
            else:
                text.append(line)
            continue

        stripped = line.lstrip()

        if stripped.startswith(_NS_OPEN):
            isolated_ns = (stripped.strip()
                           .replace('<ns>', '')
                           .replace('</ns>', ''))
            namespace_id = int(isolated_ns)
            pages_processed += 1
            # Progress indicator on stdout every 1000 pages.
            if pages_processed % 1000 == 0:
                print(pages_processed)
        elif stripped.startswith(_TEXT_OPEN):
            body = stripped[len(_TEXT_OPEN):]
            if _TEXT_CLOSE in body:
                # The whole page text sits on a single line.
                page_text = body.rstrip().replace(_TEXT_CLOSE, '')
                if namespace_id == 0:
                    _log_templates(page_text)
                text = []
            else:
                # First line of a multi-line page. Keep its trailing
                # newline: stripping it would glue this line to the next
                # one and could corrupt a template name that ends here.
                text.append(body)
                loop = True
finally:
    # Release both files even if parsing fails part-way through.
    input_file.close()
    log.close()
Event Timeline
•
MZMcBride
edited the content of this paste.
(Show Details)
May 10 2015, 2:16 AM
2015-05-10 02:16:46 (UTC+0)
•
MZMcBride
updated the paste's language from
autodetect
to
python
.
•
MZMcBride
mentioned this in
T96323: Please make me a list of commonly used templates in Wikipedia articles
.
May 10 2015, 3:54 AM
2015-05-10 03:54:14 (UTC+0)
valhallasw
mentioned this in
T98899: Connect Pastes to their related tasks
.
May 12 2015, 6:04 PM
2015-05-12 18:04:49 (UTC+0)
awight
mentioned this in
T258924: Run script to collect one-time stats about TemplateData
.
Jul 28 2020, 4:32 PM
2020-07-28 16:32:22 (UTC+0)
Log In to Comment