Page MenuHomePhabricator
Paste P5977

Dirty one-off script for cleaning the Georgian list of keywords
ActivePublic

Authored by Lokal_Profil on Sep 8 2017, 8:53 AM.
from collections import OrderedDict
import pywikibot as pwb
# Fetch the current wikitext of the Georgian WLM type-mapping table.
site = pwb.Site('wikidata', 'wikidata')
page = pwb.Page(site, 'Wikidata:WikiProject WLM/Mapping tables/ge (ka)/types')
contents = page.get()

# Split the page into three pieces:
#   header - everything before the first row marker '|-'
#   rest   - the table rows (between the first '|-' and the last '|}')
#   footer - the table end marker '|}' and everything after it
# A missing marker would leave `rest` empty and put the rows in the wrong
# piece, so fail early with a clear message instead.
header, sep, rest = contents.partition('|-')
if not sep:
    raise ValueError("no '|-' row marker found in the page text")
rest, sep, footer = rest.rpartition('|}')
if not sep:
    raise ValueError("no '|}' table end marker found in the page text")
footer = '|}' + footer
# Marker found among the types; per clean_type() it describes heritage
# status rather than a type and is therefore stripped from type lists.
NATIONAL_IMPORTANCE_STR = "ეროვნული"


def clean_type(text):
    """
    Return a cleaned, normalised version of a raw type string.

    Multiple types may exist either separated by "<br />" (also accepted
    as "<br/>" or "<br>") or ",".
    Types may include NATIONAL_IMPORTANCE_STR which should be used only
    for heritage status, so every occurrence of it is dropped.

    @param text: raw contents of a type cell
    @return: the lower-cased, stripped, non-empty types joined by ", "
    """
    raw_type = text.lower()
    # Normalise every line-break spelling to the comma separator.
    for line_break in ("<br />", "<br/>", "<br>"):
        raw_type = raw_type.replace(line_break, ",")
    types = (typ.strip() for typ in raw_type.split(','))
    # Drop empty entries and *all* national-importance markers, not just
    # the first one.
    return ', '.join(
        typ for typ in types if typ and typ != NATIONAL_IMPORTANCE_STR)
# Every table row starts with '|-'; `rest` holds the text between the first
# row marker and the closing '|}'.
entries = rest.split('|-')

# Merge rows whose cleaned type name is identical:
#   num  - sum of the rows' counts
#   qid  - first non-empty value seen
#   com  - first non-empty value seen
#   orig - the original (uncleaned) names that were merged into this entry
d = {}
for entry in entries:
    if not entry.strip():
        # e.g. a dangling '|-' directly before the end of the table
        continue
    # Cells are expected one per line: parts[0] is whatever follows '|-',
    # parts[1:5] are the name, count, qid and comment cells.
    parts = entry.split('\n|')
    if len(parts) < 5:
        print('skipping malformed row: {!r}'.format(entry))
        continue
    orig_name = parts[1].strip()
    name = clean_type(orig_name)
    num = parts[2].strip() or "0"  # an empty count cell counts as zero
    qid = parts[3].strip()
    com = parts[4].strip()
    if name not in d:
        d[name] = {'num': 0, 'qid': '', 'com': '', 'orig': []}
    merged = d[name]
    merged['num'] += int(num)  # a non-numeric count raises ValueError
    # Only the first non-empty qid/comment is kept, so report any later
    # value that differs and would otherwise be dropped silently.
    # Identical values are not a conflict.
    if qid and merged['qid'] and qid != merged['qid']:
        print('doh qid: {} {}'.format(qid, merged['qid']))
    if com and merged['com'] and com != merged['com']:
        print('doh com: {} {}'.format(com, merged['com']))
    merged['qid'] = merged['qid'] or qid
    merged['com'] = merged['com'] or com
    merged['orig'].append(orig_name)

if not d:
    # Writing header + footer without any rows would produce a broken page.
    raise ValueError('no table rows could be parsed from the page text')

# Most frequent types first; ties keep their first-seen order.
od = OrderedDict(sorted(d.items(), key=lambda t: t[1]['num'], reverse=True))
txt = ''.join(
    '|- \n| {}\n| {}\n| {}\n| {}\n'.format(k, v['num'], v['qid'], v['com'])
    for k, v in od.items())

# Write the rebuilt page to a local file.
page_text = header + txt + footer
with open('tmp.wiki', 'w', encoding='utf-8') as f:
    f.write(page_text)

Event Timeline

Lokal_Profil updated the paste's language from autodetect to python.
Lokal_Profil edited the content of this paste. (Show Details)