Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F57529986
Ranking of Wikipedia wikis based on coverage
No One
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Authored By
Ladsgroup
Sep 22 2024, 11:32 AM
2024-09-22 11:32:42 (UTC+0)
Size
3 KB
Referenced Files
None
Subscribers
None
Ranking of Wikipedia wikis based on coverage
View Options
import gzip
import json
import sys
from collections import defaultdict
# Language codes of every Wikipedia edition tracked by this ranking.
# A sitelink whose language is not in this set is ignored.
# NOTE(review): snapshot of existing Wikipedias at time of writing —
# new editions must be added here by hand.
all_langs = {
    'en', 'ceb', 'de', 'fr', 'sv', 'nl', 'ru', 'es', 'it', 'arz', 'pl',
    'ja', 'zh', 'uk', 'vi', 'war', 'ar', 'pt', 'fa', 'ca', 'sr', 'id',
    'ko', 'no', 'ce', 'fi', 'tr', 'cs', 'hu', 'tt', 'sh', 'ro',
    'zh-min-nan', 'eu', 'ms', 'eo', 'he', 'hy', 'da', 'bg', 'cy', 'uz',
    'sk', 'simple', 'azb', 'et', 'be', 'kk', 'el', 'min', 'hr', 'lt', 'gl',
    'ur', 'az', 'sl', 'lld', 'ka', 'nn', 'ta', 'th', 'hi', 'bn', 'mk',
    'la', 'zh-yue', 'ast', 'lv', 'af', 'tg', 'my', 'mg', 'sq', 'mr', 'bs',
    'te', 'oc', 'br', 'be-tarask', 'ml', 'nds', 'sw', 'ky', 'ku', 'lmo',
    'jv', 'pnb', 'new', 'vec', 'ht', 'pms', 'ba', 'lb', 'su', 'ga', 'is',
    'szl', 'ckb', 'fy', 'cv', 'pa', 'tl', 'an', 'io', 'wuu', 'diq', 'ha',
    'vo', 'sco', 'yo', 'ne', 'kn', 'gu', 'als', 'ia', 'avk', 'crh', 'bar',
    'scn', 'bpy', 'qu', 'ig', 'mn', 'nv', 'ban', 'xmf', 'si', 'mzn', 'frr',
    'ps', 'tum', 'os', 'or', 'bat-smg', 'sah', 'cdo', 'bcl', 'gd', 'bug',
    'sd', 'yi', 'ilo', 'am', 'li', 'nap', 'gor', 'mai', 'fo', 'hsb',
    'map-bms', 'shn', 'eml', 'ace', 'zh-classical', 'as', 'ie', 'sa', 'wa',
    'hyw', 'sn', 'mhr', 'lij', 'zu', 'hif', 'bjn', 'mrj', 'km', 'sat',
    'mni', 'hak', 'ary', 'roa-tara', 'pam', 'dag', 'rue', 'bh', 'nso',
    'co', 'vls', 'so', 'mi', 'nds-nl', 'myv', 'se', 'sc', 'bo', 'kw',
    'vep', 'rw', 'glk', 'tk', 'kab', 'gan', 'fiu-vro', 'gv', 'zea', 'ab',
    'mt', 'skr', 'ug', 'tly', 'nah', 'frp', 'udm', 'pcd', 'gn', 'smn',
    'kv', 'csb', 'ay', 'nrm', 'ks', 'mdf', 'lez', 'olo', 'kaa', 'mwl',
    'lfn', 'ang', 'stq', 'lo', 'fur', 'rm', 'tw', 'ln', 'pap', 'lad',
    'ext', 'gom', 'tyv', 'koi', 'av', 'dty', 'dsb', 'cbk-zam', 'dv', 'ksh',
    'za', 'lg', 'gag', 'bxr', 'pfl', 'szy', 'blk', 'tay', 'pag', 'pi',
    'haw', 'awa', 'inh', 'krc', 'atj', 'to', 'pdc', 'tcy', 'mnw', 'arc',
    'xh', 'ff', 'shi', 'xal', 'jam', 'kbp', 'wo', 'om', 'ki', 'nia', 'anp',
    'kbd', 'zgh', 'nov', 'nqo', 'bi', 'tpi', 'tet', 'roa-rup', 'jbo', 'tn',
    'fj', 'kg', 'lbe', 'guw', 'ty', 'cu', 'rmy', 'mad', 'trv', 'ami',
    'srn', 'sm', 'alt', 'dga', 'ltg', 'gcr', 'pcm', 'chr', 'ny', 'kcg',
    'gpe', 'st', 'pih', 'got', 'ss', 'gur', 'ee', 'bm', 'ts', 've', 'bbc',
    'chy', 'fon', 'rn', 'ik', 'ady', 'ch', 'fat', 'guc', 'pnt', 'iu',
    'pwn', 'sg', 'din', 'ti', 'kl', 'dz', 'cr',
}
# One score accumulator per weighting scheme, mapping language -> score.
# defaultdict(int) lets the main loop add weights without initialising keys.
rankings = {scheme: defaultdict(int) for scheme in ('x**2', 'x**3')}
class DumpReader(object):
    """Stream entity records from a gzipped Wikidata JSON dump.

    Supports both json-lines dumps and the "one big JSON array" format,
    where every entity line ends with a trailing comma and the stream is
    wrapped in '[' / ']' lines.
    """

    def __init__(self, path):
        # path: filesystem path to a .json.gz dump file.
        self.path = path

    def check_data(self, line):
        """Parse one dump line; return the entity dict, or None to skip it.

        Returns None for lines that are not JSON objects (the '[' / ']'
        wrapper lines of an array-style dump) and for entities whose
        ``type`` is not ``item`` (e.g. properties, redirects).
        """
        # Drop the newline and, if present, the trailing comma that
        # array-style dumps append to every entity line.  Unlike the
        # previous blind ``[:-1]`` this does not corrupt the final
        # entity line (which has no comma) or json-lines records.
        try:
            item = json.loads(line.rstrip().rstrip(','))
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass; anything
            # unparseable here is dump framing, not an entity.
            return None
        # .get() tolerates records without a 'type' key.
        if item.get('type') != 'item':
            return None
        return item

    def read_items(self):
        """Yield every item entity in the dump, tolerating truncation."""
        with gzip.open(self.path, 'rt') as f:
            try:
                for line in f:
                    item = self.check_data(line)
                    if item is None:
                        continue
                    yield item
            except EOFError:
                # Truncated gzip stream: keep whatever was already read.
                pass
# Walk every item in the dump (path given as argv[1]).  For each tracked
# language that is *missing* the item, add a weight that grows with how
# many tracked languages *do* cover it — so a language ranks high when it
# lacks articles that many other Wikipedias have.
dump_reader = DumpReader(sys.argv[1])
for item in dump_reader.read_items():
    # Set gives O(1) membership and set difference below; original list
    # membership was O(n) per tracked language.
    langs = set()
    for wiki_db_name in item.get('sitelinks', {}):
        if not wiki_db_name.endswith('wiki'):
            # Skip sister projects (wikisource, wikibooks, ...).
            continue
        lang = wiki_db_name.split('wiki')[0].replace('_', '-')
        if lang == 'be-x-old':
            # The wiki was renamed; its dbname still uses the old code.
            lang = 'be-tarask'
        if lang in all_langs:
            langs.add(lang)
    if not langs:
        continue
    # Hoist the loop-invariant weights; they depend only on coverage count.
    weight_sq = len(langs) ** 2
    weight_cu = len(langs) ** 3
    for lang in all_langs - langs:
        rankings['x**2'][lang] += weight_sq
        rankings['x**3'][lang] += weight_cu
print(sorted(rankings['x**2'].items(), key=lambda i: i[1]))
print(sorted(rankings['x**3'].items(), key=lambda i: i[1]))
File Metadata
Details
Attached
Mime Type
text/plain; charset=utf-8
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
18863326
Default Alt Text
Ranking of Wikipedia wikis based on coverage (3 KB)
Attached To
Mode
P69383 Ranking of Wikipedia wikis based on coverage
Attached
Detach File
Event Timeline
Log In to Comment