Page MenuHomePhabricator

T292270 gadgets gadgets code code!

Authored By
Aklapper
Oct 12 2022, 4:08 PM
Size
5 KB
Referenced Files
None
Subscribers
None

T292270 gadgets gadgets code code!

# based on https://paws-public.wmflabs.org/paws-public/User:Harej_(WMF)/Lists%20of%20gadgets%20for%20every%20Wikimedia%20wiki.ipynb
# based on https://public.paws.wmcloud.org/User:SSethi_(WMF)/List%20of%20most%20used%20gadgets%20across%20wikimedia%20wikis.ipynb
# if this was my code (I am Andre Klapper, <aklapper@wikimedia.org>) this was licensed under Creative Commons Zero.
import requests
import re
import json
from collections import OrderedDict, Counter
# Download the Site Matrix to get the list of Wikimedia wikis.
sitematrix = 'https://en.wikipedia.org/w/api.php?action=sitematrix&format=json'
# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error directly instead of failing later
# on a missing 'sitematrix' key.
response = requests.get(sitematrix, timeout=30)
response.raise_for_status()
r = response.json()['sitematrix']

# Collect the base URL of every wiki that is not marked as closed.
site_urls = []
for blob in r.values():
    if isinstance(blob, dict):
        # Dict entries keep their wikis in a 'site' list.
        wikis = blob['site']
    elif isinstance(blob, list):
        # "Special" wikis, including Commons and Wikidata, come as a plain list.
        wikis = blob
    else:
        # Values that are neither dict nor list carry no wiki entries.
        continue
    # Exclude closed sites.
    site_urls.extend(wiki['url'] for wiki in wikis if 'closed' not in wiki)
# Download [[MediaWiki:Gadgets-definition]] from each of these wikis.
# gadget_defs maps a wiki base URL to the raw wikitext of its definition page.
api_request_gadgets_definition = ('/w/api.php?action=parse&format=json&prop=wikitext&page=MediaWiki%3AGadgets-definition')
gadget_defs = {}
for url in site_urls:
    definition_page = url + '/wiki/MediaWiki:Gadgets-definition'
    q = url + api_request_gadgets_definition
    try:
        # The timeout prevents a single stalled wiki from blocking the crawl.
        r = requests.get(q, timeout=30).json()
    except (requests.RequestException, ValueError) as err:
        # One unreachable wiki or non-JSON reply must not abort the whole run;
        # report it like a missing page and move on to the next wiki.
        print(' ❎ ' + definition_page + ' (request failed: ' + str(err) + ')')
        continue
    if 'error' not in r:  # Check if [[MediaWiki:Gadgets-definition]] exists on that wiki
        gadget_defs[url] = r['parse']['wikitext']['*']
        print('✅ ' + definition_page)
    else:
        print(' ❎ ' + definition_page)
print('\nGadget definition pages on active wikis: ' + str(len(gadget_defs)) + '\n')
# Data cleanup and checking that the pages listed in each gadget definition exist.
#
# gadget_entries is built as:
#   {canonical name: {definition line: {site URL: {page title: site URL}}}}
# and is finally re-created as an OrderedDict sorted by canonical name.


def _canonical_name(gadget):
    """Return the normalised gadget name used to group definitions across wikis.

    The line is upper-cased, '-', '_' and spaces are removed, and everything
    from the first '|' or '[' onwards is dropped.
    """
    squeezed = gadget.upper().replace('-', '').replace('_', '').replace(' ', '')
    return squeezed.split('|')[0].split('[')[0].strip()


def _gadget_page_titles(gadget):
    """Return the 'MediaWiki:Gadget-...' page titles named in a definition line.

    Asterisks, bracketed option blocks and HTML comments are stripped, the
    remainder is split on '|', and every non-empty piece (including the
    gadget name itself) is turned into a page title.
    """
    pages = re.sub(r'\*+', '', gadget)
    pages = re.sub(r'\[.*?\]', '', pages)
    # Complete comments must be removed before the two half-open forms below,
    # otherwise those would swallow text around a well-formed comment.
    pages = re.sub(r'<!--.*?-->', '', pages)
    pages = re.sub(r'<!--.*$', '', pages)
    pages = re.sub(r'^.*-->', '', pages)
    return ['MediaWiki:Gadget-' + piece.strip().replace(' ', '_')
            for piece in pages.split('|')
            if piece.strip() != '']


gadget_entries = {}
for site, blob in gadget_defs.items():
    for entry in blob.split('\n'):
        # Only bullet lines that contain a '|' are treated as gadget definitions.
        if not (entry.startswith('*') and '|' in entry):
            continue
        gadget = entry.replace('*', '').strip()
        canonical = _canonical_name(gadget)
        # Create the nesting on demand; a repeated definition on the same
        # site starts again from an empty page mapping.
        gadget_entries.setdefault(canonical, {}).setdefault(gadget, {})[site] = {}
        for gadget_page in _gadget_page_titles(gadget):
            print(site + '/wiki/' + gadget_page)
            try:
                # Passing the title via params lets requests URL-encode it, so
                # titles containing '&', '#', '+' or '?' reach the API intact.
                r = requests.get(
                    site + '/w/api.php',
                    params={
                        'action': 'parse',
                        'format': 'json',
                        'prop': 'wikitext',
                        'page': gadget_page,
                    },
                    timeout=30,
                ).json()
            except (requests.RequestException, ValueError) as err:
                # A network failure or non-JSON reply for one page must not
                # throw away the results collected so far.
                print('ERROR: Could not check ' + gadget_page + ' on ' + site + ': ' + str(err))
            else:
                if 'error' in r:
                    if '=' in gadget_page:
                        # Values with an equal sign imply a typo (incorrect ResourceLoader array parameters)
                        print('ERROR: Gadget definition for ' + gadget_page + ' includes an equal sign, potential typo on ' + site)
                    elif gadget_page.endswith(('.js', '.css')):
                        # Only report missing script/style pages; description/translation
                        # pages that don't end in .js or .css are excluded.
                        print('ERROR: Non-existing gadget page on ' + site + '/wiki/MediaWiki:Gadgets-definition : ' + gadget_page)
            # NOTE(review): the source's indentation was lost; this is assumed to
            # run once per listed page regardless of the check result - confirm.
            gadget_entries[canonical][gadget][site][gadget_page] = site
gadget_entries = OrderedDict(sorted(gadget_entries.items()))
print('\nDone')
# Print a heading, one definition line and the total for every canonical
# gadget name whose definitions cover more than ten (definition, wiki) pairs.
for canonical, variants in gadget_entries.items():
    usage_total = sum(len(sites) for sites in variants.values())
    if usage_total <= 10:
        continue
    print('== ' + canonical.lower() + ' ==\n')
    # Only the definition line stored last for this canonical name is shown.
    print(list(variants)[-1])
    print(usage_total)
    # Uncomment to also list every wiki that defines the gadget:
    # for sites in variants.values():
    #     for site in sites:
    #         print(site)

File Metadata

Mime Type
text/plain; charset=utf-8
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
9772037
Default Alt Text
T292270 gadgets gadgets code code! (5 KB)

Event Timeline