# based on https://paws-public.wmflabs.org/paws-public/User:Harej_(WMF)/Lists%20of%20gadgets%20for%20every%20Wikimedia%20wiki.ipynb
# based on https://public.paws.wmcloud.org/User:SSethi_(WMF)/List%20of%20most%20used%20gadgets%20across%20wikimedia%20wikis.ipynb
# if this was my code (I am Andre Klapper, ) this was licensed under Creative Commons Zero.
#
# What this script does, in order:
#   1. Downloads the site matrix and collects the URL of every non-closed wiki.
#   2. Downloads [[MediaWiki:Gadgets-definition]] from each of those wikis.
#   3. Parses every gadget definition line, groups the lines under a
#      normalised ("canonical") gadget name, and checks that each page the
#      line refers to can be retrieved.
#   4. Prints every canonical gadget name that is defined on more than 10
#      wikis, together with that count.

import json
import re
from collections import OrderedDict, Counter

import requests

# Seconds to wait for a single HTTP response. Without a timeout one
# unresponsive wiki would stall the whole run indefinitely.
REQUEST_TIMEOUT = 30


def _fetch_json(url, params=None):
    """Return the decoded JSON body of a GET request, or None on failure.

    `params` is an optional dict of query parameters; requests URL-encodes
    them. Connection problems, timeouts and bodies that are not valid JSON
    are reported on stdout and turned into None so that one broken wiki does
    not abort the whole crawl.
    """
    try:
        return requests.get(url, params=params, timeout=REQUEST_TIMEOUT).json()
    except (requests.RequestException, ValueError) as err:
        print('WARNING: Could not fetch ' + url + ' (' + str(err) + ')')
        return None


def _canonical_name(gadget):
    """Return the normalised gadget name used to group definition lines.

    The line is upper-cased, hyphens, underscores and spaces are removed, and
    only the text before the first '|' and before the first '[' is kept.
    """
    squeezed = gadget.upper().replace('-', '').replace('_', '').replace(' ', '')
    return squeezed.split('|')[0].split('[')[0].strip()


def _gadget_page_titles(gadget):
    """Return the 'MediaWiki:Gadget-...' page titles named by a definition line.

    Bracketed option lists are removed, the remainder is split on '|', and
    every non-blank segment becomes one title (spaces turned into
    underscores).
    """
    pages = re.sub(r'\*+', '', gadget)
    pages = re.sub(r'\[.*?\]', '', pages)
    # NOTE(review): both patterns below are empty, so these two substitutions
    # currently change nothing. Their trailing comments suggest they once
    # removed some markup in a specific order; the pattern text was presumably
    # lost. TODO: restore the intended patterns.
    pages = re.sub(r'', '', pages)  # Must be first
    pages = re.sub(r'', '', pages)  # Must be after
    return ['MediaWiki:Gadget-' + name.strip().replace(' ', '_')
            for name in pages.split('|')
            if name.strip() != '']


# Download the Site Matrix to get list of Wikimedia wikis
# (failures here are not caught: without the site list nothing else can run)
sitematrix = 'https://en.wikipedia.org/w/api.php?action=sitematrix&format=json'
r = requests.get(sitematrix, timeout=REQUEST_TIMEOUT)
r = r.json()
r = r['sitematrix']
site_urls = []
for blob in r.values():
    if isinstance(blob, dict):
        subblobs = blob['site']
    elif isinstance(blob, list):
        # "Special" wikis, including Commons and Wikidata, are different for some reason
        subblobs = blob
    else:
        # Values that are neither dict nor list carry no sites; skip them.
        continue
    # Exclude closed sites
    site_urls.extend(subblob['url'] for subblob in subblobs
                     if 'closed' not in subblob)

# Download [[MediaWiki:Gadgets-definition]] from each of these wikis
api_request_gadgets_definition = ('/w/api.php?action=parse&format=json&prop=wikitext&page=MediaWiki%3AGadgets-definition')
gadget_defs = {}
for url in site_urls:
    q = url + api_request_gadgets_definition
    r = _fetch_json(q)
    if r is None:
        # Request failed; already reported by _fetch_json.
        continue
    if 'error' not in r:  # Check if [[MediaWiki:Gadgets-definition]] exists on that wiki
        gadget_defs[url] = r['parse']['wikitext']['*']
        print('✅ ' + url + '/wiki/MediaWiki:Gadgets-definition')
    else:
        print(' ❎ ' + url + '/wiki/MediaWiki:Gadgets-definition')

print('\nGadget definition pages on active wikis: ' + str(len(gadget_defs)) + '\n')

# Data cleanup and retrieving editor information
# Resulting shape: gadget_entries[canonical][definition line][site][page title] = site
gadget_entries = {}
for site, blob in gadget_defs.items():
    # print(site)
    for entry in blob.split('\n'):
        # Only lines that start with '*' and contain '|' are gadget definitions.
        if not (entry.startswith('*') and '|' in entry):
            continue
        gadget = entry.replace('*', '').strip()
        canonical = _canonical_name(gadget)
        gadget_entries.setdefault(canonical, {}).setdefault(gadget, {})[site] = {}

        for gadget_page in _gadget_page_titles(gadget):
            print(site + '/wiki/' + gadget_page)
            # The title goes through `params` so that characters such as
            # '&', '#', '+' or '=' are URL-encoded instead of corrupting the
            # query string.
            r = _fetch_json(site + '/w/api.php',
                            params={'action': 'parse',
                                    'format': 'json',
                                    'prop': 'wikitext',
                                    'page': gadget_page})
            if r is not None and 'error' in r:
                if '=' in gadget_page:
                    # Check for values with an equal sign as they imply a typo (incorrect ResourceLoader array parameters)
                    print('ERROR: Gadget definition for ' + gadget_page + ' includes an equal sign, potential typo on ' + site)
                elif gadget_page.endswith(('.js', '.css')):
                    # Check if listed pages actually exist but exclude description/translation page URLs that don't end in .js or .css
                    print('ERROR: Non-existing gadget page on ' + site + '/wiki/MediaWiki:Gadgets-definition : ' + gadget_page)
            gadget_entries[canonical][gadget][site][gadget_page] = site

gadget_entries = OrderedDict(sorted(gadget_entries.items()))

print('\nDone')

# Report every canonical gadget name that is defined on more than 10 wikis.
for canonical, gadgetblob in gadget_entries.items():
    counter = 0
    for gadget, siteblob in gadgetblob.items():
        # One entry per wiki that carries this exact definition line.
        counter += len(siteblob)
    if counter > 10:
        print('== ' + canonical.lower() + ' ==\n')
        # `gadget` is the last definition line seen for this canonical name.
        print(gadget)
        print(counter)
#        for gadget, siteblob in gadgetblob.items():
#            for site in siteblob.keys():
#                print(site)