Page MenuHomePhabricator
Paste P50425

Simple python script to migrate spam blacklist (plwiki)
ActivePublic

Authored by matmarex on Aug 10 2023, 5:36 PM.
Tags
None
Referenced Files
F37549745: Simple python script to migrate spam blacklist (plwiki)
Aug 17 2023, 5:18 PM
F37549279: Simple python script to migrate spam blacklist (plwiki)
Aug 17 2023, 4:56 PM
F37393333: Simple python script to migrate spam blacklist (plwiki)
Aug 10 2023, 5:36 PM
Subscribers
None
import requests
import json
import re
# Base URL of the wiki whose spam blacklist is being migrated.
domain = 'https://pl.wikipedia.org'

# Seconds to wait for each HTTP request before giving up.
REQUEST_TIMEOUT = 30

# plwiki: alternative syntax for a leading word boundary (instead of \b).
LOOKBEHIND_PREFIX = '(?<![\\w-])'

# A regex metacharacter that is not escaped: either the very first character
# of the pattern or one preceded by something other than a backslash.
UNESCAPED_METACHAR = re.compile(r'(?:^|[^\\])[\$\^\{\[\(\|\)\*\+\?]')

# Any backslash escape other than "\." and "\-", e.g. "\d" or "\b".
NON_LITERAL_ESCAPE = re.compile(r'\\[^\.-]')


def extract_plain_domain(domain_regex):
    """Return the literal domain a blacklist regex stands for, or None.

    Word-boundary wrappers (``\\b...\\b`` or a leading ``(?<![\\w-])``) and a
    single trailing ``/`` are stripped first. The entry is considered a plain
    domain only if what remains is non-empty, contains no ``/``, no unescaped
    regex metacharacters, no square brackets and no backslash escapes other
    than ``\\.`` and ``\\-``. In that case the remainder is returned with all
    backslashes removed; otherwise None is returned and the caller should
    keep the original line in the spam blacklist.
    """
    if domain_regex.startswith('\\b') and domain_regex.endswith('\\b'):
        domain_regex = domain_regex[2:-2]
    # plwiki: \b was not used consistently, so entries without any
    # word-boundary wrapper are migrated as well.
    if domain_regex.startswith(LOOKBEHIND_PREFIX):
        domain_regex = domain_regex[len(LOOKBEHIND_PREFIX):]
    # endswith() instead of indexing [-1]: the string may be empty by now
    # (e.g. the whole entry was just "\b").
    if domain_regex.endswith('/'):
        domain_regex = domain_regex[:-1]
    if not domain_regex:
        # Nothing left to migrate; never emit an entry with an empty domain.
        return None
    if '/' in domain_regex:
        return None
    if UNESCAPED_METACHAR.search(domain_regex):
        return None
    if NON_LITERAL_ESCAPE.search(domain_regex):
        return None
    if '[' in domain_regex or ']' in domain_regex:
        return None
    return domain_regex.replace('\\', '')


def migrate_spam_blacklist(blacklist_text, rev_id, blocked_domains=None):
    """Split spam-blacklist text into regex lines to keep and blocked domains.

    Args:
        blacklist_text: Raw text of the spam blacklist page.
        rev_id: Revision id interpolated into the permalink that is written
            into every migrated entry's notes.
        blocked_domains: Existing list of ``{'domain': ..., 'notes': ...}``
            entries to extend; it is copied, not modified. Defaults to empty.

    Returns:
        A ``(remaining_lines, blocked_domains)`` tuple. ``remaining_lines``
        holds the comment lines and the entries that could not be migrated,
        in their original order (blank lines are dropped). ``blocked_domains``
        is the existing entries followed by the newly migrated ones.
    """
    remaining_lines = []
    migrated = [] if blocked_domains is None else list(blocked_domains)
    migration_note = 'migracja z [[Special:PermaLink/{}|MediaWiki:Spam-blacklist]]'.format(rev_id)
    section_notes = ''

    for line in blacklist_text.split('\n'):
        if line.startswith('#'):
            # plwiki: migrate section comments. The first full-line comment
            # after an empty line (or at the start of the text) becomes the
            # note prefix for every entry until the next empty line.
            if section_notes == '':
                section_notes = line.strip(' #') + ' / '
            remaining_lines.append(line)
            continue

        # plwiki: migrate trailing per-entry comments.
        domain_regex, has_comment, comment = line.partition('#')
        notes = comment.strip() + ' / ' if has_comment else ''
        domain_regex = domain_regex.strip()

        if not domain_regex:
            # plwiki: end of section, clear the section comment.
            section_notes = ''
            continue

        plain_domain = extract_plain_domain(domain_regex)
        if plain_domain is None:
            remaining_lines.append(line)
            continue

        migrated.append(
            {'domain': plain_domain, 'notes': section_notes + notes + migration_note}
        )

    return remaining_lines, migrated


def main():
    """Fetch the plwiki lists, run the migration and print both results."""
    # NOTE(review): the blacklist text is pinned to oldid=71023370, but the
    # permalink written into the notes uses the latest revision id returned
    # by the API below. If the page was edited after that revision the two
    # differ -- confirm which revision the notes are meant to reference.
    blacklist_response = requests.get(
        domain + '/wiki/MediaWiki:Spam-blacklist?action=raw&oldid=71023370',
        timeout=REQUEST_TIMEOUT,
    )
    # Fail loudly rather than parsing an error page as blacklist entries.
    blacklist_response.raise_for_status()
    current_spam_blacklist = blacklist_response.text

    blocked_response = requests.get(
        domain + '/wiki/MediaWiki:BlockedExternalDomains.json?action=raw',
        timeout=REQUEST_TIMEOUT,
    )
    if blocked_response.status_code == 404:
        # The JSON page does not exist yet: start from an empty list.
        current_blocked_domains = []
    else:
        blocked_response.raise_for_status()
        current_blocked_domains = blocked_response.json()

    query_response = requests.get(
        domain + '/w/api.php',
        params={
            'action': 'query',
            'format': 'json',
            'prop': 'revisions',
            'titles': 'MediaWiki:Spam-blacklist',
            'rvslots': 'main',
            # The parameter is "rvprop"; the previously used "rvprops" is not
            # recognised by the API and only worked because the default
            # property set happens to include the revision ids.
            'rvprop': 'ids',
        },
        timeout=REQUEST_TIMEOUT,
    )
    query_response.raise_for_status()
    pages = query_response.json()['query']['pages']
    # "pages" is keyed by page id; a single title was requested, so take
    # the only value.
    spam_blacklist_rev_id = next(iter(pages.values()))['revisions'][0]['revid']

    new_spam_blacklist, current_blocked_domains = migrate_spam_blacklist(
        current_spam_blacklist, spam_blacklist_rev_id, current_blocked_domains
    )

    print('\n'.join(new_spam_blacklist))
    print(json.dumps(current_blocked_domains, ensure_ascii=False, indent='\t'))


if __name__ == '__main__':
    main()