Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F37549279
Simple python script to migrate spam blacklist (plwiki)
No One
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Authored By
matmarex
Aug 17 2023, 4:56 PM
2023-08-17 16:56:06 (UTC+0)
Size
2 KB
Referenced Files
None
Subscribers
None
Simple python script to migrate spam blacklist (plwiki)
View Options
import requests
import json
import re
# Base URL of the wiki whose spam blacklist is being migrated.
domain = 'https://pl.wikipedia.org'

# Seconds to wait for each HTTP request before giving up.
REQUEST_TIMEOUT = 30

# An unescaped regex metacharacter. The lookbehind (rather than a consumed
# "[^\\]" character) also catches a metacharacter at the very start of an entry.
_UNESCAPED_META_RE = re.compile(r'(?<!\\)[.$^{\[(|)*+?]')

# A backslash that escapes something other than a literal dot (\d, \w, \\, ...)
# or that dangles at the end of the entry.
_NON_DOT_ESCAPE_RE = re.compile(r'\\(?:[^.]|$)')


def fetch_spam_blacklist():
    """Return the raw wikitext of MediaWiki:Spam-blacklist.

    Raises:
        requests.HTTPError: if the wiki answers with an error status, so an
            error page is never mistaken for the blacklist itself.
    """
    response = requests.get(
        domain + '/wiki/MediaWiki:Spam-blacklist?action=raw',
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    return response.text


def fetch_blocked_domains():
    """Return the current entries of MediaWiki:BlockedExternalDomains.json.

    A 404 means the page does not exist yet and yields an empty list; any
    other error status raises requests.HTTPError.
    """
    response = requests.get(
        domain + '/wiki/MediaWiki:BlockedExternalDomains.json?action=raw',
        timeout=REQUEST_TIMEOUT,
    )
    if response.status_code == 404:
        return []
    response.raise_for_status()
    return response.json()


def fetch_spam_blacklist_rev_id():
    """Return the id of the latest revision of MediaWiki:Spam-blacklist.

    NOTE: this is a separate request from fetch_spam_blacklist(), so if the
    page is edited in between, the id may not match the text that was fetched.
    """
    response = requests.get(
        domain + '/w/api.php',
        params={
            'action': 'query',
            'format': 'json',
            'prop': 'revisions',
            'titles': 'MediaWiki:Spam-blacklist',
            'rvslots': 'main',
            # The API parameter is spelled "rvprop" (not "rvprops").
            'rvprop': 'ids',
        },
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    pages = response.json()['query']['pages']
    # Only one title was requested, so the mapping holds a single page.
    page = next(iter(pages.values()))
    return page['revisions'][0]['revid']


def migrate_spam_blacklist(blacklist_text, blocked_domains, rev_id):
    """Split blacklist lines into plain domains and remaining regex lines.

    Args:
        blacklist_text: raw content of the spam blacklist, one entry per line.
        blocked_domains: existing blocked-domain entries (list of dicts with
            'domain' and 'notes' keys); it is copied, not modified.
        rev_id: revision id used in the permalink placed in each entry's notes.

    Returns:
        A tuple (remaining_lines, new_blocked_domains). remaining_lines holds
        the comment lines and the entries that could not be expressed as a
        plain domain, unchanged. new_blocked_domains is blocked_domains plus
        one entry per migrated line. Blank lines are not carried over.
    """
    remaining_lines = []
    new_blocked_domains = list(blocked_domains)
    section_notes = ''

    for line in blacklist_text.split('\n'):
        if line.startswith('#'):
            # plwiki: Migrate section comments (the first comment line of a
            # section becomes a note prefix for every entry in that section).
            if section_notes == '':
                section_notes = line.strip(' #') + ' / '
            remaining_lines.append(line)
            continue

        # plwiki: Migrate trailing comments on the entry itself.
        domain_regex, separator, notes = line.partition('#')
        notes = notes.strip() + ' / ' if separator else ''
        domain_regex = domain_regex.strip()

        if not domain_regex:
            # plwiki: End of section, clear the section comment.
            section_notes = ''
            continue

        # plwiki: \b was not used consistently, so entries without it are
        # migrated as well; when present on both ends it is simply removed.
        if domain_regex.startswith('\\b') and domain_regex.endswith('\\b'):
            domain_regex = domain_regex[2:-2]

        is_plain_domain = (
            # An entry made only of "\b" markers leaves nothing to migrate.
            domain_regex != ''
            # Entries with a path cannot be expressed as a bare domain.
            and '/' not in domain_regex
            and not _UNESCAPED_META_RE.search(domain_regex)
            and not _NON_DOT_ESCAPE_RE.search(domain_regex)
            and '[' not in domain_regex
            and ']' not in domain_regex
        )
        if not is_plain_domain:
            remaining_lines.append(line)
            continue

        permalink = 'migracja z [[Special:PermaLink/{}|MediaWiki:Spam-blacklist]]'.format(rev_id)
        new_blocked_domains.append({
            # Only "\." escapes can remain at this point; drop the backslashes.
            'domain': domain_regex.replace('\\', ''),
            'notes': section_notes + notes + permalink,
        })

    return remaining_lines, new_blocked_domains


def main():
    """Fetch the current lists, migrate them and print both results."""
    current_spam_blacklist = fetch_spam_blacklist()
    current_blocked_domains = fetch_blocked_domains()
    spam_blacklist_rev_id = fetch_spam_blacklist_rev_id()

    new_spam_blacklist, new_blocked_domains = migrate_spam_blacklist(
        current_spam_blacklist, current_blocked_domains, spam_blacklist_rev_id
    )

    print('\n'.join(new_spam_blacklist))
    print(json.dumps(new_blocked_domains, ensure_ascii=False, indent='\t'))


if __name__ == '__main__':
    main()
File Metadata
Details
Attached
Mime Type
text/plain; charset=utf-8
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
11271848
Default Alt Text
Simple python script to migrate spam blacklist (plwiki) (2 KB)
Attached To
Mode
P50425 Simple python script to migrate spam blacklist (plwiki)
Attached
Detach File
Event Timeline
Log In to Comment