#!/usr/bin/env python3
"""
Generate torrents for Wikimedia dumps
Copyright (C) 2017 Kunal Mehta

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
import os
import subprocess
import sys

# Pending resolution of https://phabricator.wikimedia.org/T155470
LABS_MKTORRENT = '/data/project/dump-torrents/mktorrent-1.0/mktorrent'
if os.path.isfile(LABS_MKTORRENT):
    MKTORRENT = LABS_MKTORRENT
else:
    MKTORRENT = '/usr/bin/mktorrent'

DUMP_DIR = '/public/dumps/public/'
OUR_DIR = '/data/project/dump-torrents/public_html/'
TRACKERS = [
    'udp://tracker.opentrackr.org:1337',
    'udp://tracker.openbittorrent.com:80/announce',
    'udp://tracker.coppersurfer.tk:6969/announce',
]
MIRRORS = [
    'https://dumps.wikimedia.org',
    'http://dumps.wikimedia.your.org',
    'http://ftp.acc.umu.se/mirror/wikimedia.org/dumps',
    'http://wikipedia.c3sl.ufpr.br'
]


def get_all_wikis():
    """Yield the dbname of every wiki that has a directory under DUMP_DIR, sorted."""
    for fname in sorted(os.listdir(DUMP_DIR)):
        if os.path.isdir(os.path.join(DUMP_DIR, fname)):
            yield fname


def process_wiki(dbname):
    """Create torrents for every dated dump run of one wiki.

    The 'latest' symlink-style directory is skipped.
    """
    for date in os.listdir(os.path.join(DUMP_DIR, dbname)):
        if date == 'latest':
            # Broken...?
            continue
        process_wikidate(dbname, date)


class DumpFile:
    """One dump file, identified by (dbname, date, fname).

    Knows where the source file lives, where its .torrent should be
    written, and which webseed URLs to advertise for it.
    """

    def __init__(self, dbname, date, fname):
        self.dbname = dbname
        self.date = date
        self.fname = fname

    @property
    def dest(self):
        """Path of the .torrent file to create under OUR_DIR."""
        return os.path.join(OUR_DIR, self.dbname, self.date, self.fname + '.torrent')

    @property
    def source(self):
        """Path of the original dump file under DUMP_DIR."""
        return os.path.join(DUMP_DIR, self.dbname, self.date, self.fname)

    @property
    def webseeds(self):
        """Yield one webseed URL per mirror, then an archive.org URL."""
        for mirror in MIRRORS:
            yield '{mirror}/{dbname}/{date}/{fname}'.format(
                mirror=mirror, dbname=self.dbname, date=self.date,
                fname=self.fname
            )
        # archive.org has a different URL format
        yield 'https://archive.org/download/{dbname}-{date}/{fname}'.format(
            dbname=self.dbname, date=self.date, fname=self.fname
        )


def mktorrent(dbname, date, fname):
    """Invoke mktorrent for one dump file.

    Skips files whose .torrent already exists; a zero-byte .torrent is
    treated as the leftover of an interrupted run and deleted first.
    Prints mktorrent's output on success; raises CalledProcessError if
    mktorrent exits non-zero.
    """
    dfile = DumpFile(dbname, date, fname)
    if os.path.exists(dfile.dest):
        if os.path.getsize(dfile.dest) == 0:
            # Got interrupted somehow and mktorrent didn't write the file,
            # so delete it
            os.unlink(dfile.dest)
        else:
            print('Skipping creation of ' + fname)
            return
    args = [MKTORRENT]
    for tracker in TRACKERS:
        args.append('-a')
        args.append(tracker)
    for webseed in dfile.webseeds:
        args.append('-w')
        args.append(webseed)
    args.append('-o')
    args.append(dfile.dest)
    args.append('-l')  # piece length
    args.append('20')  # 2^20 bytes
    args.append(dfile.source)
    try:
        os.makedirs(os.path.join(OUR_DIR, dbname, date), exist_ok=True)
    except OSError:
        # Before Python 3.4.1, if exist_ok was True and the directory existed,
        # makedirs() would still raise an error if mode did not match the mode
        # of the existing directory.
        pass
    print(subprocess.check_output(args).decode())


def process_wikidate(dbname, date):
    """Torrent every compressed file of one dump run and mirror its index page."""
    path = os.path.join(DUMP_DIR, dbname, date)
    files = []
    for file in os.listdir(path):
        if file.endswith(('.gz', '.bz2', '.7z')):
            files.append(file)
            mktorrent(dbname, date, file)
    with open(os.path.join(DUMP_DIR, dbname, date, 'index.html')) as f:
        html = f.read()
    for file in files:
        html = html.replace(file, file + '.torrent')
    # Be terrible and fix URL paths
    # NOTE(review): the original replace() arguments and the write-back below
    # were lost when this source was mangled (every <...> span was stripped).
    # This reconstruction rewrites relative links to absolute URLs on the
    # canonical dump host and writes the edited index next to the torrents —
    # confirm both against version control before relying on them.
    html = html.replace(
        '<a href="',
        '<a href="https://dumps.wikimedia.org/{}/{}/'.format(dbname, date)
    )
    with open(os.path.join(OUR_DIR, dbname, date, 'index.html'), 'w') as f:
        f.write(html)


if __name__ == '__main__':
    # NOTE(review): only the fragment "> 2" of this condition survived the
    # mangling, but sys.argv[1] is consumed below, so a single-wiki invocation
    # (len(sys.argv) == 2) must take this branch; "> 1" is the condition under
    # which the visible code is reachable.
    if len(sys.argv) > 1:
        process_wiki(sys.argv[1])
    else:
        for wiki in get_all_wikis():
            process_wiki(wiki)