#! /usr/bin/python
# -*- coding: utf-8 -*-
"""Collect blocked IP addresses/ranges from the wiki database replicas.

The script walks the replica hosts s1..s7, reads the block addresses of every
wiki listed in ``meta_p.wiki`` (plus the global blocks in ``centralauth_p``),
normalises each address with :func:`cleanIP` and stores the result, one row
per distinct address, in the ``wikiblocks`` table of ``s53213__wmopbot``.
Progress and timings are written to ``wikiblocks.log``.

NOTE: the script targets Python 2 (it relies on ``str``/``unicode`` mixing in
the driver section); ``print`` is only ever called with a single argument so
the file also parses under Python 3.
"""

import os
import re
import time
from functools import reduce

import oursql

# Dotted quad, optionally with '*' in the last three octets and a /prefix.
_IPV4_RE = re.compile(
    r'(\d{1,3}\.(?:\*|\d{1,3})\.(?:\*|\d{1,3})\.(?:\*|\d{1,3}))(?:/([1-3]?\d))?')
# Colon separated hex groups, optionally '::'-compressed, with a /prefix.
_IPV6_RE = re.compile(
    r'([0-9a-fA-F]{1,4}(?:::?[0-9a-fA-F]{1,4}){1,7}(?:::)?)(?:/(1?\d{1,2}))?')


class dbConn:
    """Connection to a replica host that reconnects when the link is lost.

    Attributes:
        host: host name passed to ``oursql.connect``.
        connection: current ``oursql`` connection (``None`` before connecting).
        cursor: cursor of the current connection.
        count: number of consecutive failed ``execute`` attempts.
        last: ``time.time()`` of the last successful ``execute``.
    """

    def __init__(self, host):
        self.host = host
        self.connection = None
        self.connect()
        self.count = 0

    def connect(self):
        """Open a fresh connection, closing the previous one if there is any."""
        if self.connection is not None:
            try:
                self.connection.close()
            except Exception:
                # Best effort: connect() is called again after a lost
                # connection, so closing the stale handle is allowed to fail.
                pass
        self.connection = oursql.connect(
            host=self.host,
            read_default_file=os.path.expanduser('~/replica.my.cnf'),
            read_timeout=10,
            use_unicode=True,
            autoreconnect=True,
            autoping=True)
        self.cursor = self.connection.cursor()

    def execute(self, sql, params=()):
        """Run *sql* with *params*, reconnecting and retrying on failure.

        After the first failure the query is retried immediately; further
        retries wait 10 seconds each.

        Raises:
            Exception: when the query still fails after the retry counter
                has exceeded 3.
        """
        while True:
            try:
                self.cursor.execute(sql, params)
            except oursql.OperationalError:
                # probably connection lost
                if self.count > 3:
                    raise Exception('Connection lost more than 3 times')
                print('Lost connection %s %s' % (sql, params))
                if self.count > 0:
                    time.sleep(10)
                self.count += 1
                self.connect()
            else:
                self.last = time.time()
                self.count = 0
                return

    def fetchall(self):
        """Return all rows of the last executed query."""
        return self.cursor.fetchall()


def cleanIP(ip):
    """Clean IP range, e.g. '1.2.3.4/20' => '1.2.0.0/20'.

    The host bits below the prefix length are zeroed.  A full-length prefix
    (/32 for IPv4, /128 for IPv6) is dropped from the result, and trailing
    all-zero IPv6 groups are collapsed to '::'.

    Args:
        ip: textual IPv4/IPv6 address, optionally followed by '/prefix'.

    Returns:
        The normalised address string, or ``False`` when *ip* is not
        recognised (including wildcard octets, octets above 255 and IPv6
        addresses that do not expand to exactly 8 groups).
    """
    ipv4 = _IPV4_RE.match(ip)
    if ipv4:
        fields = ipv4.group(1).split('.')
        if '*' in fields:
            # The pattern admits wildcards but they cannot be converted to a
            # number; report them as unrecognised instead of raising.
            return False
        octets = [int(o) for o in fields]
        if max(octets) > 255:
            return False
        # Test against None so that an explicit '/0' is not mistaken for a
        # missing prefix.
        prefix = ipv4.group(2)
        cdir = 32 if prefix is None else min(int(prefix), 32)
        shift = 32 - cdir
        num = reduce(lambda a, b: a << 8 | b, octets) >> shift << shift
        dotted = '.'.join(str(num >> 24 - i * 8 & 255) for i in range(4))
        return dotted + ('/%d' % cdir if cdir < 32 else '')

    ipv6 = _IPV6_RE.match(ip)
    if not ipv6:
        return False
    # Split on '::' first; an empty side of the '::' counts as one zero group.
    parts = [[g or '0' for g in part.split(':')]
             for part in ipv6.group(1).split('::')]
    groups = parts[0]
    if len(parts) == 2:
        # Expand the '::' to as many zero groups as are needed to reach 8.
        padding = ['0'] * (8 - len(parts[0]) - len(parts[1]))
        groups = parts[0] + padding + parts[1]
    if len(groups) != 8:
        # Too few/many groups or more than one '::' -- not a usable address.
        return False
    prefix = ipv6.group(2)
    cdir = 128 if prefix is None else min(int(prefix), 128)
    shift = 128 - cdir
    num = reduce(lambda a, b: a << 16 | b,
                 [int(g, 16) for g in groups]) >> shift << shift
    text = ':'.join('%x' % (num >> 112 - i * 16 & 65535) for i in range(8))
    text = re.sub(':0(?::0)*$', '::', text, count=1)
    return text + ('/%d' % cdir if cdir < 128 else '')


def loadBlocks(db):
    """Read the block addresses of database *db* into ``blockWiki``.

    Uses the module-level replica connection ``c``, appends the wiki name
    (``'global'`` for ``centralauth_p``, otherwise *db* without its ``_p``
    suffix) to ``blockWiki[address]`` for every recognised address and writes
    a summary line to the module-level ``log`` file.

    Args:
        db: replica database name, e.g. ``'enwiki_p'``.
    """
    if db == 'ruwiki_p':
        # ruwiki has 1300k+ IP blocks; skip it so that the database
        # connection is not lost
        log.write('\tjumping\n')
        return
    start = time.time()
    if db == 'centralauth_p':
        wiki = 'global'
        query = "SELECT gb_address FROM %s.globalblocks"
    else:
        wiki = db[:-2]
        if db in ('jawiki_p', 'nlwiki_p'):
            # too many /16 blocks
            query = "SELECT ipb_address FROM %s.ipblocks WHERE ipb_user = 0 AND ipb_address NOT LIKE '%%/16'"
        else:
            query = "SELECT ipb_address FROM %s.ipblocks WHERE ipb_user = 0"
    c.execute(query % db)
    # Rows with an empty/NULL address are dropped here.
    ips = [row[0] for row in c.fetchall() if row and row[0]]
    b = 0
    for address in ips:
        ip = cleanIP(address)
        if not ip:
            # Values that are not a recognisable IP or range are skipped.
            continue
        blockWiki.setdefault(ip, []).append(wiki)
        b += 1
    log.write('\t%d blocks\t%.2f sec\n' % (b, time.time() - start))


# The ``with`` block guarantees the log is flushed and closed even when a
# query fails half-way through.
with open('wikiblocks.log', 'w') as log:
    log.write(time.strftime('%Y-%m-%d %H:%M:%S\n'))
    print(time.strftime('%Y-%m-%d %H:%M:%S'))
    start = time.time()
    blockWiki = {}

    # Load blocks data from db replicas
    dbs = None  # databases still to be read; filled from the first host
    for s in ('s1', 's2', 's3', 's4', 's5', 's6', 's7'):
        c = dbConn(s + '.labsdb')
        if dbs is None:
            c.execute('SELECT dbname FROM meta_p.wiki')
            dbs = [db[0].encode('utf-8') + '_p' for db in c.fetchall()] + ['centralauth_p']
        c.execute('SHOW DATABASES')
        sdbs = [db[0] for db in c.fetchall()]
        log.write(s + '\n')
        for db in sdbs:
            if db in dbs:
                log.write(db)
                loadBlocks(db)
                dbs.remove(db)
        c.connection.close()
        if not dbs:
            break

    # Save data in local db
    start2 = time.time()
    connection = oursql.connect(
        db='s53213__wmopbot',
        host='tools.labsdb',
        read_default_file=os.path.expanduser('~/replica.my.cnf'),
        read_timeout=10,
        use_unicode=True,
        autoreconnect=True,
        autoping=True)
    c = connection.cursor()
    blocks = sorted(blockWiki)
    log.write('Total\t%d distinct blocks' % len(blocks))
    c.execute('ALTER TABLE wikiblocks DISABLE KEYS')
    c.execute('TRUNCATE wikiblocks')
    # Insert in batches of 10000 rows: (address, space separated wiki names).
    for n in range(0, len(blocks), 10000):
        batch = blocks[n:n + 10000]
        values = []
        for ip in batch:
            values.append(ip)
            values.append(' '.join(blockWiki[ip]))
        c.execute('INSERT INTO wikiblocks VALUES ' + ','.join(['(?, ?)'] * len(batch)),
                  tuple(values))
    c.execute('ALTER TABLE wikiblocks ENABLE KEYS')
    connection.close()
    log.write('\tsaved in %.2f sec\n' % (time.time() - start2))
    t = time.time() - start
    log.write(('%d min ' % (t / 60) if t >= 60 else '') + '%d sec' % (t % 60))