Page MenuHomePhabricator
Paste P13068

(An Untitled Masterwork)
ActivePublic

Authored by RLazarus on Oct 26 2020, 1:50 PM.
Tags
None
Referenced Files
F32414686: raw-paste-data.txt
Oct 26 2020, 1:50 PM
Subscribers
None
rzl@cumin1001:~$ cat pool_services.py
import datetime
import json
import logging
import pprint
import subprocess
import sys
import time
from typing import Dict
# DC_TO is the datacenter the services get pooled in; DC_FROM is the datacenter in which every
# service must already be pooled before the script will proceed.
DC_TO = 'eqiad'
DC_FROM = 'codfw'
# NOTE(review): the interpreter session below appears to be how SERVICES was derived.
# /srv/deployment/spicerack$ python3
# >>> from cookbooks.sre.switchdc import services
# >>> set(services.load_services().keys()) - set(services.EXCLUDED_SERVICES) - set(services.MEDIAWIKI_SERVICES)
SERVICES = ['apertium', 'api-gateway', 'citoid', 'cxserver', 'echostore', 'eventgate-analytics',
            'eventgate-analytics-external', 'eventgate-logging-external', 'eventgate-main',
            'eventstreams', 'graphoid', 'kartotherian', 'mathoid', 'mobileapps', 'ores', 'parsoid',
            'proton', 'push-notifications', 'recommendation-api', 'restbase', 'restbase-async',
            'schema', 'search', 'sessionstore', 'termbox', 'wdqs', 'wdqs-internal', 'wikifeeds',
            'zotero']
# Regular expression matching any one of SERVICES, used as the value of a confctl `dnsdisc=`
# selector. The alternation is wrapped in a group so that it stays a single unit if the consumer
# anchors or extends the pattern: a bare `a|b|c` wrapped as `^a|b|c$` anchors only the first and
# last alternatives, so e.g. 'search' would also match names that merely start with 'search'.
# Unanchored, the grouped form matches exactly what the bare form did.
# NOTE(review): confirm how confctl anchors selector expressions.
SERVICE_RE = '({})'.format('|'.join(SERVICES))
# Timestamped, level-tagged log lines at INFO and above.
logging.basicConfig(format='[%(levelname)s %(asctime)s] %(message)s', level=logging.INFO)
def check_preconditions() -> bool:
    """Check that every service in SERVICES is currently pooled in DC_FROM.

    DC_TO status is deliberately not checked, because repooling is idempotent.

    Runs `confctl ... get` on the discovery objects selected by SERVICE_RE and parses its stdout
    as one JSON object per line. Each object is read as holding a 'tags' string (the text after
    its first '=' is taken as the service name) plus one other key, the datacenter name, whose
    value carries a 'pooled' flag.

    Returns:
        True if at least one precondition failed (every failure is logged), False otherwise.

    Raises:
        subprocess.CalledProcessError: if confctl exits with a non-zero status.
    """
    out = subprocess.run(
        ['/usr/bin/confctl', '--object-type', 'discovery', 'select', f'dnsdisc={SERVICE_RE}',
         'get'], check=True, capture_output=True, text=True).stdout
    pooled: Dict[str, Dict[str, bool]] = {}  # e.g. pooled['swift']['eqiad'] = False (depooled)
    for line in out.splitlines():
        if not line.strip():
            # Tolerate blank lines in the output rather than crashing in json.loads.
            continue
        data = json.loads(line)
        # The only non-'tags' key is the datacenter name.
        dc = next(key for key in data if key != 'tags')
        service = data['tags'].split('=', 1)[1]
        is_pooled = data[dc]['pooled']
        pooled.setdefault(service, {})[dc] = is_pooled
    logging.info(f'Starting state:\n{pprint.pformat(pooled)}')
    errors = False
    for service in SERVICES:
        if service not in pooled:
            logging.error(f'{service} is not in conftool data.')
            errors = True
            continue
        if DC_FROM not in pooled[service]:
            # The service exists but has no record for DC_FROM: report it as a failed
            # precondition instead of aborting the whole check with a KeyError.
            logging.error(f'{service} has no {DC_FROM} entry in conftool data.')
            errors = True
            continue
        if not pooled[service][DC_FROM]:
            logging.error(f'{service} is not pooled in {DC_FROM}.')
            errors = True
    return errors
def set_ttls(seconds: int):
    """Set the TTL to `seconds` on the discovery objects selected by SERVICE_RE in DC_TO.

    Raises subprocess.CalledProcessError if confctl exits with a non-zero status.
    """
    logging.info(f'Setting TTLs to {seconds} seconds...')
    selector = f'dnsdisc={SERVICE_RE},name={DC_TO}'
    action = f'set/ttl={seconds}'
    command = ['/usr/bin/confctl', '--object-type', 'discovery', 'select', selector, action]
    subprocess.run(command, check=True)
def pool_services():
    """Pool each service in SERVICES in DC_TO, one at a time, pausing 180 seconds in between.

    Raises subprocess.CalledProcessError as soon as one confctl call exits non-zero; services
    later in the list are then left untouched.
    """
    needs_pause = False  # Stays False only for the first service: nothing to wait for yet.
    for name in SERVICES:
        if needs_pause:
            logging.info(f'Pausing 3m before pooling {name}...')
            time.sleep(180)
        needs_pause = True
        logging.info(f'Pooling {name}...')
        command = [
            '/usr/bin/confctl', '--object-type', 'discovery', 'select',
            f'dnsdisc={name},name={DC_TO}', 'set/pooled=true',
        ]
        subprocess.run(command, check=True)
def main() -> int:
    """Run the repooling procedure and return a process exit status.

    Returns 1 without changing anything if the preconditions fail, and 0 once every service has
    been pooled and the TTLs have been set back to 300 seconds.
    """
    if check_preconditions():
        return 1

    lowered_ttl = 10
    restored_ttl = 300
    ttl_expiry_wait = 300

    # Lowering the TTLs isn't strictly required for a repool, but it lets us revert faster if
    # something goes wrong and gives sharper troubleshooting data in graphs.
    set_ttls(lowered_ttl)
    logging.info('Waiting 5m for original TTL to expire...')
    time.sleep(ttl_expiry_wait)

    # NOTE(review): if pool_services() raises, the TTLs stay at the lowered value; that may be
    # intentional (quicker revert) -- confirm before wrapping this in try/finally.
    pool_services()

    # Put the TTLs back.
    set_ttls(restored_ttl)
    logging.info('Done.')
    return 0
if __name__ == '__main__':
    # Script entry point: use main()'s return value as the process exit status.
    exit_status = main()
    sys.exit(exit_status)