# Page MenuHomePhabricator
# Paste P3855

# etcd_recovery_generator.py
# ActivePublic

# Authored by Joe on Aug 19 2016, 4:11 PM.
# Tags
# Referenced Files
# F23830013: etcd_recovery_generator.py
# Jul 17 2018, 6:06 AM
# F23830005: etcd_recovery_generator.py
# Jul 17 2018, 6:05 AM
# F4377073: etcd_recovery_generator.py
# Aug 19 2016, 4:11 PM
# Subscribers
# None
import json
import logging
from urlparse import urlparse
log = logging.getLogger(__name__)

## Globals

# Known etcd clusters, keyed by cluster name (which is also the DNS domain
# of its members).  Each cluster maps an etcd member name to that member's
# peer URL; the host part of the URL is what the generated ssh commands
# connect to.
etcd_clusters = {
    'eqiad.wmnet': {
        'conf1001': 'http://conf1001.eqiad.wmnet:2380',
        'conf1002': 'http://conf1002.eqiad.wmnet:2380',
        'conf1003': 'http://conf1003.eqiad.wmnet:2380',
    },
    # The codfw members previously pointed at the eqiad.wmnet domain, so
    # every generated command for this cluster targeted the wrong hosts.
    'codfw.wmnet': {
        'conf2001': 'http://conf2001.codfw.wmnet:2380',
        'conf2002': 'http://conf2002.codfw.wmnet:2380',
        'conf2003': 'http://conf2003.codfw.wmnet:2380',
    },
}

# Scratch directory on the master where the backup is copied and from which
# the temporary single-node etcd is started.
recovery = "/tmp/etcd-recovery"
class Generator(object):
    """Generate the shell commands needed to rebuild an etcd cluster.

    Nothing is executed by this class: helper methods return command
    strings, and the step methods print those commands to stdout so that an
    operator can review and run them by hand.

    Attributes:
        cluster_name: key of the cluster in ``etcd_clusters``.
        master: name of the member that is restored from backup first.
        cluster: mapping of member name -> peer URL for this cluster.
        current_cluster: comma-separated ``name=peer_url`` list of the
            members registered so far; used for ``--initial-cluster``.
        etcd_dir: etcd data directory for this cluster on every member.
    """

    def __init__(self, cluster_name, master):
        """Set up a generator for *cluster_name* recovering from *master*.

        Raises:
            KeyError: if the cluster is unknown, or if *master* is not one
                of its members.  Failing here avoids printing a partial
                set of instructions before the bad name is noticed.
        """
        self.cluster_name = cluster_name
        self.master = master
        self.cluster = etcd_clusters[cluster_name]
        if master not in self.cluster:
            raise KeyError(
                "master {!r} is not a member of cluster {!r}".format(
                    master, cluster_name))
        self.current_cluster = ""
        self.etcd_dir = '/var/lib/etcd/etcd-' + self.cluster_name

    def fqdn(self, host):
        """Return the host name found in the peer URL of member *host*."""
        return urlparse(self.cluster[host]).hostname

    def ssh(self, host, cmd):
        """Return *cmd* wrapped in an ssh invocation to member *host*."""
        return "ssh {} {}".format(self.fqdn(host), cmd)

    def adv_client_url(self, host):
        """Return the client URL advertised for member *host*."""
        # NOTE(review): clients are advertised https on port 2379 while
        # etcd itself listens on plain http on 127.0.0.1:2378 -- presumably
        # a TLS-terminating proxy sits in front of etcd; confirm.
        return "https://{}:2379".format(self.fqdn(host))

    def listen_client_url(self):
        """Return the local URL etcd listens on for client requests."""
        return "http://127.0.0.1:2378"

    def stop_etcd_service(self):
        """Print the commands that stop etcd on every cluster member."""
        # Sorted so the generated instructions have a stable order.
        for host in sorted(self.cluster):
            log.info('Stopping etcd on %s', host)
            print(self.ssh(host, 'sudo systemctl stop etcd.service'))

    def launch_temp_etcd(self):
        """Print the commands that start a temporary etcd from backup.

        The commands copy the latest backup into the recovery directory on
        the master, hand it to the etcd user, and start a single-node etcd
        there with ``--force-new-cluster``.
        """
        print(self.ssh(
            self.master,
            'sudo cp -ax /srv/backups/etcd/etcd-{0}-backup {1}'.format(
                self.cluster_name,
                recovery)
        ))
        print(self.ssh(
            self.master,
            'sudo chown -R etcd:etcd ' + recovery
        ))
        data = {
            'name': self.master,
            'data_dir': recovery,
            'listen': self.listen_client_url(),
            'adv': self.adv_client_url(self.master),
            'peer': self.cluster[self.master],
            'args': "--force-new-cluster",
        }
        print(self.ssh(self.master, self._etcd_cmd(data)))

    def _etcd_cmd(self, data):
        """Return the etcd command line built from *data*.

        *data* must provide the keys ``data_dir``, ``name``, ``peer``,
        ``listen``, ``adv`` and ``args``.  The result is a single line
        terminated by a newline.
        """
        template = (
            "sudo -u etcd etcd --data-dir {data_dir} --name {name} "
            "-initial-advertise-peer-urls {peer} "
            "-listen-peer-urls {peer} "
            "-listen-client-urls {listen} "
            "-advertise-client-urls {adv} "
            "{args}\n"
        )
        return template.format(**data)

    def _curl(self, url, method='GET', req=None):
        """Return a curl command against the local etcd v2 API.

        Args:
            url: path below ``/v2`` (with leading slash).
            method: HTTP method passed to ``curl -X``.
            req: optional object sent as a JSON request body.
        """
        client_url = self.listen_client_url() + '/v2' + url
        cmd = "curl {} -L -X {}".format(client_url, method)
        if req is not None:
            cmd += ' -H "Content-Type: application/json" -d \'{}\''.format(
                json.dumps(req))
        return cmd

    def _etcdctl(self, cmd, username=None):
        """Return an etcdctl command line running *cmd* on the local etcd.

        When *username* is given it is passed via ``--username``.
        """
        usr = "--username " + username if username else ""
        return "etcdctl {} --endpoint {} {}".format(
            usr,
            self.listen_client_url(),
            cmd
        )

    def _register_member(self, host):
        """Record *host* in ``current_cluster`` as ``name=peer_url``.

        Entries are comma-separated; a member already present is not added
        again, and no stray separator is produced regardless of the order
        in which members are registered.
        """
        entry = "{}={}".format(host, self.cluster[host])
        members = self.current_cluster.split(',') if self.current_cluster else []
        if entry not in members:
            members.append(entry)
        self.current_cluster = ','.join(members)

    def change_etcd_peer_url(self):
        """Print the commands that reset the master's peer URL.

        The first printed command looks up the master's member id into the
        shell variable ``member``; the second PUTs the peer URL for that
        member.  The master is then recorded in ``current_cluster``.
        """
        req = {'peerURLs': [self.cluster[self.master]]}
        # "\\:" emits a literal "\:" so the shell passes ":" to cut -d.
        print("member=$({} | grep {} | cut -d\\: -f1)".format(
            self._etcdctl("member list"), self.master))
        print(self._curl('/members/$member', method='PUT', req=req))
        self._register_member(self.master)

    def move_temp_dir(self):
        """Print the commands that move the recovered data into place."""
        # kill any etcd running on the server
        print(self.ssh(self.master, 'sudo killall -15 etcd'))
        self.wipe_etcd_dir(self.master)
        print(self.ssh(self.master, 'sudo mv {} {}'.format(recovery,
                                                            self.etcd_dir)))

    def wipe_etcd_dir(self, host):
        """Print the command that removes the etcd data dir on *host*."""
        cmd = 'sudo rm -rf {}'.format(self.etcd_dir)
        print(self.ssh(host, cmd))

    def add_to_cluster(self, host):
        """Print the command adding *host* as a member, and record it."""
        print(self.ssh(self.master, self._etcdctl(
            "member add {} {}".format(host, self.cluster[host]),
        )))
        self._register_member(host)

    def start_etcd(self, host):
        """Print the command that starts etcd on *host*.

        etcd is told to join an existing cluster made of the members
        recorded so far in ``current_cluster``.
        """
        data = {
            'name': host,
            'data_dir': self.etcd_dir,
            'listen': self.listen_client_url(),
            'adv': self.adv_client_url(host),
            'peer': self.cluster[host],
            'args': "--initial-cluster-state existing "
                    "--initial-cluster {}".format(self.current_cluster),
        }
        print(self.ssh(host, self._etcd_cmd(data)))

    def enable_auth(self):
        """Print the command that enables etcd auth, run on the master."""
        print(self.ssh(self.master, self._etcdctl("auth enable")))
def main(argv=None):
    """Print step-by-step recovery instructions for an etcd cluster.

    Args:
        argv: argument list in ``sys.argv`` layout (program name, cluster
            name, master name).  Defaults to ``sys.argv``.

    Exits with status 2 and a message on stderr when arguments are missing
    or name an unknown cluster or master, so that no partial set of
    instructions is ever printed.
    """
    import sys

    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        prog = argv[0] if argv else 'etcd_recovery_generator.py'
        sys.stderr.write(
            "Usage: {} <cluster_name> <master>\n".format(prog))
        sys.exit(2)
    cluster_name = argv[1]
    master = argv[2]
    if cluster_name not in etcd_clusters:
        sys.stderr.write("Unknown cluster {!r}; known clusters: {}\n".format(
            cluster_name, ', '.join(sorted(etcd_clusters))))
        sys.exit(2)
    if master not in etcd_clusters[cluster_name]:
        sys.stderr.write("Unknown master {!r}; members of {}: {}\n".format(
            master, cluster_name,
            ', '.join(sorted(etcd_clusters[cluster_name]))))
        sys.exit(2)

    gen = Generator(cluster_name, master)
    print("""#############################
# ETCD Recovery instructions (generated via etcd_recovery)
# Cluster : {cluster_name}
# Master : {master}
# Nodes: {nodes}
#############################
""".format(cluster_name=cluster_name, master=master, nodes=gen.cluster))
    print("""
### STEP 1: stop etcd across the cluster.
""")
    gen.stop_etcd_service()
    print("""
### STEP 2: set up the new master from its backup
""")
    print("# Now launch the temporary etcd master from backup")
    gen.launch_temp_etcd()
    print("")
    print("# When it works, SSH TO THE MASTER AND launch the following")
    gen.change_etcd_peer_url()
    print("")
    print("# Now kill the original etcd running in the original shell, and start it from the right position")
    gen.move_temp_dir()
    gen.start_etcd(gen.master)
    print("""
### STEP 3: Add back and start the other nodes
""")
    # Sorted so the generated instructions have a stable order.
    for host in sorted(gen.cluster):
        if host == master:
            continue
        gen.add_to_cluster(host)
        gen.wipe_etcd_dir(host)
        gen.start_etcd(host)
    # NOTE(review): re-enabling auth was left disabled in the original
    # script; confirm whether it should be part of the procedure.
    #gen.enable_auth()


if __name__ == '__main__':
    main()