Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Paste
P3855
etcd_recovery_generator.py
Active
Public
Actions
Authored by
Joe
on Aug 19 2016, 4:11 PM.
Edit Paste
Archive Paste
View Raw File
Subscribe
Mute Notifications
Award Token
Flag For Later
Tags
SRE
Subscribers
None
import json
import logging

try:
    from urllib.parse import urlparse  # Python 3
except ImportError:
    from urlparse import urlparse  # Python 2
log = logging.getLogger(__name__)

## Globals
# Peer URL of every etcd member, keyed first by cluster name (the DNS domain
# of the datacenter) and then by member name.  Generator derives the host
# FQDN used for ssh and for the advertised client URL from these URLs, so the
# domain inside each URL must be the domain the host really lives in.
etcd_clusters = {
    'eqiad.wmnet': {
        'conf1001': 'http://conf1001.eqiad.wmnet:2380',
        'conf1002': 'http://conf1002.eqiad.wmnet:2380',
        'conf1003': 'http://conf1003.eqiad.wmnet:2380',
    },
    'codfw.wmnet': {
        # These entries previously pointed at *.eqiad.wmnet, which made every
        # generated command for the codfw cluster target the wrong domain.
        'conf2001': 'http://conf2001.codfw.wmnet:2380',
        'conf2002': 'http://conf2002.codfw.wmnet:2380',
        'conf2003': 'http://conf2003.codfw.wmnet:2380',
    },
}

# Directory on the master that the backup is copied into and from which the
# temporary single-node etcd is started.
recovery = "/tmp/etcd-recovery"
class Generator(object):
    """Print the shell commands needed to rebuild an etcd cluster from a backup.

    Nothing is executed here: ``ssh`` and the other helpers only build command
    strings, and the step methods print them to stdout so an operator can run
    them by hand.
    """

    def __init__(self, cluster_name, master):
        """Set up a generator for one cluster.

        cluster_name -- key of the cluster in ``etcd_clusters``; an unknown
                        name raises KeyError.
        master       -- name of the member the cluster is rebuilt around.
        """
        self.cluster_name = cluster_name
        self.master = master
        self.cluster = etcd_clusters[cluster_name]
        # Comma-separated "name=peer_url" entries for the members registered
        # so far; start_etcd() passes it as --initial-cluster.
        self.current_cluster = ""
        self.etcd_dir = '/var/lib/etcd/etcd-' + self.cluster_name

    def fqdn(self, host):
        """Return the host part (no port) of the peer URL of member `host`."""
        return urlparse(self.cluster[host]).netloc.split(':')[0]

    def ssh(self, host, cmd):
        """Return the ssh command line that runs `cmd` on member `host`."""
        return "ssh {} {}".format(self.fqdn(host), cmd)

    def adv_client_url(self, host):
        """Return the client URL (https, port 2379) advertised for `host`."""
        return "https://{}:2379".format(self.fqdn(host))

    def listen_client_url(self):
        """Return the loopback URL etcd is told to listen on for clients."""
        return "http://127.0.0.1:2378"

    def stop_etcd_service(self):
        """Print one `systemctl stop etcd.service` ssh command per member."""
        # Sorted so the generated instructions are identical on every run
        # instead of following dict ordering.
        for host in sorted(self.cluster):
            log.info('Stopping etcd on %s', host)
            print(self.ssh(host, 'sudo systemctl stop etcd.service'))

    def launch_temp_etcd(self):
        """
        Copies the latest backup to the recovery dir, starts etcd there
        """
        copy_cmd = 'sudo cp -ax /srv/backups/etcd/etcd-{0}-backup {1}'.format(
            self.cluster_name, recovery
        )
        print(self.ssh(self.master, copy_cmd))
        print(self.ssh(self.master, 'sudo chown -R etcd:etcd ' + recovery))
        data = {
            'name': self.master,
            'data_dir': recovery,
            'listen': self.listen_client_url(),
            'adv': self.adv_client_url(self.master),
            'peer': self.cluster[self.master],
            'args': "--force-new-cluster",
        }
        print(self.ssh(self.master, self._etcd_cmd(data)))

    def _etcd_cmd(self, data):
        """Return the etcd start command built from `data`.

        `data` must provide the keys data_dir, name, peer, listen, adv and
        args.  The result is a single command line ending in a newline, so
        printing it leaves a blank line after the command.
        """
        template = (
            "sudo -u etcd etcd --data-dir {data_dir} --name {name} "
            "-initial-advertise-peer-urls {peer} "
            "-listen-peer-urls {peer} "
            "-listen-client-urls {listen} "
            "-advertise-client-urls {adv} "
            "{args}\n"
        )
        return template.format(**data)

    def _curl(self, url, method='GET', req=None):
        """Return a curl command for path `url` under /v2 of the listen URL.

        When `req` is given it is JSON-encoded and sent as the request body.
        """
        client_url = self.listen_client_url() + '/v2' + url
        cmd = "curl {} -L -X {}".format(client_url, method)
        if req is not None:
            cmd += ' -H "Content-Type: application/json" -d \'{}\''.format(
                json.dumps(req)
            )
        return cmd

    def _etcdctl(self, cmd, username=None):
        """Return an etcdctl invocation of `cmd` against the listen URL.

        `username`, when given, is passed with --username.
        """
        # Joining the parts avoids the double space the old template left
        # behind when no username was given.
        parts = ["etcdctl"]
        if username:
            parts.append("--username " + username)
        parts.extend(["--endpoint", self.listen_client_url(), cmd])
        return " ".join(parts)

    def _register_member(self, host):
        """Record `host` in current_cluster, once, with correct separators."""
        entry = "{}={}".format(host, self.cluster[host])
        members = self.current_cluster.split(',') if self.current_cluster else []
        if entry not in members:
            members.append(entry)
        self.current_cluster = ','.join(members)

    def change_etcd_peer_url(self):
        """Print the commands that reset the master's peer URL.

        The first command stores the master's member id in the shell variable
        $member; the second PUTs the real peer URL to /members/$member.  The
        master is then recorded in current_cluster.
        """
        req = {'peerURLs': [self.cluster[self.master]]}
        # The backslash is meant literally for the shell (cut -d\:), so it is
        # escaped here rather than relying on an invalid Python escape.
        print("member=$({} | grep {} | cut -d\\: -f1)".format(
            self._etcdctl("member list"), self.master
        ))
        print(self._curl('/members/$member', method='PUT', req=req))
        self._register_member(self.master)

    def move_temp_dir(self):
        """Print the commands that move the recovery dir to the etcd data dir."""
        # kill any etcd running on the server
        print(self.ssh(self.master, 'sudo killall -15 etcd'))
        self.wipe_etcd_dir(self.master)
        print(self.ssh(self.master, 'sudo mv {} {}'.format(recovery, self.etcd_dir)))

    def wipe_etcd_dir(self, host):
        """Print the command that removes the etcd data dir on `host`."""
        print(self.ssh(host, 'sudo rm -rf {}'.format(self.etcd_dir)))

    def add_to_cluster(self, host):
        """Print the `member add` command for `host` and record the member.

        The command is run on the master via ssh.
        """
        member_add = "member add {} {}".format(host, self.cluster[host])
        print(self.ssh(self.master, self._etcdctl(member_add)))
        self._register_member(host)

    def start_etcd(self, host):
        """Print the command that starts etcd on `host` in the existing cluster.

        --initial-cluster is filled from the members recorded so far.
        """
        data = {
            'name': host,
            'data_dir': self.etcd_dir,
            'listen': self.listen_client_url(),
            'adv': self.adv_client_url(host),
            'peer': self.cluster[host],
            'args': "--initial-cluster-state existing "
                    "--initial-cluster {}".format(self.current_cluster),
        }
        print(self.ssh(host, self._etcd_cmd(data)))

    def enable_auth(self):
        """Print the command that runs `etcdctl auth enable` on the master."""
        print(self.ssh(self.master, self._etcdctl("auth enable")))
def main():
    """Print the full recovery runbook for one cluster.

    Expects two command-line arguments: the cluster name (a key of
    ``etcd_clusters``) and the name of the member to rebuild the cluster
    around.  Exits with a message on stderr when they are missing or unknown,
    before any instructions are printed.
    """
    import sys

    if len(sys.argv) < 3:
        sys.exit("Usage: {} <cluster_name> <master>".format(sys.argv[0]))
    cluster_name = sys.argv[1]
    master = sys.argv[2]
    if cluster_name not in etcd_clusters:
        sys.exit("Unknown cluster '{}', expected one of: {}".format(
            cluster_name, ', '.join(sorted(etcd_clusters))
        ))
    gen = Generator(cluster_name, master)
    # Checked up front: an unknown master would otherwise fail with a KeyError
    # only after part of the instructions had already been printed.
    if master not in gen.cluster:
        sys.exit("Unknown master '{}' for cluster {}, expected one of: {}".format(
            master, cluster_name, ', '.join(sorted(gen.cluster))
        ))

    print("""#############################
# ETCD Recovery instructions (generated via etcd_recovery)
# Cluster : {cluster_name}
# Master : {master}
# Nodes: {nodes}
#############################
""".format(cluster_name=cluster_name, master=master, nodes=gen.cluster))

    print("""
### STEP 1: stop etcd across the cluster.
""")
    gen.stop_etcd_service()

    print("""
### STEP 2: set up the new master from its backup
""")
    print("# Now launch the temporary etcd master from backup")
    gen.launch_temp_etcd()
    print("")
    print("# When it works, SSH TO THE MASTER AND launch the following")
    gen.change_etcd_peer_url()
    print("")
    print("# Now kill the original etcd running in the original shell, and start it from the right position")
    gen.move_temp_dir()
    gen.start_etcd(gen.master)

    print("""
### STEP 3: Add back and start the other nodes
""")
    # Sorted so the nodes are always re-added in the same order.
    for host in sorted(gen.cluster):
        if host == master:
            continue
        gen.add_to_cluster(host)
        gen.wipe_etcd_dir(host)
        gen.start_etcd(host)
    # NOTE(review): the original leaves the call below commented out; confirm
    # whether auth should be re-enabled as part of the generated runbook.
    #gen.enable_auth()


if __name__ == '__main__':
    main()
Event Timeline
Joe
created this paste.
Aug 19 2016, 4:11 PM
Joe
mentioned this in
T135129: Create backup/restore scripts for etcd
.
Joe
edited the content of this paste.
(Show Details)
Jul 17 2018, 6:05 AM
Joe
edited the content of this paste.
(Show Details)
Log In to Comment