Reported on IRC: ToolsDB was down. Many other things also appear to be alerting as down in #wikimedia-cloud-feed.
Incident report: https://wikitech.wikimedia.org/wiki/Incidents/2023-09-29_CloudVPS_vms_losing_network_connectivity
Script used to bring the VMs back online (we might want to reuse the console-handling parts):
1 | |
---|---|
2 | import os |
3 | import sys |
4 | import argparse |
5 | import yaml |
6 | import keystoneauth1 |
7 | from keystoneauth1.identity import v3 |
8 | from keystoneauth1 import session as keystone_session |
9 | from keystoneclient.v3 import client as keystoneclient |
10 | from novaclient import client as novaclient |
11 | import glanceclient |
12 | import subprocess |
13 | import time |
14 | |
15 | |
def keystone_session(project="cloudinfra"):
    """Build a Keystone session scoped to ``project``.

    Uses password authentication as the ``novaobserver`` user against the
    Keystone v3 endpoint at openstack.eqiad1.wikimediacloud.org, with the
    ``Default`` user and project domains.

    NOTE(review): this function shares its name with the ``keystone_session``
    alias imported from ``keystoneauth1`` at the top of the file and replaces
    it at module level; the Session class is therefore reached through
    ``keystoneauth1.session`` below.

    Args:
        project: Name of the project the session is scoped to.

    Returns:
        A ``keystoneauth1.session.Session`` built on that auth plugin.
    """
    credentials = {
        "auth_url": "https://openstack.eqiad1.wikimediacloud.org:25000/v3",
        "username": "novaobserver",
        # Password source:
        # https://gerrit.wikimedia.org/g/operations/puppet/+/16906c693da99eacdf7be557cc19e110a30c96f1/hieradata/cloud/eqiad1.yaml#36
        "password": "Fs6Dq2RtG8KwmM2Z",
        "user_domain_name": "Default",
        "project_domain_name": "Default",
        "project_name": project,
    }
    password_auth = v3.Password(**credentials)
    return keystoneauth1.session.Session(auth=password_auth)
26 | |
27 | |
def fix_instance(vm_hypervisor, vm_libvirt_id, ip_address):
    """Restore networking on a VM by typing into its libvirt serial console.

    Runs ``sudo virsh console <vm_libvirt_id> --force`` on the hypervisor over
    ssh and drives it line by line:

    * on the line containing ``Escape character`` it sends Enter;
    * on the first line containing ``root@`` it types a shell one-liner and
      then stops.

    The one-liner does nothing if ``ip route get 208.80.154.224`` already
    mentions 172.16.0.1; otherwise it brings ``ens3`` up, adds
    ``ip_address``/21 to it and adds a default route via 172.16.0.1.

    The ssh process is always killed and reaped before returning. If the
    console closes without ever showing a ``root@`` line, nothing is typed and
    a warning is printed to stderr.

    NOTE: reading is blocking; if the console stays open but never prints a
    matching line this call does not return.

    Args:
        vm_hypervisor: Host to ssh to (the hypervisor running the VM).
        vm_libvirt_id: libvirt domain name/id passed to ``virsh console``.
        ip_address: IPv4 address to configure on ``ens3`` inside the VM.

    Returns:
        None.
    """
    one_liner = f"( ip route get 208.80.154.224 | grep 172.16.0.1 ) || ( ip link set ens3 up; ip addr add {ip_address}/21 dev ens3; ip route add default via 172.16.0.1 )"
    ssh_command = [
        "/usr/bin/ssh",
        # "-v",
        "-tt",
        # "-F/etc/cumin/ssh_config",
        vm_hypervisor,
        f"sudo virsh console {vm_libvirt_id} --force",
    ]

    fix_sent = False
    # The context manager closes the pipes and waits for the child on exit, so
    # repeated calls do not accumulate zombie processes or open descriptors.
    with subprocess.Popen(
        ssh_command,
        bufsize=0,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        # stderr is never consumed here; discarding it avoids leaving an
        # unread pipe that could fill up and block ssh.
        stderr=subprocess.DEVNULL,
        text=True,
        # env={"SSH_AUTH_SOCK": "/run/keyholder/proxy.sock"}
    ) as console:
        try:
            # In text mode readline() returns "" (not b"") at EOF, so "" is
            # the sentinel that lets the loop end when ssh exits.
            for line in iter(console.stdout.readline, ""):
                if "Escape character" in line:
                    # Console attached: press Enter to get a prompt printed.
                    time.sleep(0.5)
                    console.stdin.write("\r\n")
                    console.stdin.flush()
                elif "root@" in line:
                    # A root prompt was echoed: type the fix and run it.
                    time.sleep(0.2)
                    console.stdin.write(one_liner)
                    console.stdin.flush()
                    time.sleep(0.2)
                    console.stdin.write("\r\n")
                    console.stdin.flush()
                    # Give the command a moment to be delivered before the
                    # console is torn down.
                    time.sleep(0.2)
                    fix_sent = True
                    break
        finally:
            # Harmless if ssh has already exited.
            console.kill()

    if not fix_sent:
        print(
            f"no root prompt seen on the console of {vm_libvirt_id} "
            f"({vm_hypervisor}); fix not sent",
            file=sys.stderr,
        )
63 | |
64 | |
65 | |
def fix_project(project, broken_images):
    """Run ``fix_instance`` for every affected server in ``project``.

    Lists the project's servers through Nova (sorted by display name) and, for
    each server whose image id is in ``broken_images`` and whose status is
    ``ACTIVE``, calls ``fix_instance`` once per address starting with
    ``172.16.``. Matching servers in any other status are printed with
    ``SKIP`` and left alone.

    Args:
        project: Name of the project whose servers are inspected.
        broken_images: Collection of image ids considered affected.

    Returns:
        None.
    """
    nova = novaclient.Client("2.0", session=keystone_session(project))
    servers = nova.servers.list(
        detailed=True,
        sort_keys=["display_name"],
        sort_dirs=["asc"],
    )
    for server in servers:
        # Nova reports ``image`` as an empty string (not a dict) for servers
        # booted from a volume; those have no image id to match against.
        image = server.image
        if not isinstance(image, dict) or image.get("id") not in broken_images:
            continue
        if server.status != "ACTIVE":
            print(server.name, project, server.status, "SKIP")
            continue

        vm_info = server._info
        vm_hypervisor = vm_info['OS-EXT-SRV-ATTR:hypervisor_hostname']
        vm_libvirt_id = vm_info['OS-EXT-SRV-ATTR:instance_name']

        # Only the interface lists matter; the network names are not used.
        for interfaces in server.addresses.values():
            for interface in interfaces:
                address = interface["addr"]
                if address.startswith("172.16."):
                    print(server.name, project, address, vm_hypervisor, vm_libvirt_id)
                    fix_instance(vm_hypervisor, vm_libvirt_id, address)
87 | |
88 | |
def get_broken_images():
    """Return the ids of all Glance images whose name contains ``bullseye``.

    Queries Glance (API version 2, public interface) with the default
    ``keystone_session()``. Images without a name are ignored.

    Returns:
        A list of image ids.
    """
    glance = glanceclient.Client(
        version="2",
        session=keystone_session(),
        interface="public",
    )

    return [
        image["id"]
        for image in glance.images.list()
        # Glance allows images with a null name; treat those as not matching
        # instead of failing on ``"bullseye" in None``.
        if "bullseye" in (image.get("name") or "")
    ]
101 | |
102 | |
103 | |
def main():
    """Apply ``fix_project`` to every enabled project in the default domain.

    Collects the affected image ids first, then lists projects via Keystone
    (public interface, 2 second timeout), prints ``* <name>`` for each project
    it processes and hands the name to ``fix_project``.
    """
    broken_images = get_broken_images()
    keystone = keystoneclient.Client(
        session=keystone_session(),
        interface="public",
        timeout=2,
    )

    # NOTE(review): projects whose name starts with a, b or c are skipped --
    # presumably they were already handled by an earlier run; confirm before
    # reusing this script.
    skipped_prefixes = ("a", "b", "c")

    for project in keystone.projects.list(enabled=True, domain="default"):
        name = project.name
        if name.startswith(skipped_prefixes):
            continue
        print("*", name)
        fix_project(name, broken_images)
117 | |
118 | |
119 | |
# Run the fix only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()