Reported on IRC that tools db was down. Many things seem to be alerting as down in #wikimedia-cloud-feed.
Incident report: https://wikitech.wikimedia.org/wiki/Incidents/2023-09-29_CloudVPS_vms_losing_network_connectivity
Script being used to bring the VMs online (we might want to reuse the console-handling parts):
| 1 | |
|---|---|
| 2 | import os |
| 3 | import sys |
| 4 | import argparse |
| 5 | import yaml |
| 6 | import keystoneauth1 |
| 7 | from keystoneauth1.identity import v3 |
| 8 | from keystoneauth1 import session as keystone_session |
| 9 | from keystoneclient.v3 import client as keystoneclient |
| 10 | from novaclient import client as novaclient |
| 11 | import glanceclient |
| 12 | import subprocess |
| 13 | import time |
| 14 | |
| 15 | |
def keystone_session(project="cloudinfra"):
    """Build a keystoneauth1 session scoped to *project*.

    Authenticates with Keystone v3 password auth as the ``novaobserver``
    user, using the ``Default`` user and project domains.

    NOTE: this definition rebinds the module-level name ``keystone_session``
    that the import block also assigns (``from keystoneauth1 import session
    as keystone_session``); the session class is therefore reached through
    ``keystoneauth1.session`` below.

    Args:
        project: Name of the OpenStack project the session is scoped to.

    Returns:
        A ``keystoneauth1.session.Session`` using the password auth plugin.
    """
    credentials = {
        "auth_url": "https://openstack.eqiad1.wikimediacloud.org:25000/v3",
        "username": "novaobserver",
        # https://gerrit.wikimedia.org/g/operations/puppet/+/16906c693da99eacdf7be557cc19e110a30c96f1/hieradata/cloud/eqiad1.yaml#36
        "password": "Fs6Dq2RtG8KwmM2Z",
        "user_domain_name": 'Default',
        "project_domain_name": 'Default',
        "project_name": project,
    }
    password_auth = v3.Password(**credentials)
    return keystoneauth1.session.Session(auth=password_auth)
| 26 | |
| 27 | |
def fix_instance(vm_hypervisor, vm_libvirt_id, ip_address):
    """Restore networking on one VM by typing a command into its console.

    Opens ``sudo virsh console <vm_libvirt_id> --force`` on the hypervisor
    over ssh, presses Enter once the console banner ("Escape character")
    appears, and on the first output line containing ``root@`` sends a shell
    one-liner. The one-liner does nothing if ``ip route get 208.80.154.224``
    already goes via 172.16.0.1; otherwise it brings ``ens3`` up, adds
    ``ip_address``/21 to it and adds a default route via 172.16.0.1.

    The function returns after the one-liner has been sent, or when the
    console stream ends without ever showing a ``root@`` line (ssh failed,
    console closed, ...). It does not verify that the command succeeded.

    Args:
        vm_hypervisor: Host name of the hypervisor to ssh into.
        vm_libvirt_id: Libvirt domain name of the VM on that hypervisor.
        ip_address: Address to configure on the VM's ``ens3`` interface.
    """
    one_liner = f"( ip route get 208.80.154.224 | grep 172.16.0.1 ) || ( ip link set ens3 up; ip addr add {ip_address}/21 dev ens3; ip route add default via 172.16.0.1 )"
    # The context manager closes the pipes and reaps the child on exit, so no
    # zombie ssh processes or open file descriptors pile up across VMs.
    with subprocess.Popen(
        [
            "/usr/bin/ssh",
            # "-v",
            "-tt",
            # "-F/etc/cumin/ssh_config",
            vm_hypervisor,
            f"sudo virsh console {vm_libvirt_id} --force",
        ],
        bufsize=0,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        # stderr is never consumed; discard it instead of piping it so a
        # chatty ssh cannot block on a full pipe.
        stderr=subprocess.DEVNULL,
        text=True,
        # env={"SSH_AUTH_SOCK": "/run/keyholder/proxy.sock"}
    ) as console:
        try:
            # The pipe is in text mode, so EOF is the empty *str*. Comparing
            # against b'' would never match and the loop would spin forever
            # once ssh exits.
            for line in iter(console.stdout.readline, ""):
                if "Escape character" in line:
                    # Console attached: press Enter to get a prompt.
                    time.sleep(0.5)
                    console.stdin.write("\r\n")
                    console.stdin.flush()
                elif "root@" in line:
                    # Root prompt seen: type the fix, then Enter. The short
                    # pauses give the serial console time to keep up.
                    time.sleep(0.2)
                    console.stdin.write(one_liner)
                    console.stdin.flush()
                    time.sleep(0.2)
                    console.stdin.write("\r\n")
                    console.stdin.flush()
                    time.sleep(0.2)
                    break
        finally:
            # virsh console never exits by itself; always tear ssh down.
            console.kill()
| 63 | |
| 64 | |
| 65 | |
def fix_project(project, broken_images):
    """Run ``fix_instance`` for every affected VM in one project.

    A server is handled when it was booted from one of ``broken_images`` and
    its status is ``ACTIVE``. ``fix_instance`` is called once for each of its
    addresses that starts with ``172.16.``.

    Args:
        project: Name of the OpenStack project whose servers are listed.
        broken_images: Iterable of image ids considered broken.
    """
    # Build the lookup set once; membership is tested for every server.
    broken_image_ids = set(broken_images)
    nova = novaclient.Client("2.0", session=keystone_session(project))
    servers = nova.servers.list(
        detailed=True,
        sort_keys=["display_name"],
        sort_dirs=["asc"],
    )
    for server in servers:
        image = server.image
        # Nova reports ``image`` as an empty string for servers booted from
        # a volume; those have no image id to match, so skip them instead of
        # failing on ``""["id"]``.
        if not image or image["id"] not in broken_image_ids:
            continue
        if server.status != "ACTIVE":
            print(server.name, project, server.status, "SKIP")
            continue
        vm_info = server._info
        # NOTE(review): these extended attributes are only present when the
        # API user is allowed to see them - confirm for the account in use.
        vm_hypervisor = vm_info['OS-EXT-SRV-ATTR:hypervisor_hostname']
        vm_libvirt_id = vm_info['OS-EXT-SRV-ATTR:instance_name']
        for interfaces in server.addresses.values():
            for interface in interfaces:
                if interface["addr"].startswith("172.16."):
                    print(server.name, project, interface["addr"], vm_hypervisor, vm_libvirt_id)
                    fix_instance(vm_hypervisor, vm_libvirt_id, interface["addr"])
| 87 | |
| 88 | |
def get_broken_images():
    """Return the ids of all Glance images whose name contains "bullseye".

    Images are listed through the public Glance v2 endpoint using the default
    ``keystone_session()``.

    Returns:
        A list of image ids, in the order Glance returned the images.
    """
    glance = glanceclient.Client(
        version="2",
        session=keystone_session(),
        interface="public",
    )

    # An image's name may be null or absent; treat that as "no match" rather
    # than failing on ``"bullseye" in None``.
    return [
        image["id"]
        for image in glance.images.list()
        if "bullseye" in (image.get("name") or "")
    ]
| 101 | |
| 102 | |
| 103 | |
def main():
    """Walk the enabled projects and fix the affected VMs in each one.

    Fetches the broken image ids once, lists the enabled projects of the
    ``default`` domain through Keystone, prints each project name and calls
    ``fix_project`` on it. Projects whose name starts with "a", "b" or "c"
    are skipped (presumably already handled by an earlier run - confirm
    before reusing).
    """
    broken_images = get_broken_images()
    keystone = keystoneclient.Client(
        session=keystone_session(),
        interface="public",
        timeout=2,
    )

    skipped_prefixes = ("a", "b", "c")
    for project in keystone.projects.list(enabled=True, domain="default"):
        project_name = project.name
        if project_name.startswith(skipped_prefixes):
            continue
        print("*", project_name)
        fix_project(project_name, broken_images)


if __name__ == "__main__":
    main()