On this node the installer picked the wrong drive on which to build the md-raid set for /. As a result the installer fails at the "run grub" step and destroys a large swift filesystem, which then takes hours to rebuild (blocking further reimages in this cluster).
Pre-reimage state:
mvernon@ms-be2060:~$ df -lh Filesystem Size Used Avail Use% Mounted on udev 252G 0 252G 0% /dev tmpfs 51G 4.1G 47G 8% /run /dev/md0 55G 9.5G 43G 19% / tmpfs 252G 4.0K 252G 1% /dev/shm tmpfs 5.0M 0 5.0M 0% /run/lock tmpfs 252G 0 252G 0% /sys/fs/cgroup /dev/sdb4 297G 339M 297G 1% /srv/swift-storage/sdb4 /dev/sdb3 94G 30G 64G 32% /srv/swift-storage/sdb3 /dev/sda4 297G 338M 297G 1% /srv/swift-storage/sda4 /dev/sds1 7.3T 5.3T 2.1T 73% /srv/swift-storage/sds1 /dev/sdc1 7.3T 5.3T 2.1T 73% /srv/swift-storage/sdc1 /dev/sdr1 7.3T 5.3T 2.0T 73% /srv/swift-storage/sdr1 /dev/sdv1 7.3T 5.3T 2.1T 73% /srv/swift-storage/sdv1 /dev/sdu1 7.3T 5.3T 2.1T 72% /srv/swift-storage/sdu1 /dev/sda3 94G 18G 76G 19% /srv/swift-storage/sda3 /dev/sdt1 7.3T 5.3T 2.1T 72% /srv/swift-storage/sdt1 /dev/sdq1 7.3T 5.3T 2.1T 73% /srv/swift-storage/sdq1 /dev/sdp1 7.3T 5.3T 2.1T 72% /srv/swift-storage/sdp1 /dev/sdo1 7.3T 5.3T 2.1T 72% /srv/swift-storage/sdo1 /dev/sdl1 7.3T 5.3T 2.1T 72% /srv/swift-storage/sdl1 /dev/sdm1 7.3T 5.3T 2.1T 72% /srv/swift-storage/sdm1 /dev/sdk1 7.3T 5.3T 2.1T 72% /srv/swift-storage/sdk1 /dev/sdn1 7.3T 5.3T 2.1T 72% /srv/swift-storage/sdn1 /dev/sdg1 7.3T 5.3T 2.1T 73% /srv/swift-storage/sdg1 /dev/sdh1 7.3T 5.2T 2.1T 72% /srv/swift-storage/sdh1 /dev/sdj1 7.3T 5.3T 2.1T 72% /srv/swift-storage/sdj1 /dev/sdi1 7.3T 5.3T 2.1T 73% /srv/swift-storage/sdi1 /dev/sdd1 7.3T 5.3T 2.1T 73% /srv/swift-storage/sdd1 /dev/sde1 7.3T 5.3T 2.1T 72% /srv/swift-storage/sde1 /dev/sdf1 7.3T 5.3T 2.1T 73% /srv/swift-storage/sdf1 /dev/sdx1 7.3T 5.3T 2.1T 73% /srv/swift-storage/sdx1 /dev/sdw1 7.3T 5.3T 2.1T 73% /srv/swift-storage/sdw1 /dev/sdy1 7.3T 5.3T 2.1T 72% /srv/swift-storage/sdy1 /dev/sdz1 7.3T 5.3T 2.1T 73% /srv/swift-storage/sdz1 tmpfs 51G 0 51G 0% /run/user/33349 mvernon@ms-be2060:~$ sudo blkid /dev/md0: UUID="305517c2-0808-4589-a213-4cc37ae382d4" TYPE="ext4" /dev/sdc1: LABEL="swift-sdc1" UUID="17936d56-c4d9-451a-bf3a-dfd5ab0c3d07" TYPE="xfs" PARTLABEL="swift-sdc1" 
PARTUUID="36568373-61aa-4c7f-a1ff-9eafda6e5897" /dev/sdb1: UUID="18da7c90-7e68-0e9d-56ee-e30021164f77" UUID_SUB="1ba5819c-fd43-b741-9b2b-92ea8bda6b24" LABEL="ms-be2060:0" TYPE="linux_raid_member" PARTUUID="48f4f878-01" /dev/sdb2: UUID="2f0bd683-249f-8367-6eed-23ee26fda0e5" UUID_SUB="6652dcaf-e59b-f49b-5aa3-e0026eda7286" LABEL="ms-be2060:1" TYPE="linux_raid_member" PARTUUID="48f4f878-02" /dev/sdb3: LABEL="swift-sdb3" UUID="a29c83a2-c844-40a6-9310-1c641f44b20d" TYPE="xfs" PARTUUID="48f4f878-03" /dev/sdb4: LABEL="swift-sdb4" UUID="6b2d1529-e071-4b88-b087-f5378a384a4e" TYPE="xfs" PARTUUID="48f4f878-04" /dev/sdj1: LABEL="swift-sdj1" UUID="4f5a63f1-49e7-4138-a574-62bbddcb15bb" TYPE="xfs" PARTLABEL="swift-sdj1" PARTUUID="36d62e38-69dc-4e33-a2cb-43e8ddb2bf55" /dev/sdh1: LABEL="swift-sdh1" UUID="5f1e9875-b1db-403f-9a80-80adc8a1a99e" TYPE="xfs" PARTLABEL="swift-sdh1" PARTUUID="ed5f5a18-8732-4a88-8695-7e1837bd6e96" /dev/sdd1: LABEL="swift-sdd1" UUID="ff341b95-7587-4ea6-8276-38a5cd03f41a" TYPE="xfs" PARTLABEL="swift-sdd1" PARTUUID="c9f91b02-50e5-4f56-8012-7b279bdfa389" /dev/sde1: LABEL="swift-sde1" UUID="e3c45c18-3302-4eae-983c-136f2b5525fd" TYPE="xfs" PARTLABEL="swift-sde1" PARTUUID="13b51dc5-74da-4228-b628-fe73d89e9483" /dev/sdm1: LABEL="swift-sdm1" UUID="4005dc6a-e96c-4646-8687-634dd73d065f" TYPE="xfs" PARTLABEL="swift-sdm1" PARTUUID="d5b6d685-e958-4468-912c-a52b1044251d" /dev/sdf1: LABEL="swift-sdf1" UUID="bdea7f93-9602-448d-b15e-9be5c34ae583" TYPE="xfs" PARTLABEL="swift-sdf1" PARTUUID="31fede44-566a-4ef1-94c3-1640327a18c9" /dev/sdr1: LABEL="swift-sdr1" UUID="fe58126d-c971-4531-b3ae-75fb38b1e2e4" TYPE="xfs" PARTLABEL="swift-sdr1" PARTUUID="12794c2a-d470-4cfb-91a7-959745d4f870" /dev/sdp1: LABEL="swift-sdp1" UUID="c91ff1e8-39b8-4f31-8d44-126667f3304e" TYPE="xfs" PARTLABEL="swift-sdp1" PARTUUID="85c13299-e07e-4e43-8aab-79f62797ea2f" /dev/sdv1: LABEL="swift-sdv1" UUID="8ff60693-4aba-4a0d-92b7-2e1deae53989" TYPE="xfs" PARTLABEL="swift-sdv1" 
PARTUUID="90958a37-bfa7-4700-88c5-9adc9da570d3" /dev/sdn1: LABEL="swift-sdn1" UUID="aeb04028-c6ea-412a-ac85-d90fa2cef7b1" TYPE="xfs" PARTLABEL="swift-sdn1" PARTUUID="05031404-a10d-4307-9b44-d8ed95aa3ab2" /dev/sdo1: LABEL="swift-sdo1" UUID="05a05184-c3e9-45fc-be50-d8e12079b4a4" TYPE="xfs" PARTLABEL="swift-sdo1" PARTUUID="224d5e68-7e31-4e25-aa22-4cfffd8fb3e3" /dev/sdk1: LABEL="swift-sdk1" UUID="06935047-b818-4167-9e15-4581144d2c12" TYPE="xfs" PARTLABEL="swift-sdk1" PARTUUID="c8b89584-4015-4b44-a953-0ac48f7f446f" /dev/sdi1: LABEL="swift-sdi1" UUID="66558e6d-720d-4616-8cac-d59b0bc7f15d" TYPE="xfs" PARTLABEL="swift-sdi1" PARTUUID="c37a6f7d-9ef2-4d99-8d5f-5b8c3cd92069" /dev/sdg1: LABEL="swift-sdg1" UUID="ae70febe-303e-43cd-8182-6446a93a8e18" TYPE="xfs" PARTLABEL="swift-sdg1" PARTUUID="26f48171-136e-4b72-8602-0b2b3e614a15" /dev/sdt1: LABEL="swift-sdt1" UUID="9aa4a95f-a5f6-461d-82ab-6393fa246f24" TYPE="xfs" PARTLABEL="swift-sdt1" PARTUUID="f91ce737-6cc7-451d-a4a5-21e39d2a6dc3" /dev/sdl1: LABEL="swift-sdl1" UUID="d9bf2f08-3f39-4044-b054-8e678663dcd1" TYPE="xfs" PARTLABEL="swift-sdl1" PARTUUID="a9606e83-0129-49b8-9cc7-93dd243519c2" /dev/sds1: LABEL="swift-sds1" UUID="a3f8b82b-a440-4a28-a7bf-c6ab50962ac0" TYPE="xfs" PARTLABEL="swift-sds1" PARTUUID="d3936cf9-43b4-445a-b57d-5ad19f59de47" /dev/sdy1: LABEL="swift-sdy1" UUID="da7fcb1e-fa41-4c96-934c-26340a3fbb03" TYPE="xfs" PARTLABEL="swift-sdy1" PARTUUID="e379af10-9cb7-45eb-8354-a62ec8a59aa9" /dev/sdx1: LABEL="swift-sdx1" UUID="7fc6d355-ee6f-4160-b046-cc0eef0922de" TYPE="xfs" PARTLABEL="swift-sdx1" PARTUUID="bc5260d4-84ab-48f4-8b08-66029e849416" /dev/sdw1: LABEL="swift-sdw1" UUID="70612c8e-823f-4b71-b25b-0ce5c571d6f7" TYPE="xfs" PARTLABEL="swift-sdw1" PARTUUID="4329abcf-e4c4-4eb5-ac9c-8e265a81c2a1" /dev/sdq1: LABEL="swift-sdq1" UUID="5a18629a-0f8a-46e9-be41-5eba631c98f3" TYPE="xfs" PARTLABEL="swift-sdq1" PARTUUID="d9c0d534-bea9-4cf4-a71c-8ec39e31bc66" /dev/sdu1: LABEL="swift-sdu1" UUID="e35b9c89-5e2c-4035-a9c6-8c44d33388b4" 
TYPE="xfs" PARTLABEL="swift-sdu1" PARTUUID="1495cd77-a802-4dd7-b26d-55f18b00100e" /dev/sdz1: LABEL="swift-sdz1" UUID="c589b4a3-f52e-4278-b401-709ebbcf015f" TYPE="xfs" PARTLABEL="swift-sdz1" PARTUUID="f501772c-f4d9-4e05-809c-7cb652efd053" /dev/sda1: UUID="18da7c90-7e68-0e9d-56ee-e30021164f77" UUID_SUB="9341a737-7bb2-1a57-3520-4fddf17f3460" LABEL="ms-be2060:0" TYPE="linux_raid_member" PARTUUID="e09d002c-01" /dev/sda2: UUID="2f0bd683-249f-8367-6eed-23ee26fda0e5" UUID_SUB="bb568916-62d8-0b5b-8ea6-902e33ab19ab" LABEL="ms-be2060:1" TYPE="linux_raid_member" PARTUUID="e09d002c-02" /dev/sda3: LABEL="swift-sda3" UUID="d0a2b599-1010-45a6-839d-e634bcba151d" TYPE="xfs" PARTUUID="e09d002c-03" /dev/sda4: LABEL="swift-sda4" UUID="ca13477b-f15b-4ab5-b7c7-230024aadac6" TYPE="xfs" PARTUUID="e09d002c-04" /dev/md1: UUID="29f14cd2-8c33-4a3c-a68d-3f7039f6b6db" TYPE="swap"
State once the installer had failed:
~ # blkid | more /dev/md0: UUID="1c189c09-a080-41e0-b816-c28cd1b06b98" BLOCK_SIZE="4096" TYPE="e" /dev/md1: UUID="ffca2266-f2db-4512-8899-0e544081f6a8" TYPE="swap" /dev/sda3: UUID="2677c651-8778-43b7-831c-3607f933bb58" BLOCK_SIZE="4096" TYPE="" /dev/sda4: UUID="5875309a-ea75-4a89-9a92-d60e9316d71d" BLOCK_SIZE="4096" TYPE="" /dev/sdb3: UUID="03caf1fe-1629-41ff-aa5e-8e5fb08149c5" BLOCK_SIZE="4096" TYPE="" /dev/sdb4: UUID="324e82b2-c7b4-429c-b547-258fc4d14a8a" BLOCK_SIZE="4096" TYPE="" /dev/sdc3: LABEL="swift-sdb3" UUID="a29c83a2-c844-40a6-9310-1c641f44b20d" BLOCK" /dev/sdc4: LABEL="swift-sdb4" UUID="6b2d1529-e071-4b88-b087-f5378a384a4e" BLOCK" /dev/sdd1: LABEL="swift-sdd1" UUID="ff341b95-7587-4ea6-8276-38a5cd03f41a" BLOCK" /dev/sde1: LABEL="swift-sde1" UUID="e3c45c18-3302-4eae-983c-136f2b5525fd" BLOCK" /dev/sdf1: LABEL="swift-sdg1" UUID="ae70febe-303e-43cd-8182-6446a93a8e18" BLOCK SIZE="4096" TYPE="xfs" PARTLABEL="swift-sdg1" PARTUUID="26f48171-136e-4b72-8602" /dev/sdg1: LABEL="swift-sdh1" UUID="5f1e9875-b1db-403f-9a80-80adc8a1a99e" BLOCK" /dev/sdh1: LABEL="swift-sdf1" UUID="bdea7f93-9602-448d-b15e-9be5c34ae583" BLOCK" /dev/sdi1: LABEL="swift-sdi1" UUID="66558e6d-720d-4616-8cac-d59b0bc7f15d" BLOCK" /dev/sdj1: LABEL="swift-sdj1" UUID="4f5a63f1-49e7-4138-a574-62bbddcb15bb" BLOCK" /dev/sdk1: LABEL="swift-sdk1" UUID="06935047-b818-4167-9e15-4581144d2c12" BLOCK" /dev/sdl1: LABEL="swift-sdm1" UUID="4005dc6a-e96c-4646-8687-634dd73d065f" BLOCK" /dev/sdm1: LABEL="swift-sdn1" UUID="aeb04028-c6ea-412a-ac85-d90fa2cef7b1" BLOCK /dev/sdn1: LABEL="swift-sdl1" UUID="d9bf2f08-3f39-4044-b054-8e678663dcd1" BLOCK" /dev/sdo1: LABEL="swift-sdo1" UUID="05a05184-c3e9-45fc-be50-d8e12079b4a4" BLOCK" /dev/sdp1: LABEL="swift-sdq1" UUID="5a18629a-0f8a-46e9-be41-5eba631c98f3" BLOCK" /dev/sdq1: LABEL="swift-sdp1" UUID="c91ff1e8-39b8-4f31-8d44-126667f3304e" BLOCK" /dev/sdr1: LABEL="swift-sds1" UUID="a3f8b82b-a440-4a28-a7bf-c6ab50962ac0" BLOCK" /dev/sds1: LABEL="swift-sdr1" 
UUID="fe58126d-c971-4531-b3ae-75fb38b1e2e4" BLOCK" /dev/sdt1: LABEL="swift-sdu1" UUID="e35b9c89-5e2c-4035-a9c6-8c44d33388b4" BLOCK" /dev/sdu1: LABEL="swift-sdt1" UUID="9aa4a95f-a5f6-461d-82ab-6393fa246f24" BLOCK" /dev/sdv1: LABEL="swift-sdv1" UUID="8ff60693-4aba-4a0d-92b7-2e1deae53989" BLOCK" /dev/sdw1: LABEL="swift-sdw1" UUID="70612c8e-823f-4b71-b25b-0ce5c571d6f7" BLOCK" /dev/sdx1: LABEL="swift-sdx1" UUID="7fc6d355-ee6f-4160-b046-cc0eef0922de" BLOCK" /dev/sdy1: LABEL="swift-sdy1" UUID="da7fcb1e-fa41-4c96-934c-26340a3fbb03" BLOCK" /dev/sdz1: LABEL="swift-sdz1" UUID="c589b4a3-f52e-4278-b401-709ebbcf015f" BLOCK" /dev/sdb1: UUID="91100df9-5d7a-0bf0-c298-8f4d242e724f" UUID_SUB="0e9cd793-bfaf-" /dev/sdb2: UUID="bbd03217-5ffd-dd88-40dd-2916c499bbc9" UUID_SUB="41ae9a63-af0e- /dev/sda1: UUID="91100df9-5d7a-0bf0-c298-8f4d242e724f" UUID_SUB="7eaf1110-5fb6-" /dev/sda2: UUID="bbd03217-5ffd-dd88-40dd-2916c499bbc9" UUID_SUB="57a42d7d-ab63-" /dev/sdc1: PARTUUID="48f4f878-01" /dev/sdc2: PARTUUID="48f4f878-02"
So at that point the SSDs were sda and sdc. The subsequent installer run succeeded, and the puppet run, reboot and follow-up puppet run (i.e. the rest of the regular reimage playbook) ran to completion, leaving the disks as follows:
Filesystem Size Used Avail Use% Mounted on udev 252G 0 252G 0% /dev tmpfs 51G 2.0M 51G 1% /run /dev/md0 55G 2.8G 50G 6% / tmpfs 252G 4.0K 252G 1% /dev/shm tmpfs 5.0M 0 5.0M 0% /run/lock /dev/sda3 94G 16G 78G 17% /srv/swift-storage/sda3 /dev/sdb3 94G 27G 67G 29% /srv/swift-storage/sdb3 /dev/sdb4 297G 2.1G 295G 1% /srv/swift-storage/sdb4 /dev/sda4 297G 2.1G 295G 1% /srv/swift-storage/sda4 /dev/sdf1 7.3T 5.3T 2.1T 72% /srv/swift-storage/sde1 /dev/sdd1 7.3T 5.3T 2.1T 73% /srv/swift-storage/sdd1 /dev/sde1 7.3T 5.3T 2.1T 73% /srv/swift-storage/sdf1 /dev/sdj1 7.3T 5.3T 2.1T 72% /srv/swift-storage/sdj1 /dev/sdh1 7.3T 5.2T 2.1T 72% /srv/swift-storage/sdh1 /dev/sdk1 7.3T 5.3T 2.1T 72% /srv/swift-storage/sdk1 /dev/sdg1 7.3T 5.3T 2.1T 73% /srv/swift-storage/sdg1 /dev/sdi1 7.3T 5.3T 2.1T 73% /srv/swift-storage/sdi1 /dev/sdn1 7.3T 5.3T 2.1T 72% /srv/swift-storage/sdn1 /dev/sdm1 7.3T 5.3T 2.1T 72% /srv/swift-storage/sdm1 /dev/sdl1 7.3T 5.3T 2.1T 72% /srv/swift-storage/sdl1 /dev/sdp1 7.3T 5.3T 2.1T 72% /srv/swift-storage/sdp1 /dev/sdo1 7.3T 5.3T 2.1T 72% /srv/swift-storage/sdo1 /dev/sdq1 7.3T 5.3T 2.1T 73% /srv/swift-storage/sdq1 /dev/sds1 7.3T 5.3T 2.1T 73% /srv/swift-storage/sds1 /dev/sdr1 7.3T 5.3T 2.0T 73% /srv/swift-storage/sdr1 /dev/sdt1 7.3T 5.3T 2.1T 72% /srv/swift-storage/sdt1 /dev/sdu1 7.3T 5.3T 2.1T 72% /srv/swift-storage/sdu1 /dev/sdv1 7.3T 5.3T 2.1T 73% /srv/swift-storage/sdv1 /dev/sdx1 7.3T 5.3T 2.1T 73% /srv/swift-storage/sdx1 /dev/sdy1 7.3T 5.3T 2.1T 72% /srv/swift-storage/sdy1 /dev/sdw1 7.3T 5.3T 2.1T 73% /srv/swift-storage/sdw1 /dev/sdz1 7.3T 5.3T 2.1T 73% /srv/swift-storage/sdz1 tmpfs 51G 0 51G 0% /run/user/33349
Note that puppet runs OK even though /dev/sdc1 is in fact now a member of a part-finished RAID array (md127, inactive):
mvernon@ms-be2060:~$ cat /proc/mdstat
Personalities : [raid1] [linear] [multipath] [raid0] [raid6] [raid5] [raid4] [raid10]
md127 : inactive sdc1[1](S)
      58558464 blocks super 1.2

md1 : active (auto-read-only) raid1 sda2[0] sdb2[1]
      975872 blocks super 1.2 [2/2] [UU]
        resync=PENDING

md0 : active raid1 sda1[0] sdb1[1]
      58558464 blocks super 1.2 [2/2] [UU]

unused devices: <none>
This requires manual fixing: stop md127, remove the RAID superblock from /dev/sdc1, make a new filesystem on it, mount it, and re-run puppet. Then wait roughly 6 hours for swift to backfill.