| | 1 | [[PageOutline]] |
| | 2 | |
| | 3 | = 2010/05/08のRAID5の復旧 = |
| | 4 | |
| | 5 | == 対象 == |
| | 6 | * 500GB x 6 RAID5 array |
| | 7 | |
| | 8 | == 経過 == |
| | 9 | === 原因 === |
| | 10 | 1. RAID5構成diskの6台中1台を誤って接続したまま別のdiskへOSをインストールし、その際に気付かずに当該RAID5構成diskの(おそらく)superblockを消去 |
| | 11 | * sudo fdisk -l /dev/sdg |
| | 12 | {{{ |
| | 13 | /dev/sdg1 1 60789 488287611 fd Linux raid autodetect |
| | 14 | }}} |
| | 15 | * sudo mdadm --examine /dev/sdg1 |
| | 16 | {{{ |
| | 17 | mdadm: No md superblock detected on /dev/sdg1. |
| | 18 | }}} |
| | 19 | * cat /proc/mdstat |
| | 20 | {{{ |
| | 21 | md2 : inactive sdj1[4](S) sdh1[5](S) sdd1[1](S) sdf1[0](S) sdi1[3](S) |
| | 22 | 2441437440 blocks |
| | 23 | }}} |
| | 24 | === 縮退起動 === |
| | 25 | 1. mdadm --runでdegraded start (5/6) |
| | 26 | * sudo mdadm --run /dev/md2 --verbose |
| | 27 | {{{ |
| | 28 | mdadm: started /dev/md2 |
| | 29 | }}} |
| | 30 | * cat /proc/mdstat |
| | 31 | {{{ |
| | 32 | md2 : active raid5 sdj1[4] sdh1[5] sdd1[1] sdf1[0] sdi1[3] |
| | 33 | 2441437440 blocks level 5, 64k chunk, algorithm 2 [6/5] [UU_UUU] |
| | 34 | }}} |
| | 35 | 1. 縮退運転中にまた別のdiskのI/Oエラー、および複数のdiskにhard resetting linkが発生 |
| | 36 | * => [./kern.log#fail] |
| | 37 | * SATA-I/Fとdiskの相性が悪い模様。以前よりたびたび発生していたが、今回は致命的 |
| | 38 | * cat /proc/mdstat |
| | 39 | {{{ |
| | 40 | md2 : active raid5 sdj1[6](F) sdh1[7](F) sdd1[1] sdf1[0] sdi1[8](F) |
| | 41 | 2441437440 blocks level 5, 64k chunk, algorithm 2 [6/2] [UU____] |
| | 42 | }}} |
| | 43 | 1. 再始動不能 |
| | 44 | * sudo mdadm --run /dev/md2 --verbose |
| | 45 | {{{ |
| | 46 | mdadm: failed to run array /dev/md2: Device or resource busy |
| | 47 | }}} |
| | 48 | === OS再起動 === |
| | 49 | 1. system reboot |
| | 50 | 1. OS再起動するも、2/6 failedとなり始動不可 |
| | 51 | * cat /proc/mdstat |
| | 52 | {{{ |
| | 53 | md2 : inactive sdc1[3](S) sdb1[5](S) sdd1[4](S) sdh1[1](S) sdj1[0](S) |
| | 54 | 2441437440 blocks |
| | 55 | }}} |
| | 56 | * sudo mdadm --run /dev/md2 --verbose |
| | 57 | {{{ |
| | 58 | mdadm: failed to run array /dev/md2: Input/output error |
| | 59 | }}} |
| | 60 | * dmesg | tail -n 30 |
| | 61 | {{{ |
| | 62 | [ 78.297675] md: kicking non-fresh sdb1 from array! |
| | 63 | [ 78.297685] md: unbind<sdb1> |
| | 64 | [ 78.321292] md: export_rdev(sdb1) |
| | 65 | [ 78.410382] raid5: device sdc1 operational as raid disk 3 |
| | 66 | [ 78.410384] raid5: device sdd1 operational as raid disk 4 |
| | 67 | [ 78.410386] raid5: device sdh1 operational as raid disk 1 |
| | 68 | [ 78.410388] raid5: device sdj1 operational as raid disk 0 |
| | 69 | [ 78.410861] raid5: allocated 6386kB for md2 |
| | 70 | [ 78.410907] 3: w=1 pa=0 pr=6 m=1 a=2 r=6 op1=0 op2=0 |
| | 71 | [ 78.410910] 4: w=2 pa=0 pr=6 m=1 a=2 r=6 op1=0 op2=0 |
| | 72 | [ 78.410912] 1: w=3 pa=0 pr=6 m=1 a=2 r=6 op1=0 op2=0 |
| | 73 | [ 78.410914] 0: w=4 pa=0 pr=6 m=1 a=2 r=6 op1=0 op2=0 |
| | 74 | [ 78.410916] raid5: not enough operational devices for md2 (2/6 failed) |
| | 75 | [ 78.411098] RAID5 conf printout: |
| | 76 | [ 78.411100] --- rd:6 wd:4 |
| | 77 | [ 78.411102] disk 0, o:1, dev:sdj1 |
| | 78 | [ 78.411103] disk 1, o:1, dev:sdh1 |
| | 79 | [ 78.411105] disk 3, o:1, dev:sdc1 |
| | 80 | [ 78.411107] disk 4, o:1, dev:sdd1 |
| | 81 | [ 78.411529] raid5: failed to run raid set md2 |
| | 82 | [ 78.411651] md: pers->run() failed ... |
| | 83 | }}} |
| | 84 | 1. /dev/sdb1がnon-freshとしてkickされ、mdstatから消滅 |
| | 85 | * cat /proc/mdstat |
| | 86 | {{{ |
| | 87 | md2 : inactive sda1[6](S) sdc1[3] sdd1[4] sdh1[1] sdj1[0] |
| | 88 | 2441437440 blocks |
| | 89 | }}} |
| | 90 | 1. /dev/sdb1をre-addし、再始動 |
| | 91 | * sudo mdadm /dev/md2 -a /dev/sdb1 |
| | 92 | {{{ |
| | 93 | mdadm: re-added /dev/sdb1 |
| | 94 | }}} |
| | 95 | * sudo mdadm --run /dev/md2 --verbose |
| | 96 | {{{ |
| | 97 | mdadm: started /dev/md2 |
| | 98 | }}} |
| | 99 | * cat kern.log |
| | 100 | {{{ |
| | 101 | May 8 23:07:16 HOSTNAME kernel: [ 308.856084] md: bind<sdb1> |
| | 102 | May 8 23:07:19 HOSTNAME kernel: [ 311.836915] raid5: device sdb1 operational as raid disk 5 |
| | 103 | May 8 23:07:19 HOSTNAME kernel: [ 311.836923] raid5: device sdc1 operational as raid disk 3 |
| | 104 | May 8 23:07:19 HOSTNAME kernel: [ 311.836929] raid5: device sdd1 operational as raid disk 4 |
| | 105 | May 8 23:07:19 HOSTNAME kernel: [ 311.836934] raid5: device sdh1 operational as raid disk 1 |
| | 106 | May 8 23:07:19 HOSTNAME kernel: [ 311.836939] raid5: device sdj1 operational as raid disk 0 |
| | 107 | May 8 23:07:19 HOSTNAME kernel: [ 311.838484] raid5: allocated 6386kB for md2 |
| | 108 | May 8 23:07:19 HOSTNAME kernel: [ 311.838789] 5: w=1 pa=0 pr=6 m=1 a=2 r=6 op1=0 op2=0 |
| | 109 | May 8 23:07:19 HOSTNAME kernel: [ 311.838796] 3: w=2 pa=0 pr=6 m=1 a=2 r=6 op1=0 op2=0 |
| | 110 | May 8 23:07:19 HOSTNAME kernel: [ 311.838801] 4: w=3 pa=0 pr=6 m=1 a=2 r=6 op1=0 op2=0 |
| | 111 | May 8 23:07:19 HOSTNAME kernel: [ 311.838807] 1: w=4 pa=0 pr=6 m=1 a=2 r=6 op1=0 op2=0 |
| | 112 | May 8 23:07:19 HOSTNAME kernel: [ 311.838812] 0: w=5 pa=0 pr=6 m=1 a=2 r=6 op1=0 op2=0 |
| | 113 | May 8 23:07:19 HOSTNAME kernel: [ 311.838818] raid5: raid level 5 set md2 active with 5 out of 6 devices, algorithm 2 |
| | 114 | May 8 23:07:19 HOSTNAME kernel: [ 311.852170] RAID5 conf printout: |
| | 115 | May 8 23:07:19 HOSTNAME kernel: [ 311.852174] --- rd:6 wd:5 |
| | 116 | May 8 23:07:19 HOSTNAME kernel: [ 311.852179] disk 0, o:1, dev:sdj1 |
| | 117 | May 8 23:07:19 HOSTNAME kernel: [ 311.852184] disk 1, o:1, dev:sdh1 |
| | 118 | May 8 23:07:19 HOSTNAME kernel: [ 311.852188] disk 3, o:1, dev:sdc1 |
| | 119 | May 8 23:07:19 HOSTNAME kernel: [ 311.852192] disk 4, o:1, dev:sdd1 |
| | 120 | May 8 23:07:19 HOSTNAME kernel: [ 311.852196] disk 5, o:1, dev:sdb1 |
| | 121 | May 8 23:07:19 HOSTNAME kernel: [ 311.852306] md2: detected capacity change from 0 to 2500031938560 |
| | 122 | May 8 23:07:19 HOSTNAME kernel: [ 311.852642] md2:RAID5 conf printout: |
| | 123 | May 8 23:07:19 HOSTNAME kernel: [ 311.853380] --- rd:6 wd:5 |
| | 124 | May 8 23:07:19 HOSTNAME kernel: [ 311.853386] disk 0, o:1, dev:sdj1 |
| | 125 | May 8 23:07:19 HOSTNAME kernel: [ 311.853390] disk 1, o:1, dev:sdh1 |
| | 126 | May 8 23:07:19 HOSTNAME kernel: [ 311.853394] disk 2, o:1, dev:sda1 |
| | 127 | May 8 23:07:19 HOSTNAME kernel: [ 311.853398] disk 3, o:1, dev:sdc1 |
| | 128 | May 8 23:07:19 HOSTNAME kernel: [ 311.853402] disk 4, o:1, dev:sdd1 |
| | 129 | May 8 23:07:19 HOSTNAME kernel: [ 311.853406] disk 5, o:1, dev:sdb1 |
| | 130 | May 8 23:07:19 HOSTNAME kernel: [ 311.853513] unknown partition table |
| | 131 | May 8 23:07:19 HOSTNAME kernel: [ 311.855855] md: recovery of RAID array md2 |
| | 132 | May 8 23:07:19 HOSTNAME kernel: [ 311.855863] md: minimum _guaranteed_ speed: 1000 KB/sec/disk. |
| | 133 | May 8 23:07:19 HOSTNAME kernel: [ 311.855868] md: using maximum available idle IO bandwidth (but not more than 200000 KB/sec) for recovery. |
| | 134 | May 8 23:07:19 HOSTNAME kernel: [ 311.855883] md: using 128k window, over a total of 488287488 blocks. |
| | 135 | }}} |
| | 136 | * cat /proc/mdstat |
| | 137 | {{{ |
| | 138 | md2 : active raid5 sdb1[5] sda1[6] sdc1[3] sdd1[4] sdh1[1] sdj1[0] |
| | 139 | 2441437440 blocks level 5, 64k chunk, algorithm 2 [6/5] [UU_UUU] |
| | 140 | [>....................] recovery = 0.0% (117376/488287488) finish=415.8min speed=19562K/sec |
| | 141 | }}} |
| | 142 | 1. 再度diskのI/Oエラー、および複数のdiskにhard resetting linkが発生 |
| | 143 | * => [./kern.log#fail2] |
| | 144 | === 物理的配置換え === |
| | 145 | 1. 問題の起きるSATA-I/Fの使用を諦め、別のM/Bにdisk6台を繋ぎなおす |
| | 146 | * cat /proc/mdstat |
| | 147 | {{{ |
| | 148 | md2 : inactive sdd1[5](S) sdg1[7](S) sdf1[0](S) sde1[4](S) sdc1[1](S) sdh1[3](S) |
| | 149 | 2929724928 blocks |
| | 150 | }}} |
| | 151 | 1. 今度は4/6 failedとなり再始動不可 |
| | 152 | * sudo mdadm --run /dev/md2 |
| | 153 | {{{ |
| | 154 | mdadm: failed to run array /dev/md2: Input/output error |
| | 155 | }}} |
| | 156 | * dmesg | tail -n 30 |
| | 157 | {{{ |
| | 158 | [ 128.378868] md: kicking non-fresh sdd1 from array! |
| | 159 | [ 128.378876] md: unbind<sdd1> |
| | 160 | [ 128.400016] md: export_rdev(sdd1) |
| | 161 | [ 128.400096] md: kicking non-fresh sde1 from array! |
| | 162 | [ 128.400101] md: unbind<sde1> |
| | 163 | [ 128.430012] md: export_rdev(sde1) |
| | 164 | [ 128.430082] md: kicking non-fresh sdh1 from array! |
| | 165 | [ 128.430087] md: unbind<sdh1> |
| | 166 | [ 128.500012] md: export_rdev(sdh1) |
| | 167 | [ 128.564040] raid5: device sdf1 operational as raid disk 0 |
| | 168 | [ 128.564043] raid5: device sdc1 operational as raid disk 1 |
| | 169 | [ 128.564449] raid5: allocated 6386kB for md2 |
| | 170 | [ 128.564469] 0: w=1 pa=0 pr=6 m=1 a=2 r=6 op1=0 op2=0 |
| | 171 | [ 128.564471] 1: w=2 pa=0 pr=6 m=1 a=2 r=6 op1=0 op2=0 |
| | 172 | [ 128.564472] raid5: not enough operational devices for md2 (4/6 failed) |
| | 173 | [ 128.564720] RAID5 conf printout: |
| | 174 | [ 128.564722] --- rd:6 wd:2 |
| | 175 | [ 128.564723] disk 0, o:1, dev:sdf1 |
| | 176 | [ 128.564725] disk 1, o:1, dev:sdc1 |
| | 177 | [ 128.564917] raid5: failed to run raid set md2 |
| | 178 | [ 128.565090] md: pers->run() failed ... |
| | 179 | }}} |
| | 180 | 1. unbindされたdevice (sdd1, sde1, sdh1) をre-add。sdg1はいったんhot removeしてからre-add |
| | 181 | * sudo mdadm /dev/md2 -a /dev/sdc1 |
| | 182 | {{{ |
| | 183 | mdadm: Cannot open /dev/sdc1: Device or resource busy |
| | 184 | }}} |
| | 185 | * sudo mdadm /dev/md2 -a /dev/sdd1 |
| | 186 | {{{ |
| | 187 | mdadm: re-added /dev/sdd1 |
| | 188 | }}} |
| | 189 | * sudo mdadm /dev/md2 -a /dev/sde1 |
| | 190 | {{{ |
| | 191 | mdadm: re-added /dev/sde1 |
| | 192 | }}} |
| | 193 | * sudo mdadm /dev/md2 -a /dev/sdf1 |
| | 194 | {{{ |
| | 195 | mdadm: Cannot open /dev/sdf1: Device or resource busy |
| | 196 | }}} |
| | 197 | * sudo mdadm /dev/md2 -a /dev/sdg1 |
| | 198 | {{{ |
| | 199 | mdadm: Cannot open /dev/sdg1: Device or resource busy |
| | 200 | }}} |
| | 201 | * sudo mdadm /dev/md2 -a /dev/sdh1 |
| | 202 | {{{ |
| | 203 | mdadm: re-added /dev/sdh1 |
| | 204 | }}} |
| | 205 | * sudo mdadm /dev/md2 -a /dev/sdi1 |
| | 206 | {{{ |
| | 207 | mdadm: cannot find /dev/sdi1: No such file or directory |
| | 208 | }}} |
| | 209 | * sudo mdadm /dev/md2 -r /dev/sdg1 |
| | 210 | {{{ |
| | 211 | mdadm: hot removed /dev/sdg1 |
| | 212 | }}} |
| | 213 | * sudo mdadm /dev/md2 -a /dev/sdg1 |
| | 214 | {{{ |
| | 215 | mdadm: re-added /dev/sdg1 |
| | 216 | }}} |
| | 217 | 1. 再始動 |
| | 218 | * sudo mdadm --run /dev/md2 |
| | 219 | {{{ |
| | 220 | mdadm: started /dev/md2 |
| | 221 | }}} |
| | 222 | 1. rebuilding |
| | 223 | * cat /proc/mdstat |
| | 224 | {{{ |
| | 225 | md2 : active raid5 sdg1[7] sdh1[3] sde1[4] sdd1[5] sdf1[0] sdc1[1] |
| | 226 | 2441437440 blocks level 5, 64k chunk, algorithm 2 [6/5] [UU_UUU] |
| | 227 | [>....................] recovery = 1.0% (5284864/488287488) finish=128.6min speed=62558K/sec |
| | 228 | }}} |
| | 229 | 1. rebuild completed |
| | 230 | * cat /proc/mdstat |
| | 231 | {{{ |
| | 232 | md2 : active raid5 sdg1[2] sdh1[3] sde1[4] sdd1[5] sdf1[0] sdc1[1] |
| | 233 | 2441437440 blocks level 5, 64k chunk, algorithm 2 [6/6] [UUUUUU] |
| | 234 | }}} |
| | 235 | * => [./kern.log#recover] |