Looking into this at the surface level, I don’t see any driver differences between Cobia and Dragonfish that would be a breaking change; they are the same.
For those of you affected here, was this working on an earlier version of SCALE?
root@prod[~]# modinfo mpt2sas
filename: /lib/modules/6.6.29-production+truenas/kernel/drivers/scsi/mpt3sas/mpt3sas.ko
alias: mpt2sas
version: 43.100.00.00
license: GPL
description: LSI MPT Fusion SAS 3.0 Device Driver
author: Avago Technologies <MPT-FusionLinux.pdl@avagotech.com>
srcversion: 595E9DBF21F4E99E6DDADC4
alias: pci:v00001000d000000E7sv*sd*bc*sc*i*
alias: pci:v00001000d000000E4sv*sd*bc*sc*i*
alias: pci:v0000117Cd000000E6sv*sd*bc*sc*i*
alias: pci:v00001000d000000E6sv*sd*bc*sc*i*
alias: pci:v00001000d000000E5sv*sd*bc*sc*i*
alias: pci:v00001000d000000B2sv*sd*bc*sc*i*
alias: pci:v00001000d000000E3sv*sd*bc*sc*i*
alias: pci:v00001000d000000E0sv*sd*bc*sc*i*
alias: pci:v00001000d000000E2sv*sd*bc*sc*i*
alias: pci:v00001000d000000E1sv*sd*bc*sc*i*
alias: pci:v00001000d000000D1sv*sd*bc*sc*i*
alias: pci:v00001000d000000ACsv*sd*bc*sc*i*
alias: pci:v00001000d000000ABsv*sd*bc*sc*i*
alias: pci:v00001000d000000AAsv*sd*bc*sc*i*
alias: pci:v00001000d000000AFsv*sd*bc*sc*i*
alias: pci:v00001000d000000AEsv*sd*bc*sc*i*
alias: pci:v00001000d000000ADsv*sd*bc*sc*i*
alias: pci:v00001000d000000C3sv*sd*bc*sc*i*
alias: pci:v00001000d000000C2sv*sd*bc*sc*i*
alias: pci:v00001000d000000C1sv*sd*bc*sc*i*
alias: pci:v00001000d000000C0sv*sd*bc*sc*i*
alias: pci:v00001000d000000C8sv*sd*bc*sc*i*
alias: pci:v00001000d000000C7sv*sd*bc*sc*i*
alias: pci:v00001000d000000C6sv*sd*bc*sc*i*
alias: pci:v00001000d000000C5sv*sd*bc*sc*i*
alias: pci:v00001000d000000C4sv*sd*bc*sc*i*
alias: pci:v00001000d000000C9sv*sd*bc*sc*i*
alias: pci:v00001000d00000095sv*sd*bc*sc*i*
alias: pci:v00001000d00000094sv*sd*bc*sc*i*
alias: pci:v00001000d00000091sv*sd*bc*sc*i*
alias: pci:v00001000d00000090sv*sd*bc*sc*i*
alias: pci:v00001000d00000097sv*sd*bc*sc*i*
alias: pci:v00001000d00000096sv*sd*bc*sc*i*
alias: pci:v00001000d0000007Esv*sd*bc*sc*i*
alias: pci:v00001000d000002B1sv*sd*bc*sc*i*
alias: pci:v00001000d000002B0sv*sd*bc*sc*i*
alias: pci:v00001000d0000006Esv*sd*bc*sc*i*
alias: pci:v00001000d00000087sv*sd*bc*sc*i*
alias: pci:v00001000d00000086sv*sd*bc*sc*i*
alias: pci:v00001000d00000085sv*sd*bc*sc*i*
alias: pci:v00001000d00000084sv*sd*bc*sc*i*
alias: pci:v00001000d00000083sv*sd*bc*sc*i*
alias: pci:v00001000d00000082sv*sd*bc*sc*i*
alias: pci:v00001000d00000081sv*sd*bc*sc*i*
alias: pci:v00001000d00000080sv*sd*bc*sc*i*
alias: pci:v00001000d00000065sv*sd*bc*sc*i*
alias: pci:v00001000d00000064sv*sd*bc*sc*i*
alias: pci:v00001000d00000077sv*sd*bc*sc*i*
alias: pci:v00001000d00000076sv*sd*bc*sc*i*
alias: pci:v00001000d00000074sv*sd*bc*sc*i*
alias: pci:v00001000d00000072sv*sd*bc*sc*i*
alias: pci:v00001000d00000070sv*sd*bc*sc*i*
depends: scsi_mod,scsi_transport_sas,raid_class,scsi_common
retpoline: Y
intree: Y
name: mpt3sas
vermagic: 6.6.29-production+truenas SMP preempt mod_unload modversions
parm: logging_level: bits for enabling additional logging info (default=0)
parm: max_sectors:max sectors, range 64 to 32767 default=32767 (ushort)
parm: missing_delay: device missing delay , io missing delay (array of int)
parm: max_lun: max lun, default=16895 (ullong)
parm: hbas_to_enumerate: 0 - enumerates both SAS 2.0 & SAS 3.0 generation HBAs
1 - enumerates only SAS 2.0 generation HBAs
2 - enumerates only SAS 3.0 generation HBAs (default=0) (ushort)
parm: diag_buffer_enable: post diag buffers (TRACE=1/SNAPSHOT=2/EXTENDED=4/default=0) (int)
parm: disable_discovery: disable discovery (int)
parm: prot_mask: host protection capabilities mask, def=7 (int)
parm: enable_sdev_max_qd:Enable sdev max qd as can_queue, def=disabled(0) (bool)
parm: multipath_on_hba:Multipath support to add same target device
as many times as it is visible to HBA from various paths
(by default:
SAS 2.0 & SAS 3.0 HBA - This will be disabled,
SAS 3.5 HBA - This will be enabled) (int)
parm: host_tagset_enable:Shared host tagset enable/disable Default: enable(1) (int)
parm: max_queue_depth: max controller queue depth (int)
parm: max_sgl_entries: max sg entries (int)
parm: msix_disable: disable msix routed interrupts (default=0) (int)
parm: smp_affinity_enable:SMP affinity feature enable/disable Default: enable(1) (int)
parm: max_msix_vectors: max msix vectors (int)
parm: irqpoll_weight:irq poll weight (default= one fourth of HBA queue depth) (int)
parm: mpt3sas_fwfault_debug: enable detection of firmware fault and halt firmware - (default=0)
parm: perf_mode:Performance mode (only for Aero/Sea Generation), options:
0 - balanced: high iops mode is enabled &
interrupt coalescing is enabled only on high iops queues,
1 - iops: high iops mode is disabled &
interrupt coalescing is enabled on all queues,
2 - latency: high iops mode is disabled &
interrupt coalescing is enabled on all queues with timeout value 0xA,
default - default perf_mode is 'balanced' (int)
parm: poll_queues:Number of queues to be use for io_uring poll mode.
This parameter is effective only if host_tagset_enable=1. &
when poll_queues are enabled then &
perf_mode is set to latency mode. &
(int)
root@prod[~]#
root@prod[~]# cat /etc/version
24.04.1.1# root@prod[~]#
root@rawht[~]# modinfo mpt2sas
filename: /lib/modules/6.1.55-production+truenas/kernel/drivers/scsi/mpt3sas/mpt3sas.ko
alias: mpt2sas
version: 43.100.00.00
license: GPL
description: LSI MPT Fusion SAS 3.0 Device Driver
author: Avago Technologies <MPT-FusionLinux.pdl@avagotech.com>
srcversion: 83B7953062D26A5B86911CD
alias: pci:v00001000d000000E7sv*sd*bc*sc*i*
alias: pci:v00001000d000000E4sv*sd*bc*sc*i*
alias: pci:v0000117Cd000000E6sv*sd*bc*sc*i*
alias: pci:v00001000d000000E6sv*sd*bc*sc*i*
alias: pci:v00001000d000000E5sv*sd*bc*sc*i*
alias: pci:v00001000d000000B2sv*sd*bc*sc*i*
alias: pci:v00001000d000000E3sv*sd*bc*sc*i*
alias: pci:v00001000d000000E0sv*sd*bc*sc*i*
alias: pci:v00001000d000000E2sv*sd*bc*sc*i*
alias: pci:v00001000d000000E1sv*sd*bc*sc*i*
alias: pci:v00001000d000000D1sv*sd*bc*sc*i*
alias: pci:v00001000d000000ACsv*sd*bc*sc*i*
alias: pci:v00001000d000000ABsv*sd*bc*sc*i*
alias: pci:v00001000d000000AAsv*sd*bc*sc*i*
alias: pci:v00001000d000000AFsv*sd*bc*sc*i*
alias: pci:v00001000d000000AEsv*sd*bc*sc*i*
alias: pci:v00001000d000000ADsv*sd*bc*sc*i*
alias: pci:v00001000d000000C3sv*sd*bc*sc*i*
alias: pci:v00001000d000000C2sv*sd*bc*sc*i*
alias: pci:v00001000d000000C1sv*sd*bc*sc*i*
alias: pci:v00001000d000000C0sv*sd*bc*sc*i*
alias: pci:v00001000d000000C8sv*sd*bc*sc*i*
alias: pci:v00001000d000000C7sv*sd*bc*sc*i*
alias: pci:v00001000d000000C6sv*sd*bc*sc*i*
alias: pci:v00001000d000000C5sv*sd*bc*sc*i*
alias: pci:v00001000d000000C4sv*sd*bc*sc*i*
alias: pci:v00001000d000000C9sv*sd*bc*sc*i*
alias: pci:v00001000d00000095sv*sd*bc*sc*i*
alias: pci:v00001000d00000094sv*sd*bc*sc*i*
alias: pci:v00001000d00000091sv*sd*bc*sc*i*
alias: pci:v00001000d00000090sv*sd*bc*sc*i*
alias: pci:v00001000d00000097sv*sd*bc*sc*i*
alias: pci:v00001000d00000096sv*sd*bc*sc*i*
alias: pci:v00001000d0000007Esv*sd*bc*sc*i*
alias: pci:v00001000d000002B1sv*sd*bc*sc*i*
alias: pci:v00001000d000002B0sv*sd*bc*sc*i*
alias: pci:v00001000d0000006Esv*sd*bc*sc*i*
alias: pci:v00001000d00000087sv*sd*bc*sc*i*
alias: pci:v00001000d00000086sv*sd*bc*sc*i*
alias: pci:v00001000d00000085sv*sd*bc*sc*i*
alias: pci:v00001000d00000084sv*sd*bc*sc*i*
alias: pci:v00001000d00000083sv*sd*bc*sc*i*
alias: pci:v00001000d00000082sv*sd*bc*sc*i*
alias: pci:v00001000d00000081sv*sd*bc*sc*i*
alias: pci:v00001000d00000080sv*sd*bc*sc*i*
alias: pci:v00001000d00000065sv*sd*bc*sc*i*
alias: pci:v00001000d00000064sv*sd*bc*sc*i*
alias: pci:v00001000d00000077sv*sd*bc*sc*i*
alias: pci:v00001000d00000076sv*sd*bc*sc*i*
alias: pci:v00001000d00000074sv*sd*bc*sc*i*
alias: pci:v00001000d00000072sv*sd*bc*sc*i*
alias: pci:v00001000d00000070sv*sd*bc*sc*i*
depends: scsi_mod,scsi_transport_sas,scsi_common,raid_class
retpoline: Y
intree: Y
name: mpt3sas
vermagic: 6.1.55-production+truenas SMP preempt mod_unload modversions
parm: logging_level: bits for enabling additional logging info (default=0)
parm: max_sectors:max sectors, range 64 to 32767 default=32767 (ushort)
parm: missing_delay: device missing delay , io missing delay (array of int)
parm: max_lun: max lun, default=16895 (ullong)
parm: hbas_to_enumerate: 0 - enumerates both SAS 2.0 & SAS 3.0 generation HBAs
1 - enumerates only SAS 2.0 generation HBAs
2 - enumerates only SAS 3.0 generation HBAs (default=0) (ushort)
parm: diag_buffer_enable: post diag buffers (TRACE=1/SNAPSHOT=2/EXTENDED=4/default=0) (int)
parm: disable_discovery: disable discovery (int)
parm: prot_mask: host protection capabilities mask, def=7 (int)
parm: enable_sdev_max_qd:Enable sdev max qd as can_queue, def=disabled(0) (bool)
parm: multipath_on_hba:Multipath support to add same target device
as many times as it is visible to HBA from various paths
(by default:
SAS 2.0 & SAS 3.0 HBA - This will be disabled,
SAS 3.5 HBA - This will be enabled) (int)
parm: host_tagset_enable:Shared host tagset enable/disable Default: enable(1) (int)
parm: max_queue_depth: max controller queue depth (int)
parm: max_sgl_entries: max sg entries (int)
parm: msix_disable: disable msix routed interrupts (default=0) (int)
parm: smp_affinity_enable:SMP affinity feature enable/disable Default: enable(1) (int)
parm: max_msix_vectors: max msix vectors (int)
parm: irqpoll_weight:irq poll weight (default= one fourth of HBA queue depth) (int)
parm: mpt3sas_fwfault_debug: enable detection of firmware fault and halt firmware - (default=0)
parm: perf_mode:Performance mode (only for Aero/Sea Generation), options:
0 - balanced: high iops mode is enabled &
interrupt coalescing is enabled only on high iops queues,
1 - iops: high iops mode is disabled &
interrupt coalescing is enabled on all queues,
2 - latency: high iops mode is disabled &
interrupt coalescing is enabled on all queues with timeout value 0xA,
default - default perf_mode is 'balanced' (int)
parm: poll_queues:Number of queues to be use for io_uring poll mode.
This parameter is effective only if host_tagset_enable=1. &
when poll_queues are enabled then &
perf_mode is set to latency mode. &
(int)
root@rawht[~]# cat /etc/version
23.10.0.1#
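For anyone else who wants to compare their own two systems the same way, a quick sketch (the filenames and the copying step are my own suggestion, not anything TrueNAS ships):
# Dump the module metadata on each system, tagged with its SCALE version
modinfo mpt3sas > /tmp/mpt3sas-"$(cat /etc/version)".txt
# Copy one file to the other box (scp, shared dataset, etc.), then diff them
diff /tmp/mpt3sas-24.04.1.1.txt /tmp/mpt3sas-23.10.0.1.txt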
I do not know if this has “ever” worked on SCALE specifically. This is the first time I’ve ever run three pools on a server. I tested on a couple of versions, including Dragonfish and 23.10, which is where I first experienced the issue. But again, this was my first time attempting such a build.
It almost seems like some sort of race condition causing inconsistent enumeration of the drives. The hardware itself is definitely OK, as CORE does not seem to experience it.
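If anyone wants to test that theory, a rough sketch of what I’d do (the output path and column list are just my choice): snapshot the serial-to-sdX mapping on each boot and diff the snapshots after a reboot.
# Record how the kernel named each physical disk on this boot
OUT=/root/disk-map-$(date +%Y%m%d-%H%M%S).txt
lsblk -d -o NAME,SERIAL,WWN,MODEL,SIZE > "$OUT"
# Also record the by-partuuid symlinks that ZFS imports against
ls -l /dev/disk/by-partuuid/ >> "$OUT"
# Repeat after the next reboot and compare the two files with diff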
For me, I changed from CORE to SCALE 24.04.0 RC1, have since upgraded to 24.04.0 and 24.04.1, and have had this problem the whole time.
No problems on CORE 13.0-U6.1 or any earlier CORE versions.
Unfortunately I can’t provide a useful debug, because I did a fresh install and since then I always export my pools before I reboot.
I am scared of rebooting without exporting the pools and maybe corrupting them.
There we go. Sent you a debug
If you need any data, or there are commands I can run and post the output of, let me know; I am more than happy to provide it.
Interesting. The way the logs read, it sounds like a cabling issue.
- We see the kernel throw CDB errors, indicating it can’t talk to the drives
- We see zio throw ZFS errors, indicating it can’t talk to the drives
- We see the mpt3sas driver indicating drives were removed
May 29 13:20:17 truenas kernel: zio pool=G_Backup vdev=/dev/disk/by-partuuid/a6d9ce3d-04dc-44a7-9896-f12b8b5ce74e error=5 type=1 offset=270336 size=8192 flags=721601
May 29 13:20:17 truenas kernel: zio pool=G_Backup vdev=/dev/disk/by-partuuid/a6d9ce3d-04dc-44a7-9896-f12b8b5ce74e error=5 type=1 offset=10000827686912 size=8192 flags=721601
May 29 13:20:17 truenas kernel: sd 8:0:3:0: [sde] tag#388 FAILED Result: hostbyte=DID_NO_CONNECT driverbyte=DRIVER_OK cmd_age=0s
May 29 13:20:17 truenas kernel: sd 8:0:2:0: [sdg] tag#2790 FAILED Result: hostbyte=DID_NO_CONNECT driverbyte=DRIVER_OK cmd_age=0s
May 29 13:20:17 truenas kernel: sd 8:0:3:0: [sde] tag#388 CDB: Write(16) 8a 00 00 00 00 04 54 00 57 d0 00 00 00 08 00 00
May 29 13:20:17 truenas kernel: sd 8:0:2:0: [sdg] tag#2790 CDB: Write(16) 8a 00 00 00 00 02 d9 c0 cf d8 00 00 00 08 00 00
May 29 13:20:17 truenas kernel: zio pool=G_Backup vdev=/dev/disk/by-partuuid/bb3663c2-e327-467a-b1c7-d625f4cee075 error=5 type=2 offset=9517656940544 size=4096 flags=1572992
May 29 13:20:17 truenas kernel: zio pool=G_Backup vdev=/dev/disk/by-partuuid/a9cadbcc-7fe9-46ac-9f6e-083124f29cc2 error=5 type=2 offset=6268529913856 size=4096 flags=1572992
May 29 13:20:17 truenas kernel: sd 8:0:1:0: [sdf] tag#389 FAILED Result: hostbyte=DID_NO_CONNECT driverbyte=DRIVER_OK cmd_age=0s
May 29 13:20:17 truenas kernel: sd 8:0:3:0: [sde] tag#393 FAILED Result: hostbyte=DID_NO_CONNECT driverbyte=DRIVER_OK cmd_age=0s
May 29 13:20:17 truenas kernel: sd 8:0:3:0: [sde] tag#393 CDB: Write(16) 8a 00 00 00 00 01 0d 80 37 60 00 00 00 20 00 00
May 29 13:20:17 truenas kernel: zio pool=G_Backup vdev=/dev/disk/by-partuuid/bb3663c2-e327-467a-b1c7-d625f4cee075 error=5 type=2 offset=2314992533504 size=16384 flags=1572992
May 29 13:20:17 truenas kernel: sd 8:0:3:0: [sde] tag#394 FAILED Result: hostbyte=DID_NO_CONNECT driverbyte=DRIVER_OK cmd_age=0s
May 29 13:20:17 truenas kernel: sd 8:0:3:0: [sde] tag#394 CDB: Read(16) 88 00 00 00 00 00 00 00 12 10 00 00 00 10 00 00
May 29 13:20:17 truenas kernel: zio pool=G_Backup vdev=/dev/disk/by-partuuid/bb3663c2-e327-467a-b1c7-d625f4cee075 error=5 type=1 offset=270336 size=8192 flags=721089
May 29 13:20:17 truenas kernel: sd 8:0:3:0: [sde] tag#395 FAILED Result: hostbyte=DID_NO_CONNECT driverbyte=DRIVER_OK cmd_age=0s
May 29 13:20:17 truenas kernel: sd 8:0:3:0: [sde] tag#395 CDB: Read(16) 88 00 00 00 00 04 8c 3f f4 10 00 00 00 10 00 00
May 29 13:20:17 truenas kernel: zio pool=G_Backup vdev=/dev/disk/by-partuuid/bb3663c2-e327-467a-b1c7-d625f4cee075 error=5 type=1 offset=10000827686912 size=8192 flags=721089
May 29 13:20:17 truenas kernel: sd 8:0:3:0: [sde] tag#396 FAILED Result: hostbyte=DID_NO_CONNECT driverbyte=DRIVER_OK cmd_age=0s
May 29 13:20:17 truenas kernel: sd 8:0:3:0: [sde] tag#396 CDB: Read(16) 88 00 00 00 00 04 8c 3f f6 10 00 00 00 10 00 00
May 29 13:20:17 truenas kernel: zio pool=G_Backup vdev=/dev/disk/by-partuuid/bb3663c2-e327-467a-b1c7-d625f4cee075 error=5 type=1 offset=10000827949056 size=8192 flags=721089
May 29 13:20:17 truenas kernel: sd 8:0:3:0: [sde] tag#5301 FAILED Result: hostbyte=DID_NO_CONNECT driverbyte=DRIVER_OK cmd_age=0s
May 29 13:20:17 truenas kernel: sd 8:0:3:0: [sde] tag#5301 CDB: Write(16) 8a 00 00 00 00 01 0e e2 36 e0 00 00 00 20 00 00
May 29 13:20:17 truenas kernel: zio pool=G_Backup vdev=/dev/disk/by-partuuid/bb3663c2-e327-467a-b1c7-d625f4cee075 error=5 type=2 offset=2326870736896 size=16384 flags=1572992
May 29 13:20:17 truenas kernel: sd 8:0:3:0: [sde] tag#397 FAILED Result: hostbyte=DID_NO_CONNECT driverbyte=DRIVER_OK cmd_age=0s
May 29 13:20:17 truenas kernel: sd 8:0:3:0: [sde] tag#397 CDB: Write(16) 8a 00 00 00 00 01 14 88 25 a0 00 00 00 08 00 00
May 29 13:20:17 truenas kernel: zio pool=G_Backup vdev=/dev/disk/by-partuuid/bb3663c2-e327-467a-b1c7-d625f4cee075 error=5 type=2 offset=2375388184576 size=4096 flags=1572992
May 29 13:20:17 truenas kernel: sd 8:0:3:0: [sde] tag#5302 FAILED Result: hostbyte=DID_NO_CONNECT driverbyte=DRIVER_OK cmd_age=0s
May 29 13:20:17 truenas kernel: sd 8:0:3:0: [sde] tag#5302 CDB: Write(16) 8a 00 00 00 00 01 1e 00 35 58 00 00 00 08 00 00
May 29 13:20:17 truenas kernel: zio pool=G_Backup vdev=/dev/disk/by-partuuid/bb3663c2-e327-467a-b1c7-d625f4cee075 error=5 type=2 offset=2456726188032 size=4096 flags=1572992
May 29 13:20:17 truenas kernel: zio pool=G_Backup vdev=/dev/disk/by-partuuid/a9cadbcc-7fe9-46ac-9f6e-083124f29cc2 error=5 type=2 offset=9139762241536 size=4096 flags=1572992
May 29 13:20:17 truenas kernel: zio pool=G_Backup vdev=/dev/disk/by-partuuid/a9cadbcc-7fe9-46ac-9f6e-083124f29cc2 error=5 type=1 offset=270336 size=8192 flags=721089
May 29 13:20:17 truenas kernel: zio pool=G_Backup vdev=/dev/disk/by-partuuid/a9cadbcc-7fe9-46ac-9f6e-083124f29cc2 error=5 type=1 offset=10000827686912 size=8192 flags=721089
May 29 13:20:17 truenas kernel: zio pool=G_Backup vdev=/dev/disk/by-partuuid/a9cadbcc-7fe9-46ac-9f6e-083124f29cc2 error=5 type=1 offset=10000827949056 size=8192 flags=721089
May 29 13:20:17 truenas kernel: zio pool=G_Backup vdev=/dev/disk/by-partuuid/a9cadbcc-7fe9-46ac-9f6e-083124f29cc2 error=5 type=2 offset=2314992537600 size=12288 flags=1572992
May 29 13:20:17 truenas kernel: zio pool=G_Backup vdev=/dev/disk/by-partuuid/a9cadbcc-7fe9-46ac-9f6e-083124f29cc2 error=5 type=2 offset=2326870740992 size=12288 flags=1572992
May 29 13:20:17 truenas kernel: zio pool=G_Backup vdev=/dev/disk/by-partuuid/a9cadbcc-7fe9-46ac-9f6e-083124f29cc2 error=5 type=2 offset=2456726188032 size=4096 flags=1572992
May 29 13:20:17 truenas kernel: sd 8:0:1:0: [sdf] tag#389 CDB: Write(16) 8a 00 00 00 00 04 54 00 57 d0 00 00 00 08 00 00
..............
May 29 13:20:17 truenas kernel: mpt2sas_cm1: mpt3sas_transport_port_remove: removed: sas_addr(0x500a09800809138a)
May 29 13:20:17 truenas kernel: mpt2sas_cm1: removing handle(0x000c), sas_addr(0x500a09800809138a)
May 29 13:20:17 truenas kernel: mpt2sas_cm1: enclosure logical id(0x500a098006c94a8b), slot(2)
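(For reference, those three signatures can be pulled out of the current boot in one pass; the pattern below is just built from the lines quoted above.)
# Kernel messages from this boot: transport failures, ZFS zio errors, mpt3sas removals
journalctl -k -b 0 | grep -E 'DID_NO_CONNECT|zio pool=|mpt3sas_transport_port_remove'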
The disks “being removed” and resetting seems consistent with the strange drive labelling, but the cabling has been triple checked, and I even went as far as to reseat all the drive bays to see if maybe that was the trouble.
But again, CORE seems to think the drives are all there and fine.
It’s also strange that simply having the disks in there doesn’t seem to cause the issue; it only occurs once the third pool has been created and the system reboots. Referring to SCALE here.
Just so that I’m clear: I do believe you, I’m just reporting what I am seeing.
Still digging.
Oh no, I didn’t mean to imply otherwise. Apologies if I came off that way!
I found a kern.log from yesterday; how can I send it to you?
I can’t send it via message; I get this error: “Sorry, the file you are trying to upload is not authorized (authorized extensions: jpg, jpeg, png, gif, heic, heif, webp, avif, sh).”
I sent mine as a Google Drive link.
@Bloodpack Do you also have the problem with 3 pools and not with fewer?
Your kernel logs are a bit different. It looks like it’s trying to talk to the drives and then times out after 30 seconds.
May 3 23:44:07 storage kernel: sd 1:0:12:0: [sdy] Attached SCSI disk
May 3 23:48:31 storage kernel: ses 1:0:0:0: attempting task abort!scmd(0x000000005948b215), outstanding for 30252 ms & timeout 30000 ms
May 3 23:48:31 storage kernel: ses 1:0:0:0: tag#1454 CDB: Receive Diagnostic 1c 01 02 01 6c 00
May 3 23:48:31 storage kernel: scsi target1:0:0: handle(0x000a), sas_address(0x500a0980019f07fe), phy(36)
May 3 23:48:31 storage kernel: scsi target1:0:0: enclosure logical id(0x50050cc10204053c), slot(0)
May 3 23:48:31 storage kernel: ses 1:0:0:0: task abort: SUCCESS scmd(0x000000005948b215)
May 3 23:48:31 storage kernel: ses 1:0:0:0: attempting task abort!scmd(0x0000000092171d32), outstanding for 30264 ms & timeout 30000 ms
May 3 23:48:31 storage kernel: ses 1:0:0:0: tag#1453 CDB: Receive Diagnostic 1c 01 02 01 6c 00
May 3 23:48:31 storage kernel: scsi target1:0:0: handle(0x000a), sas_address(0x500a0980019f07fe), phy(36)
May 3 23:48:31 storage kernel: scsi target1:0:0: enclosure logical id(0x50050cc10204053c), slot(0)
May 3 23:48:31 storage kernel: ses 1:0:0:0: task abort: SUCCESS scmd(0x0000000092171d32)
May 3 23:48:31 storage kernel: ses 1:0:0:0: attempting task abort!scmd(0x000000002a27f7ac), outstanding for 30272 ms & timeout 30000 ms
May 3 23:48:31 storage kernel: ses 1:0:0:0: tag#1449 CDB: Receive Diagnostic 1c 01 02 01 6c 00
May 3 23:48:31 storage kernel: scsi target1:0:0: handle(0x000a), sas_address(0x500a0980019f07fe), phy(36)
May 3 23:48:31 storage kernel: scsi target1:0:0: enclosure logical id(0x50050cc10204053c), slot(0)
May 3 23:48:31 storage kernel: ses 1:0:0:0: task abort: SUCCESS scmd(0x000000002a27f7ac)
May 3 23:48:31 storage kernel: ses 1:0:0:0: attempting task abort!scmd(0x000000003361da0a), outstanding for 30284 ms & timeout 30000 ms
May 3 23:48:31 storage kernel: ses 1:0:0:0: tag#1448 CDB: Receive Diagnostic 1c 01 02 01 6c 00
May 3 23:48:31 storage kernel: scsi target1:0:0: handle(0x000a), sas_address(0x500a0980019f07fe), phy(36)
May 3 23:48:31 storage kernel: scsi target1:0:0: enclosure logical id(0x50050cc10204053c), slot(0)
May 3 23:48:31 storage kernel: ses 1:0:0:0: task abort: SUCCESS scmd(0x000000003361da0a)
May 3 23:50:16 storage kernel: perf: interrupt took too long (2505 > 2500), lowering kernel.perf_event_max_sample_rate to 79750
May 4 00:07:50 storage kernel: perf: interrupt took too long (3146 > 3131), lowering kernel.perf_event_max_sample_rate to 63500
May 4 00:30:25 storage kernel: perf: interrupt took too long (3942 > 3932), lowering kernel.perf_event_max_sample_rate to 50500
May 4 00:43:09 storage kernel: perf: interrupt took too long (4931 > 4927), lowering kernel.perf_event_max_sample_rate to 40500
May 4 01:07:01 storage kernel: mpt2sas_cm0: log_info(0x31080000): originator(PL), code(0x08), sub_code(0x0000)
May 4 01:07:01 storage kernel: mpt2sas_cm0: log_info(0x31080000): originator(PL), code(0x08), sub_code(0x0000)
May 4 01:07:01 storage kernel: sd 0:0:2:0: [sdc] tag#2885 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=3s
May 4 01:07:01 storage kernel: sd 0:0:2:0: [sdc] tag#2885 Sense Key : Medium Error [current]
May 4 01:07:01 storage kernel: sd 0:0:2:0: [sdc] tag#2885 Add. Sense: Unrecovered read error
May 4 01:07:01 storage kernel: sd 0:0:2:0: [sdc] tag#2885 CDB: Read(16) 88 00 00 00 00 00 01 10 7c 00 00 00 07 d0 00 00
May 4 01:07:01 storage kernel: critical medium error, dev sdc, sector 17858768 op 0x0:(READ) flags 0x0 phys_seg 52 prio class 2
May 4 01:07:01 storage kernel: zio pool=TANK03 vdev=/dev/disk/by-partuuid/05bf3c7b-65ca-452c-b8b6-0e4c7676d750 error=61 type=1 offset=9140961280 size=1024000 flags=1074267312
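The 30000 ms in those task-abort lines is the per-device SCSI command timeout, which can be read straight out of sysfs for every SCSI device (including the SES enclosure device at 1:0:0:0); this is just a way to confirm where the 30 seconds comes from:
# Print the command timeout (in seconds) for every SCSI device on the system
for t in /sys/class/scsi_device/*/device/timeout; do
    printf '%s: %ss\n' "$t" "$(cat "$t")"
done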
I currently have 4 pools.
I can reboot fine with 3 pools, but only when those 3 pools are on the R510XD.
If I reboot with one pool on the NetApp, it goes bananas.
So to be clear: if I export all pools from the Dell R510XD, disconnect those drives, and leave only the pool on the NetApp attached, it fails and the fans in the NetApp spin up to 100% for a minute or so.
If I reboot with the 3 pools attached on the Dell R510XD and only export the pool on the NetApp, it reboots and imports the pools just fine.
EDIT:
I guess I’ll risk it and do the update to 24.04.1.1 tomorrow without exporting any pools, so that I can send you a debug.
I think this is related to me changing a faulted disk:
May 4 01:07:01 storage kernel: sd 0:0:2:0: [sdc] tag#2885 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=3s
May 4 01:07:01 storage kernel: sd 0:0:2:0: [sdc] tag#2885 Sense Key : Medium Error [current]
May 4 01:07:01 storage kernel: sd 0:0:2:0: [sdc] tag#2885 Add. Sense: Unrecovered read error
May 4 01:07:01 storage kernel: sd 0:0:2:0: [sdc] tag#2885 CDB: Read(16) 88 00 00 00 00 00 01 10 7c 00 00 00 07 d0 00 00
May 4 01:07:01 storage kernel: critical medium error, dev sdc, sector 17858768 op 0x0:(READ) flags 0x0 phys_seg 52 prio class 2
May 4 01:07:01 storage kernel: zio pool=TANK03 vdev=/dev/disk/by-partuuid/05bf3c7b-65ca-452c-b8b6-0e4c7676d750 error=61 type=1 offset=9140961280 size=1024000 flags=1074267312
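Since that reads like a plain unrecovered read error on the disk itself, it’s probably worth pulling the drive’s own SMART view of it too (the device name may move after a reboot, so match it by serial first):
# Full SMART dump: check Reallocated_Sector_Ct, Current_Pending_Sector,
# and the device error log for the read error shown above
smartctl -x /dev/sdc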
Yeah, you’re correct, it looks like sdc was not in the shelf; its SCSI address is 0:0:2:0, not 1:0:2:0.
Also, am I understanding this correctly?
If you export the pools before rebooting, you can manually import them and it’s fine in normal operation afterwards? Is that the same for you, @QuirkyKirkHax?
I have 3 EMC shelves, two 3.5 and one 2.5. In those cases, I don’t have any issues, but I’ve also never put more than one pool on a disk shelf.
Absolutely correct:
if I export the pool TANK01 that is located on my NetApp before I reboot, I can import it manually after the reboot and it imports and works without any errors.
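For anyone following along, the command-line version of that workaround is just the following (normally you’d export and import from the Storage page in the UI so the middleware stays in sync; shown here only to make the sequence concrete):
# Before rebooting: cleanly export the pool that lives on the NetApp shelf
zpool export TANK01
# ...reboot...
# After the reboot: import it again by name
zpool import TANK01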
I have 3 pools on my Dell R510XD:
pool: TANK03
state: ONLINE
scan: scrub repaired 0B in 04:52:38 with 0 errors on Sat May 25 05:22:40 2024
config:
NAME STATE READ WRITE CKSUM
TANK03 ONLINE 0 0 0
raidz2-0 ONLINE 0 0 0
e51fcaa5-3d00-4527-96b5-2b87b2836cc7 ONLINE 0 0 0
335ab2e2-18b4-4602-9270-f02676455dd9 ONLINE 0 0 0
ad11d232-a0a6-4d92-9f19-b01348111734 ONLINE 0 0 0
530b614a-08d1-4dcb-8b0f-464010991c1b ONLINE 0 0 0
de88e9d5-db80-4c67-820a-15146ae0f2f3 ONLINE 0 0 0
8618bdca-7410-4b4c-b75e-9126fca1ab58 ONLINE 0 0 0
errors: No known data errors
pool: TANK04
state: ONLINE
scan: scrub repaired 0B in 00:32:35 with 0 errors on Sun May 26 13:32:37 2024
config:
NAME STATE READ WRITE CKSUM
TANK04 ONLINE 0 0 0
raidz1-0 ONLINE 0 0 0
e640c07c-a298-450c-ae29-599721bdb7a4 ONLINE 0 0 0
3532b4c2-f3d6-43aa-828e-46b78d1b3d21 ONLINE 0 0 0
478175fb-5c5c-4d27-9391-ec44c5660947 ONLINE 0 0 0
errors: No known data errors
pool: TANK05
state: ONLINE
scan: scrub repaired 0B in 00:00:09 with 0 errors on Sat May 25 13:00:11 2024
config:
NAME STATE READ WRITE CKSUM
TANK05 ONLINE 0 0 0
2d830c9c-e91d-461a-96ae-23253b1ef07d ONLINE 0 0 0
errors: No known data errors
and then one pool on the NetApp 4246
pool: TANK01
state: ONLINE
scan: scrub repaired 0B in 02:52:38 with 0 errors on Sun May 26 02:52:40 2024
config:
NAME STATE READ WRITE CKSUM
TANK01 ONLINE 0 0 0
raidz3-0 ONLINE 0 0 0
dfa1274c-ec58-11ea-812d-782bcb443fe6 ONLINE 0 0 0
79a2ac59-4d32-11ec-85fc-782bcb443fe6 ONLINE 0 0 0
e089d5b5-ec58-11ea-812d-782bcb443fe6 ONLINE 0 0 0
e1c10342-ec58-11ea-812d-782bcb443fe6 ONLINE 0 0 0
e2e86e7f-ec58-11ea-812d-782bcb443fe6 ONLINE 0 0 0
e26295b3-ec58-11ea-812d-782bcb443fe6 ONLINE 0 0 0
e2b0be6a-ec58-11ea-812d-782bcb443fe6 ONLINE 0 0 0
e286ce4b-ec58-11ea-812d-782bcb443fe6 ONLINE 0 0 0
cb3d13b5-904f-11eb-935f-782bcb443fe6 ONLINE 0 0 0
e30a77a6-ec58-11ea-812d-782bcb443fe6 ONLINE 0 0 0
e31647c7-ec58-11ea-812d-782bcb443fe6 ONLINE 0 0 0
e3952aef-ec58-11ea-812d-782bcb443fe6 ONLINE 0 0 0
errors: No known data errors
I am fine until a reboot. Though, I first encountered this issue late one night when SMART died on all the drives at once. This was a few days after I had built my 3rd pool (not counting boot). When I rebooted, this started to happen. After a few days of experimenting, I found the trigger was having 3 pools (not including boot) and a reboot. Though I have my suspicions that if I left a SCALE setup like that again and waited a few days, it would repeat.
@NickF1227
Could you throw a model number for the EMC 3.5" shelf at me? I would like to maybe buy one to play around with.
Thank you
Thank you!
Maybe an important piece of information: I do NOT use interposers on my NetApp. I removed them when I migrated from CORE to SCALE because I had the same problems with the interposers on SCALE; on CORE it worked with the interposers.