Commit 9e2e575

Keith Busch authored and brauner committed
nvme-pci: rerun irq setup on IO queue init errors
If the driver is unable to create a subset of IO queues for any reason,
the read/write and polled queue sets will not match the actual allocated
hardware contexts. This leaves gaps in the CPU affinity mappings and
causes the following kernel panic after blk_mq_map_queue_type() returns
a NULL hctx.

  BUG: unable to handle kernel NULL pointer dereference at 0000000000000198
  #PF error: [normal kernel read fault]
  PGD 0 P4D 0
  Oops: 0000 [#1] SMP
  CPU: 64 PID: 1171 Comm: kworker/u259:1 Not tainted 4.20.0+ #241
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-2.fc27 04/01/2014
  Workqueue: nvme-wq nvme_scan_work [nvme_core]
  RIP: 0010:blk_mq_init_allocated_queue+0x2d9/0x440
  RSP: 0018:ffffb1bf0abc3cd0 EFLAGS: 00010286
  RAX: 000000000000001f RBX: ffff8ea744cf0718 RCX: 0000000000000000
  RDX: 0000000000000002 RSI: 000000000000007c RDI: ffffffff9109a820
  RBP: ffff8ea7565f7008 R08: 000000000000001f R09: 000000000000003f
  R10: ffffb1bf0abc3c00 R11: 0000000000000000 R12: 000000000001d008
  R13: ffff8ea7565f7008 R14: 000000000000003f R15: 0000000000000001
  FS:  0000000000000000(0000) GS:ffff8ea757200000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 0000000000000198 CR3: 0000000013058000 CR4: 00000000000006e0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  Call Trace:
   blk_mq_init_queue+0x35/0x60
   nvme_validate_ns+0xc6/0x7c0 [nvme_core]
   ? nvme_identify_ctrl.isra.56+0x7e/0xc0 [nvme_core]
   nvme_scan_work+0xc8/0x340 [nvme_core]
   ? __wake_up_common+0x6d/0x120
   ? try_to_wake_up+0x55/0x410
   process_one_work+0x1e9/0x3d0
   worker_thread+0x2d/0x3d0
   ? process_one_work+0x3d0/0x3d0
   kthread+0x111/0x130
   ? kthread_park+0x90/0x90
   ret_from_fork+0x1f/0x30
  Modules linked in: nvme nvme_core serio_raw
  CR2: 0000000000000198

Fix by re-running the interrupt vector setup from scratch, using a
reduced count that may be successful, until the created queues match
the irq affinity plus polled queue sets.

Signed-off-by: Keith Busch <[email protected]>
Reviewed-by: Sagi Grimberg <[email protected]>
Reviewed-by: Ming Lei <[email protected]>
Signed-off-by: Christoph Hellwig <[email protected]>
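The fix in short: after creating the IO queues, compare how many actually came online against the vector/poll split the irq setup assumed; if they disagree, tear the IO queues back down and redo the vector setup with the smaller count, looping until the two agree. The following is a minimal standalone sketch of that retry loop, not the driver code itself; nvme_setup_irqs_sim(), create_io_queues_sim() and HW_QUEUE_LIMIT are hypothetical stand-ins for the driver's nvme_setup_irqs(), nvme_create_io_queues() and whatever limits queue creation on real hardware.

/* Userspace simulation of the retry strategy described above. */
#include <stdio.h>

/* pretend the device can only back this many IO queues */
#define HW_QUEUE_LIMIT 6

static int nvme_setup_irqs_sim(unsigned int nr_io_queues)
{
        /* assume vector allocation succeeds for whatever count we request */
        return (int)nr_io_queues;
}

static unsigned int create_io_queues_sim(unsigned int nr_io_queues)
{
        /* queue creation may still come up short of the vector count */
        return nr_io_queues < HW_QUEUE_LIMIT ? nr_io_queues : HW_QUEUE_LIMIT;
}

int main(void)
{
        unsigned int nr_io_queues = 16;    /* initial request */
        unsigned int online;

        for (;;) {
                int vectors = nvme_setup_irqs_sim(nr_io_queues);

                online = create_io_queues_sim((unsigned int)vectors);
                if (online >= (unsigned int)vectors)
                        break;    /* affinity maps and queues now agree */

                /* mirror the patch: retry from scratch with the count that stuck */
                nr_io_queues = online;
                printf("retrying irq setup with %u IO queues\n", nr_io_queues);
        }
        printf("final IO queue count: %u\n", online);
        return 0;
}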
1 parent 8ab1328 commit 9e2e575

File tree

1 file changed, +36 -14 lines


drivers/nvme/host/pci.c

Lines changed: 36 additions & 14 deletions
@@ -95,6 +95,7 @@ struct nvme_dev;
 struct nvme_queue;
 
 static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
+static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);
 
 /*
  * Represents an NVM Express device. Each nvme_dev is a PCI function.
@@ -1420,6 +1421,14 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
 	return 0;
 }
 
+static void nvme_suspend_io_queues(struct nvme_dev *dev)
+{
+	int i;
+
+	for (i = dev->ctrl.queue_count - 1; i > 0; i--)
+		nvme_suspend_queue(&dev->queues[i]);
+}
+
 static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
 {
 	struct nvme_queue *nvmeq = &dev->queues[0];
@@ -2134,6 +2143,12 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
 	return result;
 }
 
+static void nvme_disable_io_queues(struct nvme_dev *dev)
+{
+	if (__nvme_disable_io_queues(dev, nvme_admin_delete_sq))
+		__nvme_disable_io_queues(dev, nvme_admin_delete_cq);
+}
+
 static int nvme_setup_io_queues(struct nvme_dev *dev)
 {
 	struct nvme_queue *adminq = &dev->queues[0];
@@ -2170,6 +2185,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 	} while (1);
 	adminq->q_db = dev->dbs;
 
+ retry:
 	/* Deregister the admin queue's interrupt */
 	pci_free_irq(pdev, 0, adminq);
 
@@ -2187,25 +2203,34 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 	result = max(result - 1, 1);
 	dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL];
 
-	dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n",
-					dev->io_queues[HCTX_TYPE_DEFAULT],
-					dev->io_queues[HCTX_TYPE_READ],
-					dev->io_queues[HCTX_TYPE_POLL]);
-
 	/*
 	 * Should investigate if there's a performance win from allocating
 	 * more queues than interrupt vectors; it might allow the submission
 	 * path to scale better, even if the receive path is limited by the
 	 * number of interrupts.
 	 */
-
 	result = queue_request_irq(adminq);
 	if (result) {
 		adminq->cq_vector = -1;
 		return result;
 	}
 	set_bit(NVMEQ_ENABLED, &adminq->flags);
-	return nvme_create_io_queues(dev);
+
+	result = nvme_create_io_queues(dev);
+	if (result || dev->online_queues < 2)
+		return result;
+
+	if (dev->online_queues - 1 < dev->max_qid) {
+		nr_io_queues = dev->online_queues - 1;
+		nvme_disable_io_queues(dev);
+		nvme_suspend_io_queues(dev);
+		goto retry;
+	}
+	dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n",
+					dev->io_queues[HCTX_TYPE_DEFAULT],
+					dev->io_queues[HCTX_TYPE_READ],
+					dev->io_queues[HCTX_TYPE_POLL]);
+	return 0;
 }
 
 static void nvme_del_queue_end(struct request *req, blk_status_t error)
@@ -2250,7 +2275,7 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
 	return 0;
 }
 
-static bool nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
+static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
 {
 	int nr_queues = dev->online_queues - 1, sent = 0;
 	unsigned long timeout;
@@ -2411,7 +2436,6 @@ static void nvme_pci_disable(struct nvme_dev *dev)
 
 static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 {
-	int i;
 	bool dead = true;
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
 
@@ -2438,13 +2462,11 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 	nvme_stop_queues(&dev->ctrl);
 
 	if (!dead && dev->ctrl.queue_count > 0) {
-		if (nvme_disable_io_queues(dev, nvme_admin_delete_sq))
-			nvme_disable_io_queues(dev, nvme_admin_delete_cq);
+		nvme_disable_io_queues(dev);
 		nvme_disable_admin_queue(dev, shutdown);
 	}
-	for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
-		nvme_suspend_queue(&dev->queues[i]);
-
+	nvme_suspend_io_queues(dev);
+	nvme_suspend_queue(&dev->queues[0]);
 	nvme_pci_disable(dev);
 
 	blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
