diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index a771bb94b5fb52..1b406863a9d615 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -25,6 +25,16 @@ MODULE_PARM_DESC(enable_uring,
 /* redfs only to allow patch backports */
 #define IO_URING_F_TASK_DEAD (1 << 13)
 
+/* Number of queued fuse requests before a queue is considered full.
+ * Effectively no entries: synchronization uses lockless bitmaps, so the
+ * count is only approximate and queues always take a few more requests
+ * than this. Lightly loaded queues are preferred, as they reduce
+ * kernel/userspace switches.
+ */
+#define FUSE_URING_QUEUE_THRESHOLD 0
+
+static unsigned int fuse_uring_get_random_qid(struct fuse_ring *ring,
+					       const struct cpumask *mask);
+
 #ifndef io_uring_cmd_to_pdu
 static inline void io_uring_cmd_private_sz_check(size_t cmd_sz)
 {
@@ -103,13 +113,13 @@ static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req,
 	lockdep_assert_not_held(&queue->lock);
 	spin_lock(&queue->lock);
 	ent->fuse_req = NULL;
+	queue->nr_reqs--;
 	if (test_bit(FR_BACKGROUND, &req->flags)) {
 		queue->active_background--;
 		spin_lock(&fc->bg_lock);
 		fuse_uring_flush_bg(queue);
 		spin_unlock(&fc->bg_lock);
 	}
-
 	spin_unlock(&queue->lock);
 
 	if (error)
@@ -129,6 +139,7 @@ static void fuse_uring_abort_end_queue_requests(struct fuse_ring_queue *queue)
 	list_for_each_entry(req, &queue->fuse_req_queue, list)
 		clear_bit(FR_PENDING, &req->flags);
 	list_splice_init(&queue->fuse_req_queue, &req_list);
+	queue->nr_reqs = 0;
 	spin_unlock(&queue->lock);
 
 	/* must not hold queue lock to avoid order issues with fi->lock */
@@ -141,7 +152,7 @@ void fuse_uring_abort_end_requests(struct fuse_ring *ring)
 	struct fuse_ring_queue *queue;
 	struct fuse_conn *fc = ring->fc;
 
-	for (qid = 0; qid < ring->nr_queues; qid++) {
+	for (qid = 0; qid < ring->max_nr_queues; qid++) {
 		queue = READ_ONCE(ring->queues[qid]);
 		if (!queue)
 			continue;
@@ -173,6 +184,23 @@ static void io_pages_free(struct page ***pages, int npages)
 	*pages = NULL;
 }
 
+static void fuse_ring_destruct_q_masks(struct fuse_ring *ring)
+{
+	free_cpumask_var(ring->avail_q_mask);
+	if (ring->per_numa_avail_q_mask) {
+		for (int node = 0; node < ring->nr_numa_nodes; node++)
+			free_cpumask_var(ring->per_numa_avail_q_mask[node]);
+		kfree(ring->per_numa_avail_q_mask);
+	}
+
+	free_cpumask_var(ring->registered_q_mask);
+	if (ring->numa_registered_q_mask) {
+		for (int node = 0; node < ring->nr_numa_nodes; node++)
+			free_cpumask_var(ring->numa_registered_q_mask[node]);
+		kfree(ring->numa_registered_q_mask);
+	}
+}
+
 void fuse_uring_destruct(struct fuse_conn *fc)
 {
 	struct fuse_ring *ring = fc->ring;
@@ -181,7 +209,7 @@ void fuse_uring_destruct(struct fuse_conn *fc)
 	if (!ring)
 		return;
 
-	for (qid = 0; qid < ring->nr_queues; qid++) {
+	for (qid = 0; qid < ring->max_nr_queues; qid++) {
 		struct fuse_ring_queue *queue = ring->queues[qid];
 		struct fuse_ring_ent *ent, *next;
 
@@ -210,11 +238,44 @@ void fuse_uring_destruct(struct fuse_conn *fc)
 		ring->queues[qid] = NULL;
 	}
 
+	fuse_ring_destruct_q_masks(ring);
 	kfree(ring->queues);
 	kfree(ring);
 	fc->ring = NULL;
 }
 
+static int fuse_ring_create_q_masks(struct fuse_ring *ring, int nr_queues)
+{
+	if (!zalloc_cpumask_var(&ring->avail_q_mask, GFP_KERNEL_ACCOUNT))
+		return -ENOMEM;
+
+	if (!zalloc_cpumask_var(&ring->registered_q_mask, GFP_KERNEL_ACCOUNT))
+		return -ENOMEM;
+
+	ring->per_numa_avail_q_mask = kcalloc(ring->nr_numa_nodes,
+					      sizeof(*ring->per_numa_avail_q_mask),
+					      GFP_KERNEL_ACCOUNT);
+	if (!ring->per_numa_avail_q_mask)
+		return -ENOMEM;
+	for (int node = 0; node < ring->nr_numa_nodes; node++) {
+		if (!zalloc_cpumask_var(&ring->per_numa_avail_q_mask[node],
+					GFP_KERNEL_ACCOUNT))
+			return -ENOMEM;
+	}
+
+	ring->numa_registered_q_mask = kcalloc(ring->nr_numa_nodes,
+					       sizeof(*ring->numa_registered_q_mask),
+					       GFP_KERNEL_ACCOUNT);
+	if (!ring->numa_registered_q_mask)
+		return -ENOMEM;
+	for (int node = 0; node < ring->nr_numa_nodes; node++) {
+		if (!zalloc_cpumask_var(&ring->numa_registered_q_mask[node],
+					GFP_KERNEL_ACCOUNT))
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
 /*
  * Basic ring setup for this connection based on the provided configuration
  */
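The availability bookkeeping that this threshold drives is open-coded at four call sites further down (fuse_uring_ent_avail(), fuse_uring_add_req_to_ring_ent(), fuse_uring_queue_fuse_req() and fuse_uring_remove_pending_req()). The following is an editorial sketch only, not part of the patch: a helper that could centralize those set/clear pairs. It uses only the fields this patch adds, and it merges the two conditions (request count and a non-empty ent_avail_queue) that the call sites currently check separately; whether that combined condition is the intended semantics is for the author to confirm.

/*
 * Editorial sketch, not part of the patch: centralize the repeated
 * availability-mask updates.  Must be called with queue->lock held.  The
 * masks are intentionally approximate (readers are lockless), so this is
 * about keeping the writers consistent, not about exactness.
 */
static void fuse_uring_update_avail_masks(struct fuse_ring_queue *queue)
{
	struct fuse_ring *ring = queue->ring;
	bool avail = queue->nr_reqs <= FUSE_URING_QUEUE_THRESHOLD &&
		     !list_empty(&queue->ent_avail_queue);

	lockdep_assert_held(&queue->lock);

	if (avail) {
		cpumask_set_cpu(queue->qid, ring->avail_q_mask);
		cpumask_set_cpu(queue->qid,
				ring->per_numa_avail_q_mask[queue->numa_node]);
	} else {
		cpumask_clear_cpu(queue->qid, ring->avail_q_mask);
		cpumask_clear_cpu(queue->qid,
				  ring->per_numa_avail_q_mask[queue->numa_node]);
	}
}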
@@ -224,11 +285,14 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc)
 	size_t nr_queues = num_possible_cpus();
 	struct fuse_ring *res = NULL;
 	size_t max_payload_size;
+	int err;
 
 	ring = kzalloc(sizeof(*fc->ring), GFP_KERNEL_ACCOUNT);
 	if (!ring)
 		return NULL;
 
+	ring->nr_numa_nodes = num_online_nodes();
+
 	ring->queues = kcalloc(nr_queues, sizeof(struct fuse_ring_queue *),
 			       GFP_KERNEL_ACCOUNT);
 	if (!ring->queues)
@@ -237,6 +301,10 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc)
 	max_payload_size = max(FUSE_MIN_READ_BUFFER, fc->max_write);
 	max_payload_size = max(max_payload_size, fc->max_pages * PAGE_SIZE);
 
+	err = fuse_ring_create_q_masks(ring, nr_queues);
+	if (err)
+		goto out_err;
+
 	spin_lock(&fc->lock);
 	if (fc->ring) {
 		/* race, another thread created the ring in the meantime */
@@ -247,7 +315,7 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc)
 
 	init_waitqueue_head(&ring->stop_waitq);
 
-	ring->nr_queues = nr_queues;
+	ring->max_nr_queues = nr_queues;
 	ring->fc = fc;
 	ring->max_payload_sz = max_payload_size;
 	atomic_set(&ring->queue_refs, 0);
@@ -257,6 +325,7 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc)
 	return ring;
 
 out_err:
+	fuse_ring_destruct_q_masks(ring);
 	kfree(ring->queues);
 	kfree(ring);
 	return res;
@@ -280,6 +349,7 @@ static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring,
 
 	queue->qid = qid;
 	queue->ring = ring;
+	queue->numa_node = cpu_to_node(qid);
 	spin_lock_init(&queue->lock);
 
 	INIT_LIST_HEAD(&queue->ent_avail_queue);
@@ -400,7 +470,7 @@ static void fuse_uring_log_ent_state(struct fuse_ring *ring)
 	int qid;
 	struct fuse_ring_ent *ent;
 
-	for (qid = 0; qid < ring->nr_queues; qid++) {
+	for (qid = 0; qid < ring->max_nr_queues; qid++) {
 		struct fuse_ring_queue *queue = ring->queues[qid];
 
 		if (!queue)
@@ -419,6 +489,7 @@ static void fuse_uring_log_ent_state(struct fuse_ring *ring)
 			pr_info(" ent-commit-queue ring=%p qid=%d ent=%p state=%d\n",
 				ring, qid, ent, ent->state);
 		}
+		spin_unlock(&queue->lock);
 	}
 	ring->stop_debug_log = 1;
 }
@@ -431,7 +502,7 @@ static void fuse_uring_async_stop_queues(struct work_struct *work)
 		container_of(work, struct fuse_ring, async_teardown_work.work);
 
 	/* XXX code dup */
-	for (qid = 0; qid < ring->nr_queues; qid++) {
+	for (qid = 0; qid < ring->max_nr_queues; qid++) {
 		struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]);
 
 		if (!queue)
 			continue;
@@ -464,15 +535,25 @@
  */
 void fuse_uring_stop_queues(struct fuse_ring *ring)
 {
-	int qid;
+	int qid, node;
 
-	for (qid = 0; qid < ring->nr_queues; qid++) {
+	for (qid = 0; qid < ring->max_nr_queues; qid++) {
 		struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]);
 
 		if (!queue)
 			continue;
 
 		fuse_uring_teardown_entries(queue);
+
+		/* Clear the queue from all masks */
+		cpumask_clear_cpu(qid, ring->registered_q_mask);
+		cpumask_clear_cpu(qid, ring->avail_q_mask);
+		for (node = 0; node < ring->nr_numa_nodes; node++) {
+			cpumask_clear_cpu(qid,
+					  ring->numa_registered_q_mask[node]);
+			cpumask_clear_cpu(qid,
+					  ring->per_numa_avail_q_mask[node]);
+		}
 	}
 
 	if (atomic_read(&ring->queue_refs) > 0) {
@@ -810,9 +891,17 @@ static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ent,
 static void fuse_uring_ent_avail(struct fuse_ring_ent *ent,
 				 struct fuse_ring_queue *queue)
 {
+	struct fuse_ring *ring = queue->ring;
+	int node = queue->numa_node;
+
 	WARN_ON_ONCE(!ent->cmd);
 	list_move(&ent->list, &queue->ent_avail_queue);
 	ent->state = FRRS_AVAILABLE;
+
+	if (queue->nr_reqs <= FUSE_URING_QUEUE_THRESHOLD) {
+		cpumask_set_cpu(queue->qid, ring->avail_q_mask);
+		cpumask_set_cpu(queue->qid, ring->per_numa_avail_q_mask[node]);
+	}
 }
 
 /* Used to find the request on SQE commit */
@@ -835,6 +924,8 @@ static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent,
 					   struct fuse_req *req)
 {
 	struct fuse_ring_queue *queue = ent->queue;
+	struct fuse_ring *ring = queue->ring;
+	int node = queue->numa_node;
 
 	lockdep_assert_held(&queue->lock);
 
@@ -849,6 +940,16 @@ static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent,
 	ent->state = FRRS_FUSE_REQ;
 	list_move(&ent->list, &queue->ent_w_req_queue);
 	fuse_uring_add_to_pq(ent, req);
+
+	/*
+	 * If there are no more available entries, mark the queue as
+	 * unavailable in both global and per-NUMA node masks
+	 */
+	if (list_empty(&queue->ent_avail_queue)) {
+		cpumask_clear_cpu(queue->qid, ring->avail_q_mask);
+		cpumask_clear_cpu(queue->qid,
+				  ring->per_numa_avail_q_mask[node]);
+	}
 }
 
 /* Fetch the next fuse request if available */
@@ -955,7 +1056,7 @@ static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags,
 	if (!ring)
 		return err;
 
-	if (qid >= ring->nr_queues)
+	if (qid >= ring->max_nr_queues)
 		return -EINVAL;
 
 	queue = ring->queues[qid];
@@ -1012,31 +1113,6 @@ static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags,
 	return 0;
 }
 
-static bool is_ring_ready(struct fuse_ring *ring, int current_qid)
-{
-	int qid;
-	struct fuse_ring_queue *queue;
-	bool ready = true;
-
-	for (qid = 0; qid < ring->nr_queues && ready; qid++) {
-		if (current_qid == qid)
-			continue;
-
-		queue = ring->queues[qid];
-		if (!queue) {
-			ready = false;
-			break;
-		}
-
-		spin_lock(&queue->lock);
-		if (list_empty(&queue->ent_avail_queue))
-			ready = false;
-		spin_unlock(&queue->lock);
-	}
-
-	return ready;
-}
-
 /*
  * fuse_uring_req_fetch command handling
 */
@@ -1057,13 +1133,9 @@ static void fuse_uring_do_register(struct fuse_ring_ent *ent,
 	spin_unlock(&queue->lock);
 
 	if (!ring->ready) {
-		bool ready = is_ring_ready(ring, queue->qid);
-
-		if (ready) {
-			WRITE_ONCE(fiq->ops, &fuse_io_uring_ops);
-			WRITE_ONCE(ring->ready, true);
-			wake_up_all(&fc->blocked_waitq);
-		}
+		WRITE_ONCE(fiq->ops, &fuse_io_uring_ops);
+		WRITE_ONCE(ring->ready, true);
+		wake_up_all(&fc->blocked_waitq);
 	}
 }
@@ -1249,6 +1321,7 @@ static int fuse_uring_register(struct io_uring_cmd *cmd,
 	struct fuse_ring_ent *ent;
 	int err;
 	unsigned int qid = READ_ONCE(cmd_req->qid);
+	int node = cpu_to_node(qid);
 
 	err = -ENOMEM;
 	if (!ring) {
@@ -1257,7 +1330,7 @@ static int fuse_uring_register(struct io_uring_cmd *cmd,
 		return err;
 	}
 
-	if (qid >= ring->nr_queues) {
+	if (qid >= ring->max_nr_queues) {
 		pr_info_ratelimited("fuse: Invalid ring qid %u\n", qid);
 		return -EINVAL;
 	}
@@ -1280,6 +1353,9 @@ static int fuse_uring_register(struct io_uring_cmd *cmd,
 
 	fuse_uring_do_register(ent, cmd, issue_flags);
 
+	cpumask_set_cpu(queue->qid, ring->registered_q_mask);
+	cpumask_set_cpu(queue->qid, ring->numa_registered_q_mask[node]);
+
 	return 0;
 }
@@ -1379,22 +1455,109 @@ static void fuse_uring_send_in_task(struct io_uring_cmd *cmd,
 	fuse_uring_send(ent, cmd, err, issue_flags);
 }
 
-static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring)
+static struct fuse_ring_queue *
+fuse_uring_get_first_queue(struct fuse_ring *ring, const struct cpumask *mask)
+{
+	int qid;
+
+	/* Find the first available CPU in this mask */
+	qid = cpumask_first(mask);
+
+	/* Check if we found a valid CPU */
+	if (qid >= ring->max_nr_queues)
+		return NULL; /* No available queues */
+
+	/* Masks are indexed by global qid, so the bit number is the qid */
+	return ring->queues[qid];
+}
+
+/*
+ * Return a random queue from the given mask of registered queues.
+ *
+ * Uses a cheap, pseudo-random but well-distributed selection to pick
+ * a queue from the provided CPU mask.
+ */
+static unsigned int fuse_uring_get_random_qid(struct fuse_ring *ring,
+					       const struct cpumask *mask)
+{
+	unsigned int nr_bits = cpumask_weight(mask);
+	unsigned int nth, cpu;
+
+	if (nr_bits == 0)
+		return UINT_MAX;
+
+	/* Fast path for single CPU */
+	if (nr_bits == 1)
+		return cpumask_first(mask);
+
+	/*
+	 * Mix randomness with the current jiffies and task PID to get a
+	 * well-distributed selection that varies across calls.
+	 */
+	nth = (get_random_u32() ^ (jiffies & 0xFFFF) ^
+	       (current->pid & 0xFFFF)) % nr_bits;
+
+	/* Find the CPU at that position */
+	for_each_cpu(cpu, mask) {
+		if (nth-- == 0)
+			return cpu;
+	}
+
+	return UINT_MAX;
+}
+
+/*
+ * Select the best queue for the current task: prefer the local CPU's queue,
+ * then idle queues on the local NUMA node, then any idle queue, and finally
+ * fall back to any registered queue.
+ */
+static struct fuse_ring_queue *fuse_uring_get_queue(struct fuse_ring *ring)
 {
 	unsigned int qid;
-	struct fuse_ring_queue *queue;
+	struct fuse_ring_queue *queue, *local_queue = NULL;
+	int local_node;
+	struct cpumask *mask;
+	struct fuse_conn *fc = ring->fc;
 
 	qid = task_cpu(current);
+	local_node = cpu_to_node(qid);
+	if (WARN_ON_ONCE(local_node >= ring->nr_numa_nodes || local_node < 0))
+		local_node = 0;
+
+	/* First, check if the current CPU's queue is available */
+	if (qid < ring->max_nr_queues) {
+		local_queue = queue = ring->queues[qid];
+		if (queue && queue->nr_reqs <= FUSE_URING_QUEUE_THRESHOLD)
+			return queue;
+	}
 
-	if (WARN_ONCE(qid >= ring->nr_queues,
-		      "Core number (%u) exceeds nr queues (%zu)\n", qid,
-		      ring->nr_queues))
-		qid = 0;
+	/* Second, check if there are any available queues on the local node */
+	mask = ring->per_numa_avail_q_mask[local_node];
+	queue = fuse_uring_get_first_queue(ring, mask);
+	if (queue)
+		return queue;
+
+	/* Third, check if there are any available queues on any node */
+	queue = fuse_uring_get_first_queue(ring, ring->avail_q_mask);
+	if (queue)
+		return queue;
+
+	/* No free queue, use the local queue if it exists */
+	if (local_queue)
+		return local_queue;
+
+	/* Try a random queue from the local NUMA node, if there is one */
+	mask = ring->numa_registered_q_mask[local_node];
+	qid = fuse_uring_get_random_qid(ring, mask);
+	if (qid < ring->max_nr_queues)
+		return ring->queues[qid];
 
-	queue = ring->queues[qid];
-	WARN_ONCE(!queue, "Missing queue for qid %d\n", qid);
+	/* Finally, use a random queue among all registered queues */
+	qid = fuse_uring_get_random_qid(ring, ring->registered_q_mask);
+	if (qid < ring->max_nr_queues)
+		return ring->queues[qid];
 
-	return queue;
+	WARN_ON_ONCE(fc->connected);
+	return NULL;
 }
 
 static void fuse_uring_dispatch_ent(struct fuse_ring_ent *ent, bool bg)
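fuse_uring_get_random_qid() above walks the mask to find the Nth set bit. As a hedged alternative, assuming the target kernel provides cpumask_nth() (v6.1+) and get_random_u32_below() (v6.2+), which this backport base may not, the same pick can be written without the manual walk and without the modulo bias of the "% nr_bits" mix. The _alt name is purely illustrative and not part of the patch:

/* Illustrative alternative, not part of the patch (see note above). */
static unsigned int fuse_uring_get_random_qid_alt(const struct cpumask *mask)
{
	unsigned int nr_bits = cpumask_weight(mask);

	if (nr_bits == 0)
		return UINT_MAX;

	/* cpumask_nth() returns the Nth set CPU in the mask, counted from 0 */
	return cpumask_nth(get_random_u32_below(nr_bits), mask);
}

Either way the result is advisory only: the caller still bounds-checks the returned qid against ring->max_nr_queues before dereferencing ring->queues[qid].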
@@ -1434,7 +1597,7 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req)
 	int err;
 
 	err = -EINVAL;
-	queue = fuse_uring_task_to_queue(ring);
+	queue = fuse_uring_get_queue(ring);
 	if (!queue)
 		goto err;
 
@@ -1447,10 +1610,26 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req)
 	req->ring_queue = queue;
 	ent = list_first_entry_or_null(&queue->ent_avail_queue,
 				       struct fuse_ring_ent, list);
+	queue->nr_reqs++;
+
+	/*
+	 * Update queue availability based on the number of requests.
+	 * A queue is considered busy once it has more than
+	 * FUSE_URING_QUEUE_THRESHOLD requests.
+	 */
+	if (queue->nr_reqs == FUSE_URING_QUEUE_THRESHOLD + 1) {
+		/* Queue just became busy */
+		cpumask_clear_cpu(queue->qid, ring->avail_q_mask);
+		cpumask_clear_cpu(queue->qid,
+				  ring->per_numa_avail_q_mask[queue->numa_node]);
+	}
+
 	if (ent)
 		fuse_uring_add_req_to_ring_ent(ent, req);
 	else
 		list_add_tail(&req->list, &queue->fuse_req_queue);
+
 	spin_unlock(&queue->lock);
 
 	if (ent)
@@ -1473,7 +1652,7 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req)
 	struct fuse_ring_queue *queue;
 	struct fuse_ring_ent *ent = NULL;
 
-	queue = fuse_uring_task_to_queue(ring);
+	queue = fuse_uring_get_queue(ring);
 	if (!queue)
 		return false;
 
@@ -1486,6 +1665,7 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req)
 	set_bit(FR_URING, &req->flags);
 	req->ring_queue = queue;
 	list_add_tail(&req->list, &queue->fuse_req_bg_queue);
+	queue->nr_reqs++;
 
 	ent = list_first_entry_or_null(&queue->ent_avail_queue,
 				       struct fuse_ring_ent, list);
@@ -1518,8 +1698,29 @@
 bool fuse_uring_remove_pending_req(struct fuse_req *req)
 {
 	struct fuse_ring_queue *queue = req->ring_queue;
+	struct fuse_ring *ring = queue->ring;
+	int node = queue->numa_node;
+	bool removed = fuse_remove_pending_req(req, &queue->lock);
+
+	if (removed) {
+		/* Update counters after successful removal */
+		spin_lock(&queue->lock);
+		queue->nr_reqs--;
+
+		/*
+		 * Update queue availability based on the number of requests.
+		 * A queue is considered available if it has
+		 * FUSE_URING_QUEUE_THRESHOLD or fewer requests.
+		 */
+		if (queue->nr_reqs == FUSE_URING_QUEUE_THRESHOLD) {
+			/* Queue just became available */
+			cpumask_set_cpu(queue->qid, ring->avail_q_mask);
+			cpumask_set_cpu(queue->qid,
+					ring->per_numa_avail_q_mask[node]);
+		}
+		spin_unlock(&queue->lock);
+	}
 
-	return fuse_remove_pending_req(req, &queue->lock);
+	return removed;
 }
 
 static const struct fuse_iqueue_ops fuse_io_uring_ops = {
diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h
index b6e67bd24d6538..eabb501f6534fd 100644
--- a/fs/fuse/dev_uring_i.h
+++ b/fs/fuse/dev_uring_i.h
@@ -70,6 +70,9 @@ struct fuse_ring_queue {
 	/* queue id, corresponds to the cpu core */
 	unsigned int qid;
 
+	/* NUMA node this queue belongs to */
+	int numa_node;
+
 	/*
 	 * queue lock, taken when any value in the queue changes _and_ also
 	 * a ring entry state changes.
@@ -98,6 +101,9 @@ struct fuse_ring_queue {
 	/* background fuse requests */
 	struct list_head fuse_req_bg_queue;
 
+	/* number of requests queued or in userspace */
+	unsigned int nr_reqs;
+
 	struct fuse_pqueue fpq;
 
 	unsigned int active_background;
@@ -114,7 +120,10 @@ struct fuse_ring {
 	struct fuse_conn *fc;
 
 	/* number of ring queues */
-	size_t nr_queues;
+	size_t max_nr_queues;
+
+	/* number of NUMA nodes */
+	int nr_numa_nodes;
 
 	/* maximum payload/arg size */
 	size_t max_payload_sz;
@@ -126,6 +135,18 @@ struct fuse_ring {
 	 */
 	unsigned int stop_debug_log : 1;
 
+	/* Tracks which queues are available (empty) globally */
+	cpumask_var_t avail_q_mask;
+
+	/* Tracks which queues are available per NUMA node */
+	cpumask_var_t *per_numa_avail_q_mask;
+
+	/* Tracks which queues are registered */
+	cpumask_var_t registered_q_mask;
+
+	/* Tracks which queues are registered per NUMA node */
+	cpumask_var_t *numa_registered_q_mask;
+
 	wait_queue_head_t stop_waitq;
 
 	/* async tear down */
@@ -136,6 +157,9 @@ struct fuse_ring {
 
 	atomic_t queue_refs;
 
+	/* Number of CPUs per node */
+	int *numa_domain_size;
+
 	bool ready;
 };
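Taken together, the new fields assume a few invariants that are only implied by the comments: all masks are indexed by the global qid, a queue's per-node bits live in the mask selected by cpu_to_node(qid), an available queue is also a registered one, and cpu_to_node() is expected to stay below nr_numa_nodes (num_online_nodes()), which the register path relies on when indexing numa_registered_q_mask[]. The sketch below is illustrative only and not part of the patch (the helper name is made up); because the masks are updated without a global lock, transient violations are possible, so it is a development aid rather than a hard assertion.

/* Editorial sketch only: spell out the assumed mask invariants. */
static void fuse_uring_check_q_masks(struct fuse_ring *ring)
{
	int qid;

	for_each_cpu(qid, ring->avail_q_mask) {
		int node = cpu_to_node(qid);

		/* Node ids must stay within the allocated per-node arrays */
		if (WARN_ON_ONCE(node < 0 || node >= ring->nr_numa_nodes))
			continue;
		/* An available queue is expected to be registered ... */
		WARN_ON_ONCE(!cpumask_test_cpu(qid, ring->registered_q_mask));
		/* ... and present in its own node's availability mask */
		WARN_ON_ONCE(!cpumask_test_cpu(qid,
					       ring->per_numa_avail_q_mask[node]));
	}
}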