From ea8c97f10d7e9d5d66270fea7f984b012966fde7 Mon Sep 17 00:00:00 2001
From: Bernd Schubert
Date: Fri, 20 Jun 2025 17:34:53 +0200
Subject: [PATCH 01/11] fuse: add DLM_LOCK opcode

With the writeback cache enabled it is beneficial for data consistency
to tell the FUSE server when the kernel prepares a page for caching, so
that the server can react and lock the page. The same call also lets
the FUSE server decide how much data it locks; the kernel keeps that
information in its dlm lock management. If the feature is not supported
it is disabled after the first unsuccessful use.

- Add DLM_LOCK fuse opcode
- Add page lock caching for the writeback cache functionality. A FUSE
  call is sent out whenever the kernel prepares a page for the
  writeback cache, and the kernel keeps track of already acquired locks
  (except for the case that is documented in the code).
- Use rb-trees for the management of the already 'locked' page ranges
- Use rw_semaphore for synchronization in fuse_dlm_cache

(cherry picked from commit 287c8840b60d5cdcf806b16e8cc5722f2dbf0738)
---
 fs/fuse/Makefile          |   2 +-
 fs/fuse/dir.c             |   6 +
 fs/fuse/file.c            |  13 +
 fs/fuse/fuse_dlm_cache.c  | 551 ++++++++++++++++++++++++++++++++++++++
 fs/fuse/fuse_dlm_cache.h  |  50 ++++
 fs/fuse/fuse_i.h          |  18 ++
 fs/fuse/fuse_trace.h      |   1 +
 fs/fuse/inode.c           |  11 +
 include/uapi/linux/fuse.h |  36 +++
 9 files changed, 687 insertions(+), 1 deletion(-)
 create mode 100644 fs/fuse/fuse_dlm_cache.c
 create mode 100644 fs/fuse/fuse_dlm_cache.h

diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 47e649f17f25ae..f6952d826bcb33 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -10,7 +10,7 @@ obj-$(CONFIG_FUSE_FS) += fuse.o
 obj-$(CONFIG_CUSE) += cuse.o
 obj-$(CONFIG_VIRTIO_FS) += virtiofs.o
 
-fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o
+fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o fuse_dlm_cache.o
 fuse-y += iomode.o
 fuse-$(CONFIG_FUSE_DAX) += dax.o
 fuse-$(CONFIG_FUSE_IO_URING) += dev_uring.o
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 15e8eab4b719c6..25c20fd682d93d 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -6,6 +6,7 @@
   See the file COPYING.
 */
 
+#include "fuse_dlm_cache.h"
 #include "fuse_i.h"
 
 #include <linux/pagemap.h>
@@ -1923,6 +1924,8 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
 		 * truncation has already been done by OPEN.  But still
 		 * need to truncate page cache.
 		 */
+		if (fc->dlm && fc->writeback_cache)
+			fuse_dlm_cache_release_locks(fi);
 		i_size_write(inode, 0);
 		truncate_pagecache(inode, 0);
 		goto out;
@@ -2028,6 +2031,9 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
 	 */
 	if ((is_truncate || !is_wb) &&
 	    S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
+		if (fc->dlm && fc->writeback_cache)
+			fuse_dlm_unlock_range(fi, outarg.attr.size & PAGE_MASK, -1);
+
 		truncate_pagecache(inode, outarg.attr.size);
 		invalidate_inode_pages2(mapping);
 	}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 3f98aca9dc63a2..68e841f9704024 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -7,6 +7,7 @@
 */
 
 #include "fuse_i.h"
+#include "fuse_dlm_cache.h"
 
 #include <linux/pagemap.h>
 #include <linux/slab.h>
@@ -1420,6 +1421,17 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		goto writethrough;
 	}
 
+	/* if we have dlm support, acquire the lock for the area
+	 * we are writing into */
+	if (fc->dlm) {
+		/* note that a file opened with O_APPEND will have relative values
+		 * in ki_pos. 
This code is here for convenience and for the libfuse overlay test.
+		 * Filesystems should handle O_APPEND with 'direct io' to additionally
+		 * get the performance benefits of 'parallel direct writes'. */
+		loff_t pos = file->f_flags & O_APPEND ? i_size_read(inode) + iocb->ki_pos : iocb->ki_pos;
+		size_t length = iov_iter_count(from);
+		fuse_get_dlm_write_lock(file, pos, length);
+	}
 
 	return generic_file_write_iter(iocb, from);
 }
@@ -3329,6 +3341,7 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags)
 
 	INIT_LIST_HEAD(&fi->write_files);
 	INIT_LIST_HEAD(&fi->queued_writes);
+	fuse_dlm_cache_init(fi);
 	fi->writectr = 0;
 	fi->iocachectr = 0;
 	init_waitqueue_head(&fi->page_waitq);
diff --git a/fs/fuse/fuse_dlm_cache.c b/fs/fuse/fuse_dlm_cache.c
new file mode 100644
index 00000000000000..ea947f34a9f70a
--- /dev/null
+++ b/fs/fuse/fuse_dlm_cache.c
@@ -0,0 +1,551 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * FUSE page lock cache implementation
+ */
+#include "fuse_i.h"
+#include "fuse_dlm_cache.h"
+
+#include <linux/interval_tree_generic.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+
+
+/* A range of pages with a lock */
+struct fuse_dlm_range {
+	/* Interval tree node */
+	struct rb_node rb;
+	/* Start page offset (inclusive) */
+	pgoff_t start;
+	/* End page offset (inclusive) */
+	pgoff_t end;
+	/* Subtree end value for interval tree */
+	pgoff_t __subtree_end;
+	/* Lock mode */
+	enum fuse_page_lock_mode mode;
+	/* Temporary list entry for operations */
+	struct list_head list;
+};
+
+/* Lock modes for FUSE page cache */
+#define FUSE_PCACHE_LK_READ 1	/* Shared read lock */
+#define FUSE_PCACHE_LK_WRITE 2	/* Exclusive write lock */
+
+/* Interval tree definitions for page ranges */
+static inline pgoff_t fuse_dlm_range_start(struct fuse_dlm_range *range)
+{
+	return range->start;
+}
+
+static inline pgoff_t fuse_dlm_range_last(struct fuse_dlm_range *range)
+{
+	return range->end;
+}
+
+INTERVAL_TREE_DEFINE(struct fuse_dlm_range, rb, pgoff_t, __subtree_end,
+		     fuse_dlm_range_start, fuse_dlm_range_last, static,
+		     fuse_page_it);
+
+/**
+ * fuse_dlm_cache_init - Initialize a page cache lock manager
+ * @inode: The fuse inode whose lock cache to initialize
+ *
+ * Initialize a page cache lock manager for a FUSE inode.
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int fuse_dlm_cache_init(struct fuse_inode *inode)
+{
+	struct fuse_dlm_cache *cache = &inode->dlm_locked_areas;
+
+	if (!cache)
+		return -EINVAL;
+
+	init_rwsem(&cache->lock);
+	cache->ranges = RB_ROOT_CACHED;
+
+	return 0;
+}
+
+/**
+ * fuse_dlm_cache_release_locks - Clean up a page cache lock manager
+ * @inode: The fuse inode whose lock cache to clean up
+ *
+ * Release all locks and free all resources associated with the cache. 
+ */
+void fuse_dlm_cache_release_locks(struct fuse_inode *inode)
+{
+	struct fuse_dlm_cache *cache = &inode->dlm_locked_areas;
+	struct fuse_dlm_range *range;
+	struct rb_node *node;
+
+	if (!cache)
+		return;
+
+	/* Release all locks */
+	down_write(&cache->lock);
+	while ((node = rb_first_cached(&cache->ranges)) != NULL) {
+		range = rb_entry(node, struct fuse_dlm_range, rb);
+		fuse_page_it_remove(range, &cache->ranges);
+		kfree(range);
+	}
+	up_write(&cache->lock);
+}
+
+/**
+ * fuse_dlm_find_overlapping - Find a range that overlaps with [start, end]
+ * @cache: The page cache
+ * @start: Start page offset
+ * @end: End page offset
+ *
+ * Return: Pointer to the first overlapping range, or NULL if none found
+ */
+static struct fuse_dlm_range *
+fuse_dlm_find_overlapping(struct fuse_dlm_cache *cache, pgoff_t start,
+			  pgoff_t end)
+{
+	return fuse_page_it_iter_first(&cache->ranges, start, end);
+}
+
+/**
+ * fuse_dlm_try_merge - Try to merge ranges within a specific region
+ * @cache: The page cache
+ * @start: Start page offset
+ * @end: End page offset
+ *
+ * Attempt to merge ranges within and adjacent to the specified region
+ * that have the same lock mode.
+ */
+static void fuse_dlm_try_merge(struct fuse_dlm_cache *cache, pgoff_t start,
+			       pgoff_t end)
+{
+	struct fuse_dlm_range *range, *next;
+	struct rb_node *node;
+
+	if (!cache)
+		return;
+
+	/* Find the first range that might need merging */
+	range = NULL;
+	node = rb_first_cached(&cache->ranges);
+	while (node) {
+		range = rb_entry(node, struct fuse_dlm_range, rb);
+		if (range->end + 1 >= start)	/* avoids pgoff_t underflow at start == 0 */
+			break;
+		node = rb_next(node);
+	}
+
+	if (!range || range->start > end + 1)
+		return;
+
+	/* Try to merge ranges in and around the specified region */
+	while (range && range->start <= end + 1) {
+		/* Get next range before we potentially modify the tree */
+		next = NULL;
+		if (rb_next(&range->rb)) {
+			next = rb_entry(rb_next(&range->rb),
+					struct fuse_dlm_range, rb);
+		}
+
+		/* Try to merge with next range if adjacent and same mode */
+		if (next && range->mode == next->mode &&
+		    range->end + 1 == next->start) {
+			/* Merge ranges */
+			range->end = next->end;
+
+			/* Remove next from tree */
+			fuse_page_it_remove(next, &cache->ranges);
+			kfree(next);
+
+			/* Continue with the same range */
+			continue;
+		}
+
+		/* Move to next range */
+		range = next;
+	}
+}
+
+/**
+ * fuse_dlm_lock_range - Lock a range of pages
+ * @inode: The fuse inode
+ * @start: Start page offset
+ * @end: End page offset
+ * @mode: Lock mode (read or write)
+ *
+ * Add a locked range for the specified range of pages.
+ * If parts of the range are already locked, only add the remaining parts.
+ * For overlapping ranges, handle lock compatibility:
+ * - READ locks are compatible with existing READ locks
+ * - READ locks are compatible with existing WRITE locks (downgrade not needed)
+ * - WRITE locks need to upgrade existing READ locks
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int fuse_dlm_lock_range(struct fuse_inode *inode, pgoff_t start,
+			pgoff_t end, enum fuse_page_lock_mode mode)
+{
+	struct fuse_dlm_cache *cache = &inode->dlm_locked_areas;
+	struct fuse_dlm_range *range, *new_range, *next;
+	int lock_mode;
+	int ret = 0;
+	LIST_HEAD(to_lock);
+	LIST_HEAD(to_upgrade);
+	pgoff_t current_start = start;
+
+	if (!cache || start > end)
+		return -EINVAL;
+
+	/* Convert to lock mode */
+	lock_mode = (mode == FUSE_PAGE_LOCK_READ) ? 
FUSE_PCACHE_LK_READ : + FUSE_PCACHE_LK_WRITE; + + down_write(&cache->lock); + + /* Find all ranges that overlap with [start, end] */ + range = fuse_page_it_iter_first(&cache->ranges, start, end); + while (range) { + /* Get next overlapping range before we potentially modify the tree */ + next = fuse_page_it_iter_next(range, start, end); + + /* Check lock compatibility */ + if (lock_mode == FUSE_PCACHE_LK_WRITE && + lock_mode != range->mode) { + /* we own the lock but have to update it. */ + list_add_tail(&range->list, &to_upgrade); + } + /* If WRITE lock already exists - nothing to do */ + + /* If there's a gap before this range, we need to add the missing range */ + if (current_start < range->start) { + new_range = kmalloc(sizeof(*new_range), GFP_KERNEL); + if (!new_range) { + ret = -ENOMEM; + goto out_free; + } + + new_range->start = current_start; + new_range->end = range->start - 1; + new_range->mode = lock_mode; + INIT_LIST_HEAD(&new_range->list); + + list_add_tail(&new_range->list, &to_lock); + } + + /* Move current_start past this range */ + current_start = max(current_start, range->end + 1); + + /* Move to next range */ + range = next; + } + + /* If there's a gap after the last range to the end, extend the range */ + if (current_start <= end) { + new_range = kmalloc(sizeof(*new_range), GFP_KERNEL); + if (!new_range) { + ret = -ENOMEM; + goto out_free; + } + + new_range->start = current_start; + new_range->end = end; + new_range->mode = lock_mode; + INIT_LIST_HEAD(&new_range->list); + + list_add_tail(&new_range->list, &to_lock); + } + + /* update locks, if any lock is in this list it has the wrong mode */ + list_for_each_entry(range, &to_upgrade, list) { + /* Update the lock mode */ + range->mode = lock_mode; + } + + /* Add all new ranges to the tree */ + list_for_each_entry(new_range, &to_lock, list) { + /* Add to interval tree */ + fuse_page_it_insert(new_range, &cache->ranges); + } + + /* Try to merge adjacent ranges with the same mode */ + fuse_dlm_try_merge(cache, start, end); + + up_write(&cache->lock); + return 0; + +out_free: + /* Free any ranges we allocated but didn't insert */ + while (!list_empty(&to_lock)) { + new_range = + list_first_entry(&to_lock, struct fuse_dlm_range, list); + list_del(&new_range->list); + kfree(new_range); + } + + /* Restore original lock modes for any partially upgraded locks */ + list_for_each_entry(range, &to_upgrade, list) { + if (lock_mode == FUSE_PCACHE_LK_WRITE) { + /* We upgraded this lock but failed later, downgrade it back */ + range->mode = FUSE_PCACHE_LK_READ; + } + } + + up_write(&cache->lock); + return ret; +} + +/** + * fuse_dlm_punch_hole - Punch a hole in a locked range + * @cache: The page cache + * @start: Start page offset of the hole + * @end: End page offset of the hole + * + * Create a hole in a locked range by splitting it into two ranges. 
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int fuse_dlm_punch_hole(struct fuse_dlm_cache *cache, pgoff_t start,
+			       pgoff_t end)
+{
+	struct fuse_dlm_range *range, *new_range;
+	int ret = 0;
+
+	if (!cache || start > end)
+		return -EINVAL;
+
+	/* Find a range that contains [start, end] */
+	range = fuse_dlm_find_overlapping(cache, start, end);
+	if (!range) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* If the hole is at the beginning of the range */
+	if (start == range->start) {
+		range->start = end + 1;
+		goto out;
+	}
+
+	/* If the hole is at the end of the range */
+	if (end == range->end) {
+		range->end = start - 1;
+		goto out;
+	}
+
+	/* The hole is in the middle, need to split */
+	new_range = kmalloc(sizeof(*new_range), GFP_KERNEL);
+	if (!new_range) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* Copy properties from original range */
+	*new_range = *range;
+	INIT_LIST_HEAD(&new_range->list);
+
+	/* Adjust ranges */
+	new_range->start = end + 1;
+	range->end = start - 1;
+
+	/* Update interval tree */
+	fuse_page_it_remove(range, &cache->ranges);
+	fuse_page_it_insert(range, &cache->ranges);
+	fuse_page_it_insert(new_range, &cache->ranges);
+
+out:
+	return ret;
+}
+
+/**
+ * fuse_dlm_unlock_range - Unlock a range of pages
+ * @inode: The fuse inode
+ * @start: Start page offset
+ * @end: End page offset
+ *
+ * Release locks on the specified range of pages.
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int fuse_dlm_unlock_range(struct fuse_inode *inode,
+			  pgoff_t start, pgoff_t end)
+{
+	struct fuse_dlm_cache *cache = &inode->dlm_locked_areas;
+	struct fuse_dlm_range *range, *next;
+	int ret = 0;
+
+	if (!cache)
+		return -EINVAL;
+
+	down_write(&cache->lock);
+
+	/* Find all ranges that overlap with [start, end] */
+	range = fuse_page_it_iter_first(&cache->ranges, start, end);
+	while (range) {
+		/* Get next overlapping range before we potentially modify the tree */
+		next = fuse_page_it_iter_next(range, start, end);
+
+		/* Check if we need to punch a hole */
+		if (start > range->start && end < range->end) {
+			/* Punch a hole in the middle */
+			ret = fuse_dlm_punch_hole(cache, start, end);
+			if (ret)
+				goto out;
+			/* After punching a hole, we're done */
+			break;
+		} else if (start > range->start) {
+			/* Adjust the end of the range */
+			range->end = start - 1;
+		} else if (end < range->end) {
+			/* Adjust the start of the range */
+			range->start = end + 1;
+		} else {
+			/* Complete overlap, remove the range */
+			fuse_page_it_remove(range, &cache->ranges);
+			kfree(range);
+		}
+
+		range = next;
+	}
+
+out:
+	up_write(&cache->lock);
+	return ret;
+}
+
+/**
+ * fuse_dlm_range_is_locked - Check if a page range is already locked
+ * @inode: The fuse inode
+ * @start: Start page offset
+ * @end: End page offset
+ * @mode: Lock mode that must be held over the whole range
+ *
+ * Check if the specified range of pages is already locked.
+ * The entire range must be locked for this to return true. 
+ *
+ * Return: true if the entire range is locked, false otherwise
+ */
+bool fuse_dlm_range_is_locked(struct fuse_inode *inode, pgoff_t start,
+			      pgoff_t end, enum fuse_page_lock_mode mode)
+{
+	struct fuse_dlm_cache *cache = &inode->dlm_locked_areas;
+	struct fuse_dlm_range *range;
+	int lock_mode = 0;
+	pgoff_t current_start = start;
+
+	if (!cache || start > end)
+		return false;
+
+	/* Convert to lock mode */
+	if (mode == FUSE_PAGE_LOCK_READ)
+		lock_mode = FUSE_PCACHE_LK_READ;
+	else if (mode == FUSE_PAGE_LOCK_WRITE)
+		lock_mode = FUSE_PCACHE_LK_WRITE;
+
+	down_read(&cache->lock);
+
+	/* Find the first range that overlaps with [start, end] */
+	range = fuse_dlm_find_overlapping(cache, start, end);
+
+	/* Check if the entire range is covered */
+	while (range && current_start <= end) {
+		/* If we're checking for a specific mode, verify it matches */
+		if (lock_mode && range->mode != lock_mode) {
+			/* Wrong lock mode */
+			up_read(&cache->lock);
+			return false;
+		}
+
+		/* Check if there's a gap before this range */
+		if (current_start < range->start) {
+			/* Found a gap */
+			up_read(&cache->lock);
+			return false;
+		}
+
+		/* Move current_start past this range */
+		current_start = range->end + 1;
+
+		/* Get next overlapping range */
+		range = fuse_page_it_iter_next(range, start, end);
+	}
+
+	/* Check if we covered the entire range */
+	if (current_start <= end) {
+		/* There's a gap at the end */
+		up_read(&cache->lock);
+		return false;
+	}
+
+	up_read(&cache->lock);
+	return true;
+}
+
+/**
+ * Request a dlm lock from the fuse server.
+ */
+void fuse_get_dlm_write_lock(struct file *file, loff_t offset,
+			     size_t length)
+{
+	struct fuse_file *ff = file->private_data;
+	struct inode *inode = file_inode(file);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_mount *fm = ff->fm;
+	loff_t end = (offset + length - 1) | (PAGE_SIZE - 1);
+	FUSE_ARGS(args);
+	struct fuse_dlm_lock_in inarg;
+	struct fuse_dlm_lock_out outarg;
+	int err;
+
+	/* note that the offset and length don't have to be page aligned here
+	 * but since we only get here on writeback caching we will send out
+	 * page aligned requests */
+	offset &= PAGE_MASK;
+
+	/* note that this can be run from different processes
+	 * at the same time. 
It is intentionally not protected
+	 * since a DLM implementation in the FUSE server should take care
+	 * of any races in lock requests */
+	if (fuse_dlm_range_is_locked(fi, offset,
+				     end, FUSE_PAGE_LOCK_WRITE))
+		return; /* we already have this area locked */
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.fh = ff->fh;
+
+	inarg.offset = offset;
+	inarg.size = end - offset + 1;
+	inarg.type = FUSE_DLM_LOCK_WRITE;
+
+	args.opcode = FUSE_DLM_WB_LOCK;
+	args.nodeid = get_node_id(inode);
+	args.in_numargs = 1;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	args.out_numargs = 1;
+	args.out_args[0].size = sizeof(outarg);
+	args.out_args[0].value = &outarg;
+	err = fuse_simple_request(fm, &args);
+	if (err == -ENOSYS) {
+		/* fuse server does not support dlm, save the info */
+		fc->dlm = 0;
+		return;
+	}
+
+	if (outarg.locksize < end - offset + 1) {
+		/* fuse server is seriously broken */
+		pr_warn("fuse: dlm lock request for %llu bytes returned %u bytes\n",
+			end - offset + 1, outarg.locksize);
+		fuse_abort_conn(fc);
+		return;
+	}
+
+	if (err)
+		return;
+	else
+		/* ignore any errors here, there is no way we can react appropriately */
+		fuse_dlm_lock_range(fi, offset,
+				    offset + outarg.locksize - 1,
+				    FUSE_PAGE_LOCK_WRITE);
+}
diff --git a/fs/fuse/fuse_dlm_cache.h b/fs/fuse/fuse_dlm_cache.h
new file mode 100644
index 00000000000000..98b27a2c15d8ba
--- /dev/null
+++ b/fs/fuse/fuse_dlm_cache.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * FUSE page cache lock implementation
+ */
+
+#ifndef _FS_FUSE_DLM_CACHE_H
+#define _FS_FUSE_DLM_CACHE_H
+
+#include <linux/fs.h>
+#include <linux/rbtree.h>
+#include <linux/rwsem.h>
+#include <linux/types.h>
+
+
+struct fuse_inode;
+
+/* Lock modes for page ranges */
+enum fuse_page_lock_mode { FUSE_PAGE_LOCK_READ, FUSE_PAGE_LOCK_WRITE };
+
+/* Page cache lock manager */
+struct fuse_dlm_cache {
+	/* Lock protecting the tree */
+	struct rw_semaphore lock;
+	/* Interval tree of locked ranges */
+	struct rb_root_cached ranges;
+};
+
+/* Initialize a page cache lock manager */
+int fuse_dlm_cache_init(struct fuse_inode *inode);
+
+/* Clean up a page cache lock manager */
+void fuse_dlm_cache_release_locks(struct fuse_inode *inode);
+
+/* Lock a range of pages */
+int fuse_dlm_lock_range(struct fuse_inode *inode, pgoff_t start,
+			pgoff_t end, enum fuse_page_lock_mode mode);
+
+/* Unlock a range of pages */
+int fuse_dlm_unlock_range(struct fuse_inode *inode, pgoff_t start,
+			  pgoff_t end);
+
+/* Check if a page range is already locked */
+bool fuse_dlm_range_is_locked(struct fuse_inode *inode, pgoff_t start,
+			      pgoff_t end, enum fuse_page_lock_mode mode);
+
+/* This is the interface to the filesystem */
+void fuse_get_dlm_write_lock(struct file *file, loff_t offset,
+			     size_t length);
+
+#endif /* _FS_FUSE_DLM_CACHE_H */
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index a2cd4a9f909bcc..f86d0c0d17d5b6 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -31,6 +31,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/refcount.h>
 #include <linux/user_namespace.h>
+#include "fuse_dlm_cache.h"
 
 /** Default max number of pages that can be used in a single read request */
 #define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32
@@ -84,6 +85,17 @@ struct fuse_submount_lookup {
 	struct fuse_forget_link *forget;
 };
 
+/**
+ * Data structure to record that we have requested dlm locks
+ * for the given area from the fuse server.
+ */
+struct dlm_locked_area
+{
+	struct list_head list;
+	loff_t offset;
+	size_t size;
+};
+
 /** FUSE inode */
 struct fuse_inode {
 	/** Inode data */
@@ -142,6 +154,9 @@ struct fuse_inode {
 
 	/* List of writepage requests (pending or 
sent) */
 	struct rb_root writepages;
+
+	/* dlm locked areas we have sent lock requests for */
+	struct fuse_dlm_cache dlm_locked_areas;
 };
 
 /* readdir cache (directory only) */
@@ -841,6 +856,9 @@ struct fuse_conn {
 	/* Is statx not implemented by fs? */
 	unsigned int no_statx:1;
 
+	/* do we have support for dlm in the fs? */
+	unsigned int dlm:1;
+
 	/* Use io_uring for communication */
 	unsigned int io_uring;
 
diff --git a/fs/fuse/fuse_trace.h b/fs/fuse/fuse_trace.h
index 393c630e772635..9976e31a51a9c9 100644
--- a/fs/fuse/fuse_trace.h
+++ b/fs/fuse/fuse_trace.h
@@ -58,6 +58,7 @@
 	EM( FUSE_SYNCFS,		"FUSE_SYNCFS")		\
 	EM( FUSE_TMPFILE,		"FUSE_TMPFILE")		\
 	EM( FUSE_STATX,			"FUSE_STATX")		\
+	EM( FUSE_DLM_WB_LOCK,		"FUSE_DLM_WB_LOCK")	\
 	EMe(CUSE_INIT,			"CUSE_INIT")
 
 /*
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 35f71b523237fd..bfcd662e18b927 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -7,6 +7,7 @@
 */
 
 #include "fuse_i.h"
+#include "fuse_dlm_cache.h"
 #include "dev_uring_i.h"
 
 #include <linux/pagemap.h>
@@ -183,6 +184,7 @@ static void fuse_evict_inode(struct inode *inode)
 	if (S_ISREG(inode->i_mode) && !fuse_is_bad(inode)) {
 		WARN_ON(!list_empty(&fi->write_files));
 		WARN_ON(!list_empty(&fi->queued_writes));
+		fuse_dlm_cache_release_locks(fi);
 	}
 }
 
@@ -569,6 +571,14 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid,
 		pg_end = -1;
 	else
 		pg_end = (offset + len - 1) >> PAGE_SHIFT;
+
+	if (fc->dlm && fc->writeback_cache)
+		/* unlock the range from the beginning of the first page
+		 * in the given range to the last byte of the last page */
+		fuse_dlm_unlock_range(fi,
+				      pg_start << PAGE_SHIFT,
+				      (pg_end << PAGE_SHIFT) | (PAGE_SIZE - 1));
+
 	invalidate_inode_pages2_range(inode->i_mapping,
 				      pg_start, pg_end);
 }
@@ -973,6 +983,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
 	fc->blocked = 0;
 	fc->initialized = 0;
 	fc->connected = 1;
+	fc->dlm = 1;
 	atomic64_set(&fc->attr_version, 1);
 	atomic64_set(&fc->evict_ctr, 1);
 	get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 76ba774993db0d..bab03e93f79dd1 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -638,6 +638,7 @@ enum fuse_opcode {
 	FUSE_SYNCFS		= 50,
 	FUSE_TMPFILE		= 51,
 	FUSE_STATX		= 52,
+	FUSE_DLM_WB_LOCK	= 53,
 
 	/* CUSE specific operations */
 	CUSE_INIT		= 4096,
@@ -1172,6 +1173,41 @@ struct fuse_supp_groups {
 	uint32_t	groups[];
 };
 
+/**
+ * Type of the dlm lock requested
+ */
+enum fuse_dlm_lock_type {
+	FUSE_DLM_LOCK_NONE = 0,
+	FUSE_DLM_LOCK_READ = 1,
+	FUSE_DLM_LOCK_WRITE = 2
+};
+
+/**
+ * struct fuse_dlm_lock_in - Lock request
+ * @fh: file handle
+ * @offset: offset into the file
+ * @size: size of the locked region
+ * @type: type of lock
+ */
+struct fuse_dlm_lock_in {
+	uint64_t	fh;
+	uint64_t	offset;
+	uint32_t	size;
+	uint32_t	type;
+	uint64_t	reserved;
+};
+
+/**
+ * struct fuse_dlm_lock_out - Lock response
+ * @locksize: how many bytes were locked by the call
+ *            (most of the time we want to lock more than is requested
+ *            to reduce the number of calls)
+ */
+struct fuse_dlm_lock_out {
+	uint32_t	locksize;
+	uint32_t	padding;
+};
+
 /**
  * Size of the ring buffer header
  */

From 4a26f96b2405bf1d994125e57a8f5e01733dd4a7 Mon Sep 17 00:00:00 2001
From: Yong Ze Chen
Date: Tue, 8 Jul 2025 06:41:45 +0000
Subject: [PATCH 02/11] fuse: invalidate inode aliases when doing inode
 invalidation

Add support to invalidate inode aliases when doing inode invalidation.
This is useful for distributed file systems, which use DLM for cache
coherency. 
When a client loses its inode lock, it should invalidate its inode and
dentry caches, since another client may delete the file after acquiring
the inode lock.

Signed-off-by: Yong Ze Chen
(cherry picked from commit 49720b5c84ada61feeb09da9ad4b9a0a40694792)
---
 fs/fuse/fuse_i.h          |  6 +++++
 fs/fuse/inode.c           | 50 ++++++++++++++++++++++++++++++++++++++-
 include/uapi/linux/fuse.h |  4 ++++
 3 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index f86d0c0d17d5b6..e62c9ada07bcc5 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -722,6 +722,12 @@ struct fuse_conn {
 	 */
 	unsigned handle_killpriv_v2:1;
 
+	/* invalidate inode entries when doing inode invalidation */
+	unsigned inval_inode_entries:1;
+
+	/* expire inode entries when doing inode invalidation */
+	unsigned expire_inode_entries:1;
+
 	/*
 	 * The following bitfields are only for optimization purposes
 	 * and hence races in setting them will not cause malfunction
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index bfcd662e18b927..9912c3a9b559a1 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -546,6 +546,45 @@ struct inode *fuse_ilookup(struct fuse_conn *fc, u64 nodeid,
 	return NULL;
 }
 
+static void fuse_prune_aliases(struct inode *inode)
+{
+	struct dentry *dentry;
+
+	spin_lock(&inode->i_lock);
+	hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) {
+		fuse_invalidate_entry_cache(dentry);
+	}
+	spin_unlock(&inode->i_lock);
+
+	d_prune_aliases(inode);
+}
+
+static void fuse_invalidate_inode_entry(struct inode *inode)
+{
+	struct dentry *dentry;
+
+	if (S_ISDIR(inode->i_mode)) {
+		/* For directories, use d_invalidate to handle children and submounts */
+		dentry = d_find_alias(inode);
+		if (dentry) {
+			d_invalidate(dentry);
+			fuse_invalidate_entry_cache(dentry);
+			dput(dentry);
+		}
+	} else {
+		/* For regular files, just unhash the dentry */
+		spin_lock(&inode->i_lock);
+		hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) {
+			spin_lock(&dentry->d_lock);
+			if (!d_unhashed(dentry))
+				__d_drop(dentry);
+			spin_unlock(&dentry->d_lock);
+			fuse_invalidate_entry_cache(dentry);
+		}
+		spin_unlock(&inode->i_lock);
+	}
+}
+
 int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid,
 			     loff_t offset, loff_t len)
 {
@@ -563,6 +602,11 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid,
 	fi->attr_version = atomic64_inc_return(&fc->attr_version);
 	spin_unlock(&fi->lock);
 
+	if (fc->inval_inode_entries)
+		fuse_invalidate_inode_entry(inode);
+	else if (fc->expire_inode_entries)
+		fuse_prune_aliases(inode);
+
 	fuse_invalidate_attr(inode);
 	forget_all_cached_acls(inode);
 	if (offset >= 0) {
@@ -1377,6 +1421,10 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
 			fc->io_uring = 1;
 		if (flags & FUSE_NO_EXPORT_SUPPORT)
 			fm->sb->s_export_op = &fuse_export_fid_operations;
+		if (flags & FUSE_INVAL_INODE_ENTRY)
+			fc->inval_inode_entries = 1;
+		if (flags & FUSE_EXPIRE_INODE_ENTRY)
+			fc->expire_inode_entries = 1;
 	} else {
 		ra_pages = fc->max_read / PAGE_SIZE;
 		fc->no_lock = 1;
@@ -1427,7 +1475,7 @@ void fuse_send_init(struct fuse_mount *fm)
 		    FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT |
 		    FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP |
 		    FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP |
-		    FUSE_NO_EXPORT_SUPPORT;
+		    FUSE_NO_EXPORT_SUPPORT | FUSE_INVAL_INODE_ENTRY | FUSE_EXPIRE_INODE_ENTRY;
 #ifdef CONFIG_FUSE_DAX
 	if (fm->fc->dax)
 		flags |= FUSE_MAP_ALIGNMENT;
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index bab03e93f79dd1..eda798a70f90e1 100644
--- 
a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -427,6 +427,8 @@ struct fuse_file_lock { * FUSE_DIRECT_IO_ALLOW_MMAP: allow shared mmap in FOPEN_DIRECT_IO mode. * FUSE_NO_EXPORT_SUPPORT: explicitly disable export support * FUSE_OVER_IO_URING: Indicate that client supports io-uring + * FUSE_INVAL_INODE_ENTRY: invalidate inode aliases when doing inode invalidation + * FUSE_EXPIRE_INODE_ENTRY: expire inode aliases when doing inode invalidation */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -471,6 +473,8 @@ struct fuse_file_lock { /* Obsolete alias for FUSE_DIRECT_IO_ALLOW_MMAP */ #define FUSE_DIRECT_IO_RELAX FUSE_DIRECT_IO_ALLOW_MMAP #define FUSE_OVER_IO_URING (1ULL << 41) +#define FUSE_INVAL_INODE_ENTRY (1ULL << 60) +#define FUSE_EXPIRE_INODE_ENTRY (1ULL << 61) /** * CUSE INIT request/reply flags From 60b62fe47398c0e89846f30ad658b0295421c015 Mon Sep 17 00:00:00 2001 From: Cheng Ding Date: Thu, 17 Jul 2025 17:04:16 +0000 Subject: [PATCH 03/11] fuse: Renumber FUSE_DLM_WB_LOCK to 100 Renumber the operation code to a high value to avoid conflicts with upstream. (cherry picked from commit 27a0e9ea714f7fcf3ee40f977be6a17c10766509) --- fs/fuse/fuse_trace.h | 2 +- include/uapi/linux/fuse.h | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/fuse/fuse_trace.h b/fs/fuse/fuse_trace.h index 9976e31a51a9c9..e81c93b9614627 100644 --- a/fs/fuse/fuse_trace.h +++ b/fs/fuse/fuse_trace.h @@ -58,7 +58,7 @@ EM( FUSE_SYNCFS, "FUSE_SYNCFS") \ EM( FUSE_TMPFILE, "FUSE_TMPFILE") \ EM( FUSE_STATX, "FUSE_STATX") \ - EM( FUSE_DLM_WB_LOCK, "FUSE_DLM_WB_LOCK") \ + EM( FUSE_DLM_WB_LOCK, "FUSE_DLM_WB_LOCK") \ EMe(CUSE_INIT, "CUSE_INIT") /* diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index eda798a70f90e1..fe1a9223a50c6e 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -642,7 +642,9 @@ enum fuse_opcode { FUSE_SYNCFS = 50, FUSE_TMPFILE = 51, FUSE_STATX = 52, - FUSE_DLM_WB_LOCK = 53, + + /* Operations which have not been merged into upstream */ + FUSE_DLM_WB_LOCK = 100, /* CUSE specific operations */ CUSE_INIT = 4096, @@ -1183,7 +1185,7 @@ struct fuse_supp_groups { enum fuse_dlm_lock_type { FUSE_DLM_LOCK_NONE = 0, FUSE_DLM_LOCK_READ = 1, - FUSE_DLM_LOCK_WRITE = 2 + FUSE_DLM_LOCK_WRITE = 2, }; /** From 466745090f60fc621b68a22fc6419ce5e56f9ef7 Mon Sep 17 00:00:00 2001 From: Cheng Ding Date: Wed, 16 Jul 2025 03:18:06 +0000 Subject: [PATCH 04/11] fuse: Send DLM_WB_LOCK request in page_mkwrite handler Send a DLM_WB_LOCK request in the page_mkwrite handler to enable FUSE filesystems to acquire a distributed lock manager (DLM) lock for protecting upcoming dirty pages when a previously read-only mapped page is about to be written. Signed-off-by: Cheng Ding (cherry picked from commit ec36c455214837e9ce0d3f3385a0bb50dcfb51db) --- fs/fuse/file.c | 64 ++++++++++++++++++++++++++++++++++++++- include/uapi/linux/fuse.h | 1 + 2 files changed, 64 insertions(+), 1 deletion(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 68e841f9704024..8a344ac12692e7 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2499,6 +2499,57 @@ static void fuse_vma_close(struct vm_area_struct *vma) mapping_set_error(vma->vm_file->f_mapping, err); } +/** + * Request a DLM lock from the FUSE server. + * + * This routine is similar to fuse_get_dlm_write_lock(), but it + * does not cache the DLM lock in the kernel. 
+ */
+static int fuse_get_page_mkwrite_lock(struct file *file, loff_t offset, size_t length)
+{
+	struct fuse_file *ff = file->private_data;
+	struct inode *inode = file_inode(file);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_mount *fm = ff->fm;
+
+	FUSE_ARGS(args);
+	struct fuse_dlm_lock_in inarg;
+	struct fuse_dlm_lock_out outarg;
+	int err;
+
+	if (WARN_ON_ONCE((offset & ~PAGE_MASK) || (length & ~PAGE_MASK)))
+		return -EIO;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.fh = ff->fh;
+
+	inarg.offset = offset;
+	inarg.size = length;
+	inarg.type = FUSE_DLM_PAGE_MKWRITE;
+
+	args.opcode = FUSE_DLM_WB_LOCK;
+	args.nodeid = get_node_id(inode);
+	args.in_numargs = 1;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	args.out_numargs = 1;
+	args.out_args[0].size = sizeof(outarg);
+	args.out_args[0].value = &outarg;
+	err = fuse_simple_request(fm, &args);
+	if (err == -ENOSYS) {
+		fc->dlm = 0;
+		err = 0;
+	}
+
+	if (!err && outarg.locksize < length) {
+		/* fuse server is seriously broken */
+		pr_warn("fuse: dlm lock request for %zu bytes returned %u bytes\n",
+			length, outarg.locksize);
+		fuse_abort_conn(fc);
+		err = -EINVAL;
+	}
+	return err;
+}
 /*
  * Wait for writeback against this page to complete before allowing it
  * to be marked dirty again, and hence written back again, possibly
@@ -2517,7 +2568,18 @@
 static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf)
 {
 	struct page *page = vmf->page;
-	struct inode *inode = file_inode(vmf->vma->vm_file);
+	struct file *file = vmf->vma->vm_file;
+	struct inode *inode = file_inode(file);
+	struct fuse_mount *fm = get_fuse_mount(inode);
+
+	if (fm->fc->dlm) {
+		loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
+		size_t length = PAGE_SIZE;
+		int err = fuse_get_page_mkwrite_lock(file, pos, length);
+		if (err < 0) {
+			return vmf_error(err);
+		}
+	}
 
 	file_update_time(vmf->vma->vm_file);
 	lock_page(page);
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index fe1a9223a50c6e..946533327b1563 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -1186,6 +1186,7 @@ enum fuse_dlm_lock_type {
 	FUSE_DLM_LOCK_NONE = 0,
 	FUSE_DLM_LOCK_READ = 1,
 	FUSE_DLM_LOCK_WRITE = 2,
+	FUSE_DLM_PAGE_MKWRITE = 3,
 };
 
 /**

From 8273bdc7167b27a9af31bae315ea4becc275188c Mon Sep 17 00:00:00 2001
From: Cheng Ding
Date: Wed, 16 Jul 2025 03:20:08 +0000
Subject: [PATCH 05/11] fuse: Allow read_folio to retry page fault and read
 operations

Allow read_folio to return an EAGAIN error and translate it to
AOP_TRUNCATED_PAGE so that page fault and read operations are retried.
This prevents a deadlock caused by folio lock/DLM lock order reversal:

- Fault or read operations acquire the folio lock first, then the DLM
  lock.
- The FUSE daemon blocks new DLM lock acquisitions while it is
  invalidating the page cache; invalidate_inode_pages2_range() acquires
  the folio lock.

To prevent the deadlock, the FUSE daemon fails its DLM lock acquisition
with EAGAIN if it detects an in-flight page cache invalidation. 
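Schematically, the lock-order reversal being avoided looks like this
(a sketch derived from the description above, not from the original
commit message):

  fault/read path                   FUSE daemon
  ---------------                   -----------
  takes the folio lock              starts a page cache invalidation
  requests the DLM lock  ------->   invalidate_inode_pages2_range()
  (daemon would block it)           waits for the folio lock

Failing the DLM request with EAGAIN instead of blocking breaks the
cycle: the kernel returns AOP_TRUNCATED_PAGE, drops the folio lock and
retries the operation.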
Signed-off-by: Cheng Ding (cherry picked from commit 8ecf1182053891c6458b10be1272d2d562492fbd) --- fs/fuse/file.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 8a344ac12692e7..358c7e52665daf 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -879,8 +879,11 @@ static int fuse_do_readpage(struct file *file, struct page *page) fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ); res = fuse_simple_request(fm, &ia.ap.args); - if (res < 0) + if (res < 0) { + if (res == -EAGAIN) + res = AOP_TRUNCATED_PAGE; return res; + } /* * Short read means EOF. If file size is larger, truncate it */ From 29b5777282eb0641479ef4d3fe32aaab6fb18aac Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 17 Jul 2025 16:26:51 -0700 Subject: [PATCH 06/11] fuse: flush pending fuse events before aborting the connection generic/488 fails with fuse2fs in the following fashion: generic/488 _check_generic_filesystem: filesystem on /dev/sdf is inconsistent (see /var/tmp/fstests/generic/488.full for details) This test opens a large number of files, unlinks them (which really just renames them to fuse hidden files), closes the program, unmounts the filesystem, and runs fsck to check that there aren't any inconsistencies in the filesystem. Unfortunately, the 488.full file shows that there are a lot of hidden files left over in the filesystem, with incorrect link counts. Tracing fuse_request_* shows that there are a large number of FUSE_RELEASE commands that are queued up on behalf of the unlinked files at the time that fuse_conn_destroy calls fuse_abort_conn. Had the connection not aborted, the fuse server would have responded to the RELEASE commands by removing the hidden files; instead they stick around. Create a function to push all the background requests to the queue and then wait for the number of pending events to hit zero, and call this before fuse_abort_conn. That way, all the pending events are processed by the fuse server and we don't end up with a corrupt filesystem. Signed-off-by: Darrick J. Wong (cherry picked from commit d4262f9cf5232394d518207863d1ad79f52b179e) --- fs/fuse/dev.c | 38 ++++++++++++++++++++++++++++++++++++++ fs/fuse/fuse_i.h | 6 ++++++ fs/fuse/inode.c | 1 + 3 files changed, 45 insertions(+) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index f4180d3ecbf924..7ddb08b96d2a25 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -23,6 +23,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include "fuse_trace.h" @@ -2147,6 +2148,43 @@ static void end_polls(struct fuse_conn *fc) } } +/* + * Flush all pending requests and wait for them. Only call this function when + * it is no longer possible for other threads to add requests. + */ +void fuse_flush_requests(struct fuse_conn *fc, unsigned long timeout) +{ + unsigned long deadline; + + spin_lock(&fc->lock); + if (!fc->connected) { + spin_unlock(&fc->lock); + return; + } + + /* Push all the background requests to the queue. */ + spin_lock(&fc->bg_lock); + fc->blocked = 0; + fc->max_background = UINT_MAX; + flush_bg_queue(fc); + spin_unlock(&fc->bg_lock); + spin_unlock(&fc->lock); + + /* + * Wait 30s for all the events to complete or abort. Touch the + * watchdog once per second so that we don't trip the hangcheck timer + * while waiting for the fuse server. 
+ */ + deadline = jiffies + timeout; + smp_mb(); + while (fc->connected && + (!timeout || time_before(jiffies, deadline)) && + wait_event_timeout(fc->blocked_waitq, + !fc->connected || atomic_read(&fc->num_waiting) == 0, + HZ) == 0) + touch_softlockup_watchdog(); +} + /* * Abort all requests. * diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index e62c9ada07bcc5..f0915fe551b99e 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1202,6 +1202,12 @@ void fuse_request_end(struct fuse_req *req); void fuse_abort_conn(struct fuse_conn *fc); void fuse_wait_aborted(struct fuse_conn *fc); +/** + * Flush all pending requests and wait for them. Takes an optional timeout + * in jiffies. + */ +void fuse_flush_requests(struct fuse_conn *fc, unsigned long timeout); + /** * Invalidate inode attributes */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 9912c3a9b559a1..52d0b027e88741 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -2024,6 +2024,7 @@ void fuse_conn_destroy(struct fuse_mount *fm) { struct fuse_conn *fc = fm->fc; + fuse_flush_requests(fc, 30 * HZ); if (fc->destroy) fuse_send_destroy(fm); From 80549bccbf92aa0363ea809a3a3b0424c371de64 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Fri, 18 Jul 2025 17:24:42 +0200 Subject: [PATCH 07/11] fuse: Refactor io-uring bg queue flush and queue abort This is a preparation to allow fuse-io-uring bg queue flush from flush_bg_queue() This does two function renames: fuse_uring_flush_bg -> fuse_uring_flush_queue_bg fuse_uring_abort_end_requests -> fuse_uring_flush_bg And fuse_uring_abort_end_queue_requests() is moved to fuse_uring_stop_queues(). Signed-off-by: Bernd Schubert (cherry picked from commit e70ef24251116bc7f591a9a856c371549cd5ae77) --- fs/fuse/dev_uring.c | 14 +++++++------- fs/fuse/dev_uring_i.h | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index a771bb94b5fb52..5facaddd61d1f8 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -65,7 +65,7 @@ static struct fuse_ring_ent *uring_cmd_to_ring_ent(struct io_uring_cmd *cmd) return pdu->ent; } -static void fuse_uring_flush_bg(struct fuse_ring_queue *queue) +static void fuse_uring_flush_queue_bg(struct fuse_ring_queue *queue) { struct fuse_ring *ring = queue->ring; struct fuse_conn *fc = ring->fc; @@ -106,7 +106,7 @@ static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req, if (test_bit(FR_BACKGROUND, &req->flags)) { queue->active_background--; spin_lock(&fc->bg_lock); - fuse_uring_flush_bg(queue); + fuse_uring_flush_queue_bg(queue); spin_unlock(&fc->bg_lock); } @@ -135,11 +135,11 @@ static void fuse_uring_abort_end_queue_requests(struct fuse_ring_queue *queue) fuse_dev_end_requests(&req_list); } -void fuse_uring_abort_end_requests(struct fuse_ring *ring) +void fuse_uring_flush_bg(struct fuse_conn *fc) { int qid; struct fuse_ring_queue *queue; - struct fuse_conn *fc = ring->fc; + struct fuse_ring *ring = fc->ring; for (qid = 0; qid < ring->nr_queues; qid++) { queue = READ_ONCE(ring->queues[qid]); @@ -151,10 +151,9 @@ void fuse_uring_abort_end_requests(struct fuse_ring *ring) WARN_ON_ONCE(ring->fc->max_background != UINT_MAX); spin_lock(&queue->lock); spin_lock(&fc->bg_lock); - fuse_uring_flush_bg(queue); + fuse_uring_flush_queue_bg(queue); spin_unlock(&fc->bg_lock); spin_unlock(&queue->lock); - fuse_uring_abort_end_queue_requests(queue); } } @@ -472,6 +471,7 @@ void fuse_uring_stop_queues(struct fuse_ring *ring) if (!queue) continue; + fuse_uring_abort_end_queue_requests(queue); 
fuse_uring_teardown_entries(queue);
 	}
 
@@ -1493,7 +1493,7 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req)
 	fc->num_background++;
 	if (fc->num_background == fc->max_background)
 		fc->blocked = 1;
-	fuse_uring_flush_bg(queue);
+	fuse_uring_flush_queue_bg(queue);
 	spin_unlock(&fc->bg_lock);
 
 	/*
diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h
index b6e67bd24d6538..1abcc4ae5a78ab 100644
--- a/fs/fuse/dev_uring_i.h
+++ b/fs/fuse/dev_uring_i.h
@@ -142,7 +142,7 @@ struct fuse_ring {
 bool fuse_uring_enabled(void);
 void fuse_uring_destruct(struct fuse_conn *fc);
 void fuse_uring_stop_queues(struct fuse_ring *ring);
-void fuse_uring_abort_end_requests(struct fuse_ring *ring);
+void fuse_uring_flush_bg(struct fuse_conn *fc);
 int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
 void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req);
 bool fuse_uring_queue_bq_req(struct fuse_req *req);
@@ -156,7 +156,7 @@ static inline void fuse_uring_abort(struct fuse_conn *fc)
 		return;
 
 	if (atomic_read(&ring->queue_refs) > 0) {
-		fuse_uring_abort_end_requests(ring);
+		fuse_uring_flush_bg(fc);
 		fuse_uring_stop_queues(ring);
 	}
 }

From 6fcf9b412f78fc464032f5601e2f8011a475dbc4 Mon Sep 17 00:00:00 2001
From: Bernd Schubert
Date: Fri, 18 Jul 2025 18:24:41 +0200
Subject: [PATCH 08/11] fuse: Flush the io-uring bg queue from
 fuse_uring_flush_bg

This provides a single API to flush background requests, for example
when the bg queue gets flushed before the rest of fuse_conn_destroy()
runs.

Signed-off-by: Bernd Schubert
(cherry picked from commit fc4120cc58e7fbcb541bf2e9a72781b569561912)
---
 fs/fuse/dev.c         | 2 ++
 fs/fuse/dev_uring.c   | 3 +++
 fs/fuse/dev_uring_i.h | 9 +++++++++
 3 files changed, 14 insertions(+)

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 7ddb08b96d2a25..9add15a87dd2c1 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -2170,6 +2170,8 @@ void fuse_flush_requests(struct fuse_conn *fc, unsigned long timeout)
 	spin_unlock(&fc->bg_lock);
 	spin_unlock(&fc->lock);
 
+	fuse_uring_flush_bg(fc);
+
 	/*
 	 * Wait 30s for all the events to complete or abort. Touch the
diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index 5facaddd61d1f8..2aa20707f40b72 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -141,6 +141,9 @@ void fuse_uring_flush_bg(struct fuse_conn *fc)
 	struct fuse_ring_queue *queue;
 	struct fuse_ring *ring = fc->ring;
 
+	if (!ring)
+		return;
+
 	for (qid = 0; qid < ring->nr_queues; qid++) {
 		queue = READ_ONCE(ring->queues[qid]);
 		if (!queue)
diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h
index 1abcc4ae5a78ab..49cb8c961b11c2 100644
--- a/fs/fuse/dev_uring_i.h
+++ b/fs/fuse/dev_uring_i.h
@@ -204,6 +204,15 @@ static inline bool fuse_uring_remove_pending_req(struct fuse_req *req)
 	return false;
 }
 
+static inline bool fuse_uring_request_expired(struct fuse_conn *fc)
+{
+	return false;
+}
+
+static inline void fuse_uring_flush_bg(struct fuse_conn *fc)
+{
+}
+
 #endif /* CONFIG_FUSE_IO_URING */
 
 #endif /* _FS_FUSE_DEV_URING_I_H */

From a338501965ce2717d3544caf05c5e128af0f8a9f Mon Sep 17 00:00:00 2001
From: Horst Birthelmer
Date: Mon, 7 Jul 2025 14:28:41 +0200
Subject: [PATCH 09/11] fuse: avoid tmp copying of data for writeback pages

When writing back pages with writeback caching enabled, the code used
to copy the data into temporary pages to avoid a deadlock during memory
reclaim.

This is an adaptation and backport of a patch by Joanne Koong
<joannelkoong@gmail.com>. 
Since we use pinned memory with io_uring we don't need the temporary copies and we don't use the AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM flag in the pagemap. Link: https://www.spinics.net/lists/linux-mm/msg407405.html Signed-off-by: Horst Birthelmer (cherry picked from commit 114c4df06d489bd0fc3bbc318073da76597401c4) --- fs/fuse/file.c | 351 ++++------------------------------------------- fs/fuse/fuse_i.h | 3 - 2 files changed, 23 insertions(+), 331 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 358c7e52665daf..d0f3a9be21afd4 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -416,83 +416,20 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) struct fuse_writepage_args { struct fuse_io_args ia; - struct rb_node writepages_entry; struct list_head queue_entry; - struct fuse_writepage_args *next; struct inode *inode; struct fuse_sync_bucket *bucket; }; -static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, - pgoff_t idx_from, pgoff_t idx_to) -{ - struct rb_node *n; - - n = fi->writepages.rb_node; - - while (n) { - struct fuse_writepage_args *wpa; - pgoff_t curr_index; - - wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry); - WARN_ON(get_fuse_inode(wpa->inode) != fi); - curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT; - if (idx_from >= curr_index + wpa->ia.ap.num_pages) - n = n->rb_right; - else if (idx_to < curr_index) - n = n->rb_left; - else - return wpa; - } - return NULL; -} - -/* - * Check if any page in a range is under writeback - */ -static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, - pgoff_t idx_to) -{ - struct fuse_inode *fi = get_fuse_inode(inode); - bool found; - - if (RB_EMPTY_ROOT(&fi->writepages)) - return false; - - spin_lock(&fi->lock); - found = fuse_find_writeback(fi, idx_from, idx_to); - spin_unlock(&fi->lock); - - return found; -} - -static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) -{ - return fuse_range_is_writeback(inode, index, index); -} - -/* - * Wait for page writeback to be completed. - * - * Since fuse doesn't rely on the VM writeback tracking, this has to - * use some other means. - */ static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) { - struct fuse_inode *fi = get_fuse_inode(inode); - - wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index)); + struct page *page = find_get_page(inode->i_mapping, index); + if (page) { + wait_on_page_writeback(page); + put_page(page); + } } -/* - * Wait for all pending writepages on the inode to finish. - * - * This is currently done by blocking further writes with FUSE_NOWRITE - * and waiting for all sent writes to complete. - * - * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage - * could conflict with truncation. - */ static void fuse_sync_writes(struct inode *inode) { fuse_set_nowrite(inode); @@ -518,10 +455,6 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (err) return err; - inode_lock(inode); - fuse_sync_writes(inode); - inode_unlock(inode); - err = filemap_check_errors(file->f_mapping); if (err) return err; @@ -864,13 +797,6 @@ static int fuse_do_readpage(struct file *file, struct page *page) ssize_t res; u64 attr_ver; - /* - * Page writeback can extend beyond the lifetime of the - * page-cache page, so make sure we read a properly synced - * page. 
- */ - fuse_wait_on_page_writeback(inode, page->index); - attr_ver = fuse_get_attr_version(fm->fc); /* Don't overflow end offset */ @@ -1570,7 +1496,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, return res; } } - if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { + if (!cuse && filemap_range_has_writeback(mapping, pos, pos + count - 1)) { if (!write) inode_lock(inode); fuse_sync_writes(inode); @@ -1734,14 +1660,10 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) static void fuse_writepage_free(struct fuse_writepage_args *wpa) { struct fuse_args_pages *ap = &wpa->ia.ap; - int i; if (wpa->bucket) fuse_sync_bucket_dec(wpa->bucket); - for (i = 0; i < ap->num_pages; i++) - __free_page(ap->pages[i]); - if (wpa->ia.ff) fuse_file_put(wpa->ia.ff, false); @@ -1758,11 +1680,12 @@ static void fuse_writepage_finish(struct fuse_mount *fm, struct backing_dev_info *bdi = inode_to_bdi(inode); int i; - for (i = 0; i < ap->num_pages; i++) { - dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP); - wb_writeout_inc(&bdi->wb); - } + for (i = 0; i < ap->num_pages; i++) { + dec_wb_stat(&bdi->wb, WB_WRITEBACK); + end_page_writeback(ap->pages[i]); + wb_writeout_inc(&bdi->wb); + } + wake_up(&fi->page_waitq); } @@ -1772,7 +1695,6 @@ static void fuse_send_writepage(struct fuse_mount *fm, __releases(fi->lock) __acquires(fi->lock) { - struct fuse_writepage_args *aux, *next; struct fuse_inode *fi = get_fuse_inode(wpa->inode); struct fuse_write_in *inarg = &wpa->ia.write.in; struct fuse_args *args = &wpa->ia.ap.args; @@ -1808,23 +1730,8 @@ __acquires(fi->lock) out_free: fi->writectr--; - rb_erase(&wpa->writepages_entry, &fi->writepages); fuse_writepage_finish(fm, wpa); spin_unlock(&fi->lock); - - /* After rb_erase() aux request list is private */ - for (aux = wpa->next; aux; aux = next) { - struct backing_dev_info *bdi = inode_to_bdi(aux->inode); - - next = aux->next; - aux->next = NULL; - - dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_node_page_state(aux->ia.ap.pages[0], NR_WRITEBACK_TEMP); - wb_writeout_inc(&bdi->wb); - fuse_writepage_free(aux); - } - fuse_writepage_free(wpa); spin_lock(&fi->lock); } @@ -1852,43 +1759,6 @@ __acquires(fi->lock) } } -static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root, - struct fuse_writepage_args *wpa) -{ - pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT; - pgoff_t idx_to = idx_from + wpa->ia.ap.num_pages - 1; - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - - WARN_ON(!wpa->ia.ap.num_pages); - while (*p) { - struct fuse_writepage_args *curr; - pgoff_t curr_index; - - parent = *p; - curr = rb_entry(parent, struct fuse_writepage_args, - writepages_entry); - WARN_ON(curr->inode != wpa->inode); - curr_index = curr->ia.write.in.offset >> PAGE_SHIFT; - - if (idx_from >= curr_index + curr->ia.ap.num_pages) - p = &(*p)->rb_right; - else if (idx_to < curr_index) - p = &(*p)->rb_left; - else - return curr; - } - - rb_link_node(&wpa->writepages_entry, parent, p); - rb_insert_color(&wpa->writepages_entry, root); - return NULL; -} - -static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa) -{ - WARN_ON(fuse_insert_writeback(root, wpa)); -} - static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, int error) { @@ -1908,42 +1778,6 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, if (!fc->writeback_cache) fuse_invalidate_attr_mask(inode, FUSE_STATX_MODIFY); 
spin_lock(&fi->lock); - rb_erase(&wpa->writepages_entry, &fi->writepages); - while (wpa->next) { - struct fuse_mount *fm = get_fuse_mount(inode); - struct fuse_write_in *inarg = &wpa->ia.write.in; - struct fuse_writepage_args *next = wpa->next; - - wpa->next = next->next; - next->next = NULL; - next->ia.ff = fuse_file_get(wpa->ia.ff); - tree_insert(&fi->writepages, next); - - /* - * Skip fuse_flush_writepages() to make it easy to crop requests - * based on primary request size. - * - * 1st case (trivial): there are no concurrent activities using - * fuse_set/release_nowrite. Then we're on safe side because - * fuse_flush_writepages() would call fuse_send_writepage() - * anyway. - * - * 2nd case: someone called fuse_set_nowrite and it is waiting - * now for completion of all in-flight requests. This happens - * rarely and no more than once per page, so this should be - * okay. - * - * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle - * of fuse_set_nowrite..fuse_release_nowrite section. The fact - * that fuse_set_nowrite returned implies that all in-flight - * requests were completed along with all of their secondary - * requests. Further primary requests are blocked by negative - * writectr. Hence there cannot be any in-flight requests and - * no invocations of fuse_writepage_end() while we're in - * fuse_set_nowrite..fuse_release_nowrite section. - */ - fuse_send_writepage(fm, next, inarg->offset + inarg->size); - } fi->writectr--; fuse_writepage_finish(fm, wpa); spin_unlock(&fi->lock); @@ -2037,8 +1871,7 @@ static int fuse_writepage_locked(struct page *page) struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_writepage_args *wpa; struct fuse_args_pages *ap; - struct page *tmp_page; - int error = -ENOMEM; + int error = -EIO; set_page_writeback(page); @@ -2047,44 +1880,32 @@ static int fuse_writepage_locked(struct page *page) goto err; ap = &wpa->ia.ap; - tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); - if (!tmp_page) - goto err_free; - - error = -EIO; wpa->ia.ff = fuse_write_file_get(fi); if (!wpa->ia.ff) - goto err_nofile; + goto err_free; fuse_writepage_add_to_bucket(fc, wpa); fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0); - copy_highpage(tmp_page, page); wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; - wpa->next = NULL; + ap->args.in_pages = true; ap->num_pages = 1; - ap->pages[0] = tmp_page; + ap->pages[0] = page; ap->descs[0].offset = 0; ap->descs[0].length = PAGE_SIZE; ap->args.end = fuse_writepage_end; wpa->inode = inode; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); - inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); spin_lock(&fi->lock); - tree_insert(&fi->writepages, wpa); list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); spin_unlock(&fi->lock); - end_page_writeback(page); - return 0; -err_nofile: - __free_page(tmp_page); err_free: kfree(wpa); err: @@ -2098,19 +1919,6 @@ static int fuse_writepage(struct page *page, struct writeback_control *wbc) struct fuse_conn *fc = get_fuse_conn(page->mapping->host); int err; - if (fuse_page_is_writeback(page->mapping->host, page->index)) { - /* - * ->writepages() should be called for sync() and friends. 
We - * should only get here on direct reclaim and then we are - * allowed to skip a page which is already in flight - */ - WARN_ON(wbc->sync_mode == WB_SYNC_ALL); - - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; - } - if (wbc->sync_mode == WB_SYNC_NONE && fc->num_background >= fc->congestion_threshold) return AOP_WRITEPAGE_ACTIVATE; @@ -2125,7 +1933,6 @@ struct fuse_fill_wb_data { struct fuse_writepage_args *wpa; struct fuse_file *ff; struct inode *inode; - struct page **orig_pages; unsigned int max_pages; }; @@ -2160,74 +1967,14 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data) struct fuse_writepage_args *wpa = data->wpa; struct inode *inode = data->inode; struct fuse_inode *fi = get_fuse_inode(inode); - int num_pages = wpa->ia.ap.num_pages; - int i; wpa->ia.ff = fuse_file_get(data->ff); spin_lock(&fi->lock); list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); spin_unlock(&fi->lock); - - for (i = 0; i < num_pages; i++) - end_page_writeback(data->orig_pages[i]); } -/* - * Check under fi->lock if the page is under writeback, and insert it onto the - * rb_tree if not. Otherwise iterate auxiliary write requests, to see if there's - * one already added for a page at this offset. If there's none, then insert - * this new request onto the auxiliary list, otherwise reuse the existing one by - * swapping the new temp page with the old one. - */ -static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa, - struct page *page) -{ - struct fuse_inode *fi = get_fuse_inode(new_wpa->inode); - struct fuse_writepage_args *tmp; - struct fuse_writepage_args *old_wpa; - struct fuse_args_pages *new_ap = &new_wpa->ia.ap; - - WARN_ON(new_ap->num_pages != 0); - new_ap->num_pages = 1; - - spin_lock(&fi->lock); - old_wpa = fuse_insert_writeback(&fi->writepages, new_wpa); - if (!old_wpa) { - spin_unlock(&fi->lock); - return true; - } - - for (tmp = old_wpa->next; tmp; tmp = tmp->next) { - pgoff_t curr_index; - - WARN_ON(tmp->inode != new_wpa->inode); - curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT; - if (curr_index == page->index) { - WARN_ON(tmp->ia.ap.num_pages != 1); - swap(tmp->ia.ap.pages[0], new_ap->pages[0]); - break; - } - } - - if (!tmp) { - new_wpa->next = old_wpa->next; - old_wpa->next = new_wpa; - } - - spin_unlock(&fi->lock); - - if (tmp) { - struct backing_dev_info *bdi = inode_to_bdi(new_wpa->inode); - - dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_node_page_state(new_ap->pages[0], NR_WRITEBACK_TEMP); - wb_writeout_inc(&bdi->wb); - fuse_writepage_free(new_wpa); - } - - return false; -} static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page, struct fuse_args_pages *ap, @@ -2235,15 +1982,6 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page, { WARN_ON(!ap->num_pages); - /* - * Being under writeback is unlikely but possible. For example direct - * read to an mmaped fuse file will set the page dirty twice; once when - * the pages are faulted with get_user_pages(), and then after the read - * completed. - */ - if (fuse_page_is_writeback(data->inode, page->index)) - return true; - /* Reached max pages */ if (ap->num_pages == fc->max_pages) return true; @@ -2253,7 +1991,7 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page, return true; /* Discontinuity */ - if (data->orig_pages[ap->num_pages - 1]->index + 1 != page->index) + if (ap->pages[ap->num_pages - 1]->index + 1 != page->index) return true; /* Need to grow the pages array? 
@@ -2272,7 +2010,6 @@ static int fuse_writepages_fill(struct folio *folio,
 	struct inode *inode = data->inode;
 	struct fuse_inode *fi = get_fuse_inode(inode);
 	struct fuse_conn *fc = get_fuse_conn(inode);
-	struct page *tmp_page;
 	int err;
 
 	if (!data->ff) {
@@ -2287,31 +2024,11 @@ static int fuse_writepages_fill(struct folio *folio,
 		data->wpa = NULL;
 	}
 
-	err = -ENOMEM;
-	tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
-	if (!tmp_page)
-		goto out_unlock;
-
-	/*
-	 * The page must not be redirtied until the writeout is completed
-	 * (i.e. userspace has sent a reply to the write request). Otherwise
-	 * there could be more than one temporary page instance for each real
-	 * page.
-	 *
-	 * This is ensured by holding the page lock in page_mkwrite() while
-	 * checking fuse_page_is_writeback(). We already hold the page lock
-	 * since clear_page_dirty_for_io() and keep it held until we add the
-	 * request to the fi->writepages list and increment ap->num_pages.
-	 * After this fuse_page_is_writeback() will indicate that the page is
-	 * under writeback, so we can release the page lock.
-	 */
 	if (data->wpa == NULL) {
 		err = -ENOMEM;
 		wpa = fuse_writepage_args_alloc();
-		if (!wpa) {
-			__free_page(tmp_page);
+		if (!wpa)
 			goto out_unlock;
-		}
 
 		fuse_writepage_add_to_bucket(fc, wpa);
 		data->max_pages = 1;
@@ -2319,36 +2036,23 @@ static int fuse_writepages_fill(struct folio *folio,
 		ap = &wpa->ia.ap;
 		fuse_write_args_fill(&wpa->ia, data->ff, folio_pos(folio), 0);
 		wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
-		wpa->next = NULL;
+		wpa->inode = inode;
+		wpa->ia.ff = data->ff;
 		ap->args.in_pages = true;
 		ap->args.end = fuse_writepage_end;
 		ap->num_pages = 0;
-		wpa->inode = inode;
 	}
 	folio_start_writeback(folio);
-
-	copy_highpage(tmp_page, &folio->page);
-	ap->pages[ap->num_pages] = tmp_page;
 	ap->descs[ap->num_pages].offset = 0;
 	ap->descs[ap->num_pages].length = PAGE_SIZE;
-	data->orig_pages[ap->num_pages] = &folio->page;
+	ap->pages[ap->num_pages] = &folio->page;
+	ap->num_pages++;
 
 	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
-	inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
 
 	err = 0;
-	if (data->wpa) {
-		/*
-		 * Protected by fi->lock against concurrent access by
-		 * fuse_page_is_writeback().
-		 */
-		spin_lock(&fi->lock);
-		ap->num_pages++;
-		spin_unlock(&fi->lock);
-	} else if (fuse_writepage_add(wpa, &folio->page)) {
+	if (!data->wpa) {
 		data->wpa = wpa;
-	} else {
-		folio_end_writeback(folio);
 	}
 out_unlock:
 	folio_unlock(folio);
@@ -2376,13 +2080,6 @@ static int fuse_writepages(struct address_space *mapping,
 	data.wpa = NULL;
 	data.ff = NULL;
 
-	err = -ENOMEM;
-	data.orig_pages = kcalloc(fc->max_pages,
-				  sizeof(struct page *),
-				  GFP_NOFS);
-	if (!data.orig_pages)
-		goto out;
-
 	err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
 	if (data.wpa) {
 		WARN_ON(!data.wpa->ia.ap.num_pages);
@@ -2391,7 +2088,6 @@ static int fuse_writepages(struct address_space *mapping,
 	if (data.ff)
 		fuse_file_put(data.ff, false);
 
-	kfree(data.orig_pages);
 out:
 	return err;
 }
@@ -3411,7 +3107,6 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags)
 	fi->iocachectr = 0;
 	init_waitqueue_head(&fi->page_waitq);
 	init_waitqueue_head(&fi->direct_io_waitq);
-	fi->writepages = RB_ROOT;
 
 	if (IS_ENABLED(CONFIG_FUSE_DAX))
 		fuse_dax_inode_init(inode, flags);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index f0915fe551b99e..1af8e20c3e4a70 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -152,9 +152,6 @@ struct fuse_inode {
 	/* waitq for direct-io completion */
 	wait_queue_head_t direct_io_waitq;
 
-	/* List of writepage requestst (pending or sent) */
-	struct rb_root writepages;
-
 	/* dlm locked areas we have sent lock requests for */
 	struct fuse_dlm_cache dlm_locked_areas;
 };
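With the temporary writeback pages gone, the dirty page itself is attached
to the WRITE request and stays under writeback until the server replies,
so the fi->writepages rb-tree and the fuse_page_is_writeback() lookups
become unnecessary. As an illustration only (the helper name below is made
up; folio_wait_writeback() is the generic VFS primitive), waiting for a
page to become stable now reduces to:

	/*
	 * Illustrative sketch, not part of the patch: with no temporary
	 * page copy, a caller that must not race with an in-flight WRITE
	 * simply sleeps on the writeback bit, which the write completion
	 * path clears once the server has answered.
	 */
	static void fuse_example_wait_stable(struct folio *folio)
	{
		folio_wait_writeback(folio);
	}
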
From 5b678768eef868d0b9045035aa3857f5cb868e8b Mon Sep 17 00:00:00 2001
From: Horst Birthelmer
Date: Mon, 21 Jul 2025 15:54:09 +0200
Subject: [PATCH 10/11] fuse: fix unnecessary connection abort in dlm lock acquiring

When the fuse server responds to a dlm lock request with an error other
than ENOSYS, the returned lock size will most likely be zero. The kernel
then aborted the fuse connection, although the zero lock size is merely a
consequence of the failed request. Evaluate the error first and check the
returned lock size only when the request actually succeeded.

Signed-off-by: Horst Birthelmer

(cherry picked from commit 0bc2f9c39c52ad11a1753e5be376c424b06f43db)
---
 fs/fuse/fuse_dlm_cache.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/fs/fuse/fuse_dlm_cache.c b/fs/fuse/fuse_dlm_cache.c
index ea947f34a9f70a..a9cad2c1bd2174 100644
--- a/fs/fuse/fuse_dlm_cache.c
+++ b/fs/fuse/fuse_dlm_cache.c
@@ -533,19 +533,19 @@ void fuse_get_dlm_write_lock(struct file *file, loff_t offset,
 		return;
 	}
 
-	if (outarg.locksize < end - offset + 1) {
-		/* fuse server is seriously broken */
-		pr_warn("fuse: dlm lock request for %llu bytes returned %u bytes\n",
-			end - offset + 1, outarg.locksize);
-		fuse_abort_conn(fc);
-		return;
-	}
-
 	if (err)
 		return;
 	else
-		/* ignore any errors here, there is no way we can react appropriately */
-		fuse_dlm_lock_range(fi, offset,
+		if (outarg.locksize < end - offset + 1) {
+			/* fuse server is seriously broken */
+			pr_warn("fuse: dlm lock request for %llu bytes returned %u bytes\n",
+				end - offset + 1, outarg.locksize);
+			fuse_abort_conn(fc);
+			return;
+		} else {
+			/* ignore any errors here, there is no way we can react appropriately */
+			fuse_dlm_lock_range(fi, offset,
 				    offset + outarg.locksize - 1,
 				    FUSE_PAGE_LOCK_WRITE);
+		}
 }
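The reordering above boils down to the following control flow at the end
of fuse_get_dlm_write_lock() (a simplified sketch of the patched code,
with the surrounding setup omitted):

	if (err)
		return;		/* failed request: lock size is meaningless */

	if (outarg.locksize < end - offset + 1) {
		/* only a short lock from a successful reply is broken */
		pr_warn("fuse: dlm lock request for %llu bytes returned %u bytes\n",
			end - offset + 1, outarg.locksize);
		fuse_abort_conn(fc);
		return;
	}

	/* ignore any errors here, there is no way we can react appropriately */
	fuse_dlm_lock_range(fi, offset, offset + outarg.locksize - 1,
			    FUSE_PAGE_LOCK_WRITE);
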
From 3754b6e6eb4478846dfab32798e4a0c0ce8e8a47 Mon Sep 17 00:00:00 2001
From: Horst Birthelmer
Date: Mon, 21 Jul 2025 18:15:55 +0200
Subject: [PATCH 11/11] fuse: fix connection abort on mmap when fuse server returns ENOSYS

Check whether dlm is still enabled when interpreting the error returned
from the fuse server. After the server responds with ENOSYS, dlm support
is disabled and the returned lock size is meaningless, so it must not
trigger a connection abort.

Signed-off-by: Horst Birthelmer

(cherry picked from commit f6fbf7c7bfb976ae2a30b4d699770a13e699ff04)
---
 fs/fuse/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index d0f3a9be21afd4..50e5ece10850a6 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -2240,7 +2240,7 @@ static int fuse_get_page_mkwrite_lock(struct file *file, loff_t offset, size_t l
 		err = 0;
 	}
 
-	if (!err && outarg.locksize < length) {
+	if (!err && fc->dlm && outarg.locksize < length) {
 		/* fuse server is seriously broken */
 		pr_warn("fuse: dlm lock request for %lu bytes returned %u bytes\n",
			length, outarg.locksize);
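The added fc->dlm test works together with the ENOSYS handling just above
the hunk: when the server does not implement DLM_LOCK, err is reset to
zero and dlm support is disabled for the connection, so outarg.locksize
legitimately remains zero. A condensed sketch of the intended logic (the
ENOSYS branch is inferred from the surrounding context and the feature
auto-disable described in patch 01; it is not part of the hunk):

	if (err == -ENOSYS) {
		fc->dlm = 0;	/* feature disabled after first failed use */
		err = 0;	/* unsupported opcode is not a hard error */
	}

	/* with fc->dlm cleared, a zero lock size no longer looks broken */
	if (!err && fc->dlm && outarg.locksize < length) {
		/* fuse server is seriously broken */
		pr_warn("fuse: dlm lock request for %lu bytes returned %u bytes\n",
			length, outarg.locksize);
		/* connection abort follows here, as in the previous patch */
	}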