diff --git a/Documentation/filesystems/fiemap.rst b/Documentation/filesystems/fiemap.rst
index 93fc96f760aa66..fd0911e65b43fe 100644
--- a/Documentation/filesystems/fiemap.rst
+++ b/Documentation/filesystems/fiemap.rst
@@ -80,14 +80,24 @@ Each extent is described by a single fiemap_extent structure as returned in
 fm_extents::

     struct fiemap_extent {
-            __u64 fe_logical;  /* logical offset in bytes for the start of
-                                * the extent */
-            __u64 fe_physical; /* physical offset in bytes for the start
-                                * of the extent */
-            __u64 fe_length;   /* length in bytes for the extent */
-            __u64 fe_reserved64[2];
-            __u32 fe_flags;    /* FIEMAP_EXTENT_* flags for this extent */
-            __u32 fe_reserved[3];
+            /*
+             * logical offset in bytes for the start of
+             * the extent from the beginning of the file
+             */
+            __u64 fe_logical;
+            /*
+             * physical offset in bytes for the start
+             * of the extent from the beginning of the disk
+             */
+            __u64 fe_physical;
+            /* length in bytes for this extent */
+            __u64 fe_length;
+            /* physical length in bytes for this extent */
+            __u64 fe_physical_length;
+            __u64 fe_reserved64[1];
+            /* FIEMAP_EXTENT_* flags for this extent */
+            __u32 fe_flags;
+            __u32 fe_reserved[3];
     };

 All offsets and lengths are in bytes and mirror those on disk. It is valid
@@ -224,7 +234,8 @@ For each extent in the request range, the file system should call
 the helper function, fiemap_fill_next_extent()::

   int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical,
-                              u64 phys, u64 len, u32 flags, u32 dev);
+                              u64 phys, u64 log_len, u64 phys_len, u32 flags,
+                              u32 dev);

 fiemap_fill_next_extent() will use the passed values to populate the
 next free extent in the fm_extents array. 'General' extent flags will
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 77ae65542db916..e6dc2e7260ef14 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -859,7 +859,9 @@ static int bch2_fill_extent(struct bch_fs *c,
                        ret = fiemap_fill_next_extent(info,
                                                bkey_start_offset(k.k) << 9,
                                                offset << 9,
-                                               k.k->size << 9, flags|flags2);
+                                               k.k->size << 9,
+                                               k.k->size << 9,
+                                               flags|flags2);
                        if (ret)
                                return ret;
                }
@@ -869,12 +871,14 @@ static int bch2_fill_extent(struct bch_fs *c,
                return fiemap_fill_next_extent(info,
                                               bkey_start_offset(k.k) << 9,
                                               0, k.k->size << 9,
+                                              k.k->size << 9,
                                               flags|
                                               FIEMAP_EXTENT_DATA_INLINE);
        } else if (k.k->type == KEY_TYPE_reservation) {
                return fiemap_fill_next_extent(info,
                                               bkey_start_offset(k.k) << 9,
                                               0, k.k->size << 9,
+                                              k.k->size << 9,
                                               flags|
                                               FIEMAP_EXTENT_DELALLOC|
                                               FIEMAP_EXTENT_UNWRITTEN);
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 4fb925e8c981d8..7fe030d91e4f0b 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
+# misc-next marker

 config BTRFS_FS
        tristate "Btrfs filesystem support"
diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c
index 1925a0919ca62f..79026917db19dc 100644
--- a/fs/btrfs/accessors.c
+++ b/fs/btrfs/accessors.c
@@ -5,7 +5,8 @@

 #include
 #include "messages.h"
-#include "ctree.h"
+#include "extent_io.h"
+#include "fs.h"
 #include "accessors.h"

 static bool check_setget_bounds(const struct extent_buffer *eb,
@@ -63,8 +64,8 @@ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token,  \
        const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \
        const unsigned long oil = get_eb_offset_in_folio(token->eb,    \
                                                         member_offset);\
-       const int unit_size = folio_size(token->eb->folios[0]);        \
-       const int unit_shift = folio_shift(token->eb->folios[0]);      \
+       const int unit_size = token->eb->folio_size;                   \
+       const int unit_shift = token->eb->folio_shift;                 \
        const int size = sizeof(u##bits);                              \
        u8 lebytes[sizeof(u##bits)];                                   \
        const int part = unit_size - oil;                              \
@@ -94,7 +95,7 @@ u##bits btrfs_get_##bits(const struct extent_buffer *eb,       \
        const unsigned long idx = get_eb_folio_index(eb, member_offset);\
        const unsigned long oil = get_eb_offset_in_folio(eb,           \
                                                         member_offset);\
-       const int unit_size = folio_size(eb->folios[0]);               \
+       const int unit_size = eb->folio_size;                          \
        char *kaddr = folio_address(eb->folios[idx]);                  \
        const int size = sizeof(u##bits);                              \
        const int part = unit_size - oil;                              \
@@ -117,8 +118,8 @@ void btrfs_set_token_##bits(struct btrfs_map_token *token,    \
        const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \
        const unsigned long oil = get_eb_offset_in_folio(token->eb,    \
                                                         member_offset);\
-       const int unit_size = folio_size(token->eb->folios[0]);        \
-       const int unit_shift = folio_shift(token->eb->folios[0]);      \
+       const int unit_size = token->eb->folio_size;                   \
+       const int unit_shift = token->eb->folio_shift;                 \
        const int size = sizeof(u##bits);                              \
        u8 lebytes[sizeof(u##bits)];                                   \
        const int part = unit_size - oil;                              \
@@ -151,7 +152,7 @@ void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr,      \
        const unsigned long idx = get_eb_folio_index(eb, member_offset);\
        const unsigned long oil = get_eb_offset_in_folio(eb,           \
                                                         member_offset);\
-       const int unit_size = folio_size(eb->folios[0]);               \
+       const int unit_size = eb->folio_size;                          \
        char *kaddr = folio_address(eb->folios[idx]);                  \
        const int size = sizeof(u##bits);                              \
        const int part = unit_size - oil;                              \
diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
index ed7aa32972add9..6fce3e8d3dac52 100644
--- a/fs/btrfs/accessors.h
+++ b/fs/btrfs/accessors.h
@@ -3,8 +3,17 @@
 #ifndef BTRFS_ACCESSORS_H
 #define BTRFS_ACCESSORS_H

-#include
 #include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+struct extent_buffer;

 struct btrfs_map_token {
        struct extent_buffer *eb;
@@ -844,45 +853,6 @@ static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
        write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
 }

-static inline void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
-                                                 const struct btrfs_disk_balance_args *disk)
-{
-       memset(cpu, 0, sizeof(*cpu));
-
-       cpu->profiles = le64_to_cpu(disk->profiles);
-       cpu->usage = le64_to_cpu(disk->usage);
-       cpu->devid = le64_to_cpu(disk->devid);
-       cpu->pstart = le64_to_cpu(disk->pstart);
-       cpu->pend = le64_to_cpu(disk->pend);
-       cpu->vstart = le64_to_cpu(disk->vstart);
-       cpu->vend = le64_to_cpu(disk->vend);
-       cpu->target = le64_to_cpu(disk->target);
-       cpu->flags = le64_to_cpu(disk->flags);
-       cpu->limit = le64_to_cpu(disk->limit);
-       cpu->stripes_min = le32_to_cpu(disk->stripes_min);
-       cpu->stripes_max = le32_to_cpu(disk->stripes_max);
-}
-
-static inline void btrfs_cpu_balance_args_to_disk(
-                               struct btrfs_disk_balance_args *disk,
-                               const struct btrfs_balance_args *cpu)
-{
-       memset(disk, 0, sizeof(*disk));
-
-       disk->profiles = cpu_to_le64(cpu->profiles);
-       disk->usage = cpu_to_le64(cpu->usage);
-       disk->devid = cpu_to_le64(cpu->devid);
-       disk->pstart = cpu_to_le64(cpu->pstart);
-       disk->pend = cpu_to_le64(cpu->pend);
-       disk->vstart = cpu_to_le64(cpu->vstart);
-       disk->vend = cpu_to_le64(cpu->vend);
-       disk->target = cpu_to_le64(cpu->target);
-       disk->flags = cpu_to_le64(cpu->flags);
-       disk->limit = cpu_to_le64(cpu->limit);
-       disk->stripes_min = cpu_to_le32(cpu->stripes_min);
-       disk->stripes_max = cpu_to_le32(cpu->stripes_max);
-}
-
 /* struct btrfs_super_block */
 BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
 BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 7427449a04a3f2..e0ba00d64ea0bf 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -12,7 +12,6 @@
 #include
 #include
 #include "ctree.h"
-#include "btrfs_inode.h"
 #include "xattr.h"
 #include "acl.h"

diff --git a/fs/btrfs/acl.h b/fs/btrfs/acl.h
index a270e71ec05f91..48b9ddae4a46a7 100644
--- a/fs/btrfs/acl.h
+++ b/fs/btrfs/acl.h
@@ -3,8 +3,15 @@
 #ifndef BTRFS_ACL_H
 #define BTRFS_ACL_H

+struct posix_acl;
+struct inode;
+struct btrfs_trans_handle;
+
 #ifdef CONFIG_BTRFS_FS_POSIX_ACL

+struct mnt_idmap;
+struct dentry;
+
 struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu);
 int btrfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
                  struct posix_acl *acl, int type);
@@ -13,6 +20,10 @@ int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,

 #else

+#include
+
+struct btrfs_trans_handle;
+
 #define btrfs_get_acl NULL
 #define btrfs_set_acl NULL
 static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 9e261aac671e62..361a866c19955a 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -11,7 +11,6 @@
 #include
 #include
 #include "async-thread.h"
-#include "ctree.h"

 enum {
        WORK_DONE_BIT,
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 62b8a0d5789865..04c2f3175828bb 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -7,11 +7,14 @@
 #ifndef BTRFS_ASYNC_THREAD_H
 #define BTRFS_ASYNC_THREAD_H

+#include
 #include
+#include

 struct btrfs_fs_info;
 struct btrfs_workqueue;
 struct btrfs_work;
+
 typedef void (*btrfs_func_t)(struct btrfs_work *arg);
 typedef void (*btrfs_ordered_func_t)(struct btrfs_work *arg, bool);

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index beed7e459dabde..c1e6a5bbeeaffe 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -198,10 +198,7 @@ static struct kmem_cache *btrfs_prelim_ref_cache;
 int __init btrfs_prelim_ref_init(void)
 {
        btrfs_prelim_ref_cache = kmem_cache_create("btrfs_prelim_ref",
-                                       sizeof(struct prelim_ref),
-                                       0,
-                                       SLAB_MEM_SPREAD,
-                                       NULL);
+                                       sizeof(struct prelim_ref), 0, 0, NULL);
        if (!btrfs_prelim_ref_cache)
                return -ENOMEM;
        return 0;
@@ -1036,8 +1033,6 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx,
        slot = path->slots[0];
        item_size = btrfs_item_size(leaf, slot);
-       BUG_ON(item_size < sizeof(*ei));
-
        ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);

        if (ctx->check_extent_item) {
@@ -1435,8 +1430,10 @@ static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx,
        if (ret < 0)
                goto out;
        if (ret == 0) {
-               /* This shouldn't happen, indicates a bug or fs corruption. */
-               ASSERT(ret != 0);
+               /*
+                * Key with offset -1 found, there would have to exist an extent
+                * item with such offset, but this is out of the valid range.
+                */
                ret = -EUCLEAN;
                goto out;
        }
@@ -2225,6 +2222,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
        if (ret < 0)
                return ret;
+       if (ret == 0) {
+               /*
+                * Key with offset -1 found, there would have to exist an extent
+                * item with such offset, but this is out of the valid range.
+                */
+               return -EUCLEAN;
+       }

        ret = btrfs_previous_extent_item(extent_root, path, 0);
        if (ret) {
@@ -2247,7 +2251,6 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,

        eb = path->nodes[0];
        item_size = btrfs_item_size(eb, path->slots[0]);
-       BUG_ON(item_size < sizeof(*ei));

        ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
        flags = btrfs_extent_flags(eb, ei);
@@ -2850,6 +2853,16 @@ struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_inf
        return ret;
 }

+static void btrfs_backref_iter_release(struct btrfs_backref_iter *iter)
+{
+       iter->bytenr = 0;
+       iter->item_ptr = 0;
+       iter->cur_ptr = 0;
+       iter->end_ptr = 0;
+       btrfs_release_path(iter->path);
+       memset(&iter->cur_key, 0, sizeof(iter->cur_key));
+}
+
 int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
 {
        struct btrfs_fs_info *fs_info = iter->fs_info;
@@ -2868,6 +2881,10 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
        if (ret < 0)
                return ret;
        if (ret == 0) {
+               /*
+                * Key with offset -1 found, there would have to exist an extent
+                * item with such offset, but this is out of the valid range.
+                */
                ret = -EUCLEAN;
                goto release;
        }
@@ -2938,6 +2955,14 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
        return ret;
 }

+static bool btrfs_backref_iter_is_inline_ref(struct btrfs_backref_iter *iter)
+{
+       if (iter->cur_key.type == BTRFS_EXTENT_ITEM_KEY ||
+           iter->cur_key.type == BTRFS_METADATA_ITEM_KEY)
+               return true;
+       return false;
+}
+
 /*
  * Go to the next backref item of current bytenr, can be either inlined or
  * keyed.
@@ -2950,7 +2975,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
  */
 int btrfs_backref_iter_next(struct btrfs_backref_iter *iter)
 {
-       struct extent_buffer *eb = btrfs_backref_get_eb(iter);
+       struct extent_buffer *eb = iter->path->nodes[0];
        struct btrfs_root *extent_root;
        struct btrfs_path *path = iter->path;
        struct btrfs_extent_inline_ref *iref;
@@ -3038,6 +3063,19 @@ struct btrfs_backref_node *btrfs_backref_alloc_node(
        return node;
 }

+void btrfs_backref_free_node(struct btrfs_backref_cache *cache,
+                            struct btrfs_backref_node *node)
+{
+       if (node) {
+               ASSERT(list_empty(&node->list));
+               ASSERT(list_empty(&node->lower));
+               ASSERT(node->eb == NULL);
+               cache->nr_nodes--;
+               btrfs_put_root(node->root);
+               kfree(node);
+       }
+}
+
 struct btrfs_backref_edge *btrfs_backref_alloc_edge(
                struct btrfs_backref_cache *cache)
 {
@@ -3049,6 +3087,52 @@ struct btrfs_backref_edge *btrfs_backref_alloc_edge(
        return edge;
 }

+void btrfs_backref_free_edge(struct btrfs_backref_cache *cache,
+                            struct btrfs_backref_edge *edge)
+{
+       if (edge) {
+               cache->nr_edges--;
+               kfree(edge);
+       }
+}
+
+void btrfs_backref_unlock_node_buffer(struct btrfs_backref_node *node)
+{
+       if (node->locked) {
+               btrfs_tree_unlock(node->eb);
+               node->locked = 0;
+       }
+}
+
+void btrfs_backref_drop_node_buffer(struct btrfs_backref_node *node)
+{
+       if (node->eb) {
+               btrfs_backref_unlock_node_buffer(node);
+               free_extent_buffer(node->eb);
+               node->eb = NULL;
+       }
+}
+
+/*
+ * Drop the backref node from cache without cleaning up its children
+ * edges.
+ *
+ * This can only be called on node without parent edges.
+ * The children edges are still kept as is.
+ */
+void btrfs_backref_drop_node(struct btrfs_backref_cache *tree,
+                            struct btrfs_backref_node *node)
+{
+       ASSERT(list_empty(&node->upper));
+
+       btrfs_backref_drop_node_buffer(node);
+       list_del_init(&node->list);
+       list_del_init(&node->lower);
+       if (!RB_EMPTY_NODE(&node->rb_node))
+               rb_erase(&node->rb_node, &tree->rb_root);
+       btrfs_backref_free_node(tree, node);
+}
+
 /*
  * Drop the backref node from cache, also cleaning up all its
  * upper edges and any uncached nodes in the path.
@@ -3120,6 +3204,19 @@ void btrfs_backref_release_cache(struct btrfs_backref_cache *cache)
        ASSERT(!cache->nr_edges);
 }

+void btrfs_backref_link_edge(struct btrfs_backref_edge *edge,
+                            struct btrfs_backref_node *lower,
+                            struct btrfs_backref_node *upper,
+                            int link_which)
+{
+       ASSERT(upper && lower && upper->level == lower->level + 1);
+       edge->node[LOWER] = lower;
+       edge->node[UPPER] = upper;
+       if (link_which & LINK_LOWER)
+               list_add_tail(&edge->list[LOWER], &lower->upper);
+       if (link_which & LINK_UPPER)
+               list_add_tail(&edge->list[UPPER], &upper->lower);
+}
 /*
  * Handle direct tree backref
  *
@@ -3428,7 +3525,7 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans,
                int type;

                cond_resched();
-               eb = btrfs_backref_get_eb(iter);
+               eb = iter->path->nodes[0];

                key.objectid = iter->bytenr;
                if (btrfs_backref_iter_is_inline_ref(iter)) {
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index ab4ca0eda60557..e8c22cccb5c132 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -6,11 +6,23 @@
 #ifndef BTRFS_BACKREF_H
 #define BTRFS_BACKREF_H

-#include
+#include
+#include
+#include
+#include
+#include
+#include
 #include "messages.h"
-#include "ulist.h"
+#include "locking.h"
 #include "disk-io.h"
 #include "extent_io.h"
+#include "ctree.h"
+
+struct extent_inode_elem;
+struct ulist;
+struct btrfs_extent_item;
+struct btrfs_trans_handle;
+struct btrfs_fs_info;

 /*
  * Used by implementations of iterate_extent_inodes_t (see definition below) to
@@ -271,22 +283,6 @@ struct btrfs_backref_iter {

 struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info);

-static inline void btrfs_backref_iter_free(struct btrfs_backref_iter *iter)
-{
-       if (!iter)
-               return;
-       btrfs_free_path(iter->path);
-       kfree(iter);
-}
-
-static inline struct extent_buffer *btrfs_backref_get_eb(
-               struct btrfs_backref_iter *iter)
-{
-       if (!iter)
-               return NULL;
-       return iter->path->nodes[0];
-}
-
 /*
  * For metadata with EXTENT_ITEM key (non-skinny) case, the first inline data
  * is btrfs_tree_block_info, without a btrfs_extent_inline_ref header.
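
For orientation, the backref iterator reworked by the hunks above is driven in
a start/next loop.  A minimal sketch (illustrative only, not part of the patch;
the function name walk_backrefs is hypothetical and error handling is trimmed)
of walking all backrefs of one extent with the API as it stands after this
change:

        static int walk_backrefs(struct btrfs_fs_info *fs_info, u64 bytenr)
        {
                struct btrfs_backref_iter *iter;
                int ret;

                iter = btrfs_backref_iter_alloc(fs_info);
                if (!iter)
                        return -ENOMEM;

                /* Position at the first (possibly inline) backref item. */
                ret = btrfs_backref_iter_start(iter, bytenr);
                while (ret == 0) {
                        /* iter->cur_key describes the current backref item. */
                        ret = btrfs_backref_iter_next(iter);
                }

                /*
                 * The inline free/release helpers are no longer exported by
                 * backref.h after this patch, so release the path and free
                 * the iterator by hand.
                 */
                btrfs_free_path(iter->path);
                kfree(iter);
                /* iter_next() returns > 0 when there are no more backrefs. */
                return ret < 0 ? ret : 0;
        }
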
@@ -306,25 +302,6 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr);

 int btrfs_backref_iter_next(struct btrfs_backref_iter *iter);

-static inline bool btrfs_backref_iter_is_inline_ref(
-               struct btrfs_backref_iter *iter)
-{
-       if (iter->cur_key.type == BTRFS_EXTENT_ITEM_KEY ||
-           iter->cur_key.type == BTRFS_METADATA_ITEM_KEY)
-               return true;
-       return false;
-}
-
-static inline void btrfs_backref_iter_release(struct btrfs_backref_iter *iter)
-{
-       iter->bytenr = 0;
-       iter->item_ptr = 0;
-       iter->cur_ptr = 0;
-       iter->end_ptr = 0;
-       btrfs_release_path(iter->path);
-       memset(&iter->cur_key, 0, sizeof(iter->cur_key));
-}
-
 /*
  * Backref cache related structures
  *
@@ -452,83 +429,22 @@ struct btrfs_backref_edge *btrfs_backref_alloc_edge(
 #define LINK_LOWER     (1 << 0)
 #define LINK_UPPER     (1 << 1)

-static inline void btrfs_backref_link_edge(struct btrfs_backref_edge *edge,
-                                          struct btrfs_backref_node *lower,
-                                          struct btrfs_backref_node *upper,
-                                          int link_which)
-{
-       ASSERT(upper && lower && upper->level == lower->level + 1);
-       edge->node[LOWER] = lower;
-       edge->node[UPPER] = upper;
-       if (link_which & LINK_LOWER)
-               list_add_tail(&edge->list[LOWER], &lower->upper);
-       if (link_which & LINK_UPPER)
-               list_add_tail(&edge->list[UPPER], &upper->lower);
-}
-
-static inline void btrfs_backref_free_node(struct btrfs_backref_cache *cache,
-                                          struct btrfs_backref_node *node)
-{
-       if (node) {
-               ASSERT(list_empty(&node->list));
-               ASSERT(list_empty(&node->lower));
-               ASSERT(node->eb == NULL);
-               cache->nr_nodes--;
-               btrfs_put_root(node->root);
-               kfree(node);
-       }
-}
-
-static inline void btrfs_backref_free_edge(struct btrfs_backref_cache *cache,
-                                          struct btrfs_backref_edge *edge)
-{
-       if (edge) {
-               cache->nr_edges--;
-               kfree(edge);
-       }
-}
-
-static inline void btrfs_backref_unlock_node_buffer(
-               struct btrfs_backref_node *node)
-{
-       if (node->locked) {
-               btrfs_tree_unlock(node->eb);
-               node->locked = 0;
-       }
-}
-static inline void btrfs_backref_drop_node_buffer(
-               struct btrfs_backref_node *node)
-{
-       if (node->eb) {
-               btrfs_backref_unlock_node_buffer(node);
-               free_extent_buffer(node->eb);
-               node->eb = NULL;
-       }
-}
-
-/*
- * Drop the backref node from cache without cleaning up its children
- * edges.
- *
- * This can only be called on node without parent edges.
- * The children edges are still kept as is.
- */
-static inline void btrfs_backref_drop_node(struct btrfs_backref_cache *tree,
-                                          struct btrfs_backref_node *node)
-{
-       ASSERT(list_empty(&node->upper));
-
-       btrfs_backref_drop_node_buffer(node);
-       list_del_init(&node->list);
-       list_del_init(&node->lower);
-       if (!RB_EMPTY_NODE(&node->rb_node))
-               rb_erase(&node->rb_node, &tree->rb_root);
-       btrfs_backref_free_node(tree, node);
-}
+void btrfs_backref_link_edge(struct btrfs_backref_edge *edge,
+                            struct btrfs_backref_node *lower,
+                            struct btrfs_backref_node *upper,
+                            int link_which);
+void btrfs_backref_free_node(struct btrfs_backref_cache *cache,
+                            struct btrfs_backref_node *node);
+void btrfs_backref_free_edge(struct btrfs_backref_cache *cache,
+                            struct btrfs_backref_edge *edge);
+void btrfs_backref_unlock_node_buffer(struct btrfs_backref_node *node);
+void btrfs_backref_drop_node_buffer(struct btrfs_backref_node *node);

 void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache,
                                struct btrfs_backref_node *node);
+void btrfs_backref_drop_node(struct btrfs_backref_cache *tree,
+                            struct btrfs_backref_node *node);

 void btrfs_backref_release_cache(struct btrfs_backref_cache *cache);
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 928f512cdb4a74..477f350a8bd09e 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -11,7 +11,6 @@
 #include "raid56.h"
 #include "async-thread.h"
 #include "dev-replace.h"
-#include "rcu-string.h"
 #include "zoned.h"
 #include "file-item.h"
 #include "raid-stripe-tree.h"
@@ -509,8 +508,6 @@ static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
        if (!bioc) {
                /* Single mirror read/write fast path. */
                btrfs_bio(bio)->mirror_num = mirror_num;
-               if (bio_op(bio) != REQ_OP_READ)
-                       btrfs_bio(bio)->orig_physical = smap->physical;
                bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT;
                if (bio_op(bio) != REQ_OP_READ)
                        btrfs_bio(bio)->orig_physical = smap->physical;
@@ -611,8 +608,20 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free)

 static bool should_async_write(struct btrfs_bio *bbio)
 {
+       bool auto_csum_mode = true;
+
+#ifdef CONFIG_BTRFS_DEBUG
+       struct btrfs_fs_devices *fs_devices = bbio->fs_info->fs_devices;
+       enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode);
+
+       if (csum_mode == BTRFS_OFFLOAD_CSUM_FORCE_OFF)
+               return false;
+
+       auto_csum_mode = (csum_mode == BTRFS_OFFLOAD_CSUM_AUTO);
+#endif
+
        /* Submit synchronously if the checksum implementation is fast. */
-       if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags))
+       if (auto_csum_mode && test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags))
                return false;

        /*
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
index bbaed317161a4c..d9dd5276093df0 100644
--- a/fs/btrfs/bio.h
+++ b/fs/btrfs/bio.h
@@ -7,12 +7,14 @@
 #ifndef BTRFS_BIO_H
 #define BTRFS_BIO_H

+#include
 #include
 #include
 #include "tree-checker.h"

 struct btrfs_bio;
 struct btrfs_fs_info;
+struct btrfs_inode;

 #define BTRFS_BIO_INLINE_CSUM_SIZE     64

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 378d9103a2072b..84932d944d5146 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -418,7 +418,7 @@ struct btrfs_caching_control *btrfs_get_caching_control(
        return ctl;
 }

-void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
+static void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
 {
        if (refcount_dec_and_test(&ctl->count))
                kfree(ctl);
@@ -1063,7 +1063,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        bool remove_rsv = false;

        block_group = btrfs_lookup_block_group(fs_info, map->start);
-       BUG_ON(!block_group);
+       if (!block_group)
+               return -ENOENT;
+
        BUG_ON(!block_group->ro);

        trace_btrfs_remove_block_group(block_group);
@@ -1429,7 +1431,7 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
         * group in pinned_extents before we were able to clear the whole block
         * group range from pinned_extents. This means that task can lookup for
         * the block group after we unpinned it from pinned_extents and removed
-        * it, leading to a BUG_ON() at unpin_extent_range().
+        * it, leading to an error at unpin_extent_range().
         */
        mutex_lock(&fs_info->unused_bg_unpin_mutex);
        if (prev_trans) {
@@ -1522,6 +1524,13 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                         * outstanding allocations in this block group.  We do
                         * the ro check in case balance is currently acting on
                         * this block group.
+                        *
+                        * Also bail out if this is the only block group for its
+                        * type, because otherwise we would lose profile
+                        * information from fs_info->avail_*_alloc_bits and the
+                        * next block group of this type would be created with a
+                        * "single" profile (even if we're in a raid fs) because
+                        * fs_info->avail_*_alloc_bits would be 0.
                         */
                        trace_btrfs_skip_unused_block_group(block_group);
                        spin_unlock(&block_group->lock);
@@ -1747,24 +1756,21 @@ static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)

 static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed)
 {
-       const struct btrfs_space_info *space_info = bg->space_info;
-       const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold);
+       const int thresh_pct = btrfs_calc_reclaim_threshold(bg->space_info);
+       u64 thresh_bytes = mult_perc(bg->length, thresh_pct);
        const u64 new_val = bg->used;
        const u64 old_val = new_val + bytes_freed;
-       u64 thresh;

-       if (reclaim_thresh == 0)
+       if (thresh_bytes == 0)
                return false;

-       thresh = mult_perc(bg->length, reclaim_thresh);
-
        /*
         * If we were below the threshold before don't reclaim, we are likely a
         * brand new block group and we don't want to relocate new block groups.
         */
-       if (old_val < thresh)
+       if (old_val < thresh_bytes)
                return false;
-       if (new_val >= thresh)
+       if (new_val >= thresh_bytes)
                return false;
        return true;
 }
@@ -1824,6 +1830,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)

                /* Don't race with allocators so take the groups_sem */
                down_write(&space_info->groups_sem);
+               spin_lock(&space_info->lock);
                spin_lock(&bg->lock);
                if (bg->reserved || bg->pinned || bg->ro) {
                        /*
@@ -1833,6 +1840,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
                         * this block group.
                         */
                        spin_unlock(&bg->lock);
+                       spin_unlock(&space_info->lock);
                        up_write(&space_info->groups_sem);
                        goto next;
                }
@@ -1851,6 +1859,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
                        if (!btrfs_test_opt(fs_info, DISCARD_ASYNC))
                                btrfs_mark_bg_unused(bg);
                        spin_unlock(&bg->lock);
+                       spin_unlock(&space_info->lock);
                        up_write(&space_info->groups_sem);
                        goto next;

@@ -1867,10 +1876,12 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
                 */
                if (!should_reclaim_block_group(bg, bg->length)) {
                        spin_unlock(&bg->lock);
+                       spin_unlock(&space_info->lock);
                        up_write(&space_info->groups_sem);
                        goto next;
                }
                spin_unlock(&bg->lock);
+               spin_unlock(&space_info->lock);

                /*
                 * Get out fast, in case we're read-only or unmounting the
@@ -1903,6 +1914,9 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
                                div64_u64(bg->used * 100, bg->length),
                                div64_u64(zone_unusable * 100, bg->length));
                trace_btrfs_reclaim_block_group(bg);
+               spin_lock(&space_info->lock);
+               space_info->reclaim_count++;
+               spin_unlock(&space_info->lock);
                ret = btrfs_relocate_chunk(fs_info, bg->start);
                if (ret) {
                        btrfs_dec_block_group_ro(bg);
@@ -1911,7 +1925,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)

 next:
-               if (ret)
+               if (ret && !READ_ONCE(space_info->periodic_reclaim))
                        btrfs_mark_bg_to_reclaim(bg);
                btrfs_put_block_group(bg);
@@ -1938,6 +1952,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)

 void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
 {
+       btrfs_reclaim_sweep(fs_info);
        spin_lock(&fs_info->unused_bgs_lock);
        if (!list_empty(&fs_info->reclaim_bgs))
                queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
@@ -3636,6 +3651,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
                        old_val += num_bytes;
                        cache->used = old_val;
                        cache->reserved -= num_bytes;
+                       cache->reclaim_mark = 0;
                        space_info->bytes_reserved -= num_bytes;
                        space_info->bytes_used += num_bytes;
                        space_info->disk_used += num_bytes * factor;
@@ -3649,6 +3665,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
                        space_info->bytes_used -= num_bytes;
                        space_info->disk_used -= num_bytes * factor;

+                       space_info->periodic_reclaim_ready = true;
                        reclaim = should_reclaim_block_group(cache, num_bytes);
                        spin_unlock(&cache->lock);
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 962b11983901a8..8656b38f1fa59d 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -3,9 +3,22 @@
 #ifndef BTRFS_BLOCK_GROUP_H
 #define BTRFS_BLOCK_GROUP_H

+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
 #include "free-space-cache.h"

 struct btrfs_chunk_map;
+struct btrfs_fs_info;
+struct btrfs_inode;
+struct btrfs_trans_handle;

 enum btrfs_disk_cache_state {
        BTRFS_DC_WRITTEN,
@@ -250,6 +263,7 @@ struct btrfs_block_group {
        struct work_struct zone_finish_work;
        struct extent_buffer *last_eb;
        enum btrfs_block_group_size_class size_class;
+       u64 reclaim_mark;
 };

 static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
@@ -297,7 +311,6 @@ void btrfs_wait_nocow_writers(struct btrfs_block_group *bg);
 void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
                                           u64 num_bytes);
 int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait);
-void btrfs_put_caching_control(struct btrfs_caching_control *ctl);
 struct btrfs_caching_control *btrfs_get_caching_control(
                struct btrfs_block_group *cache);
 int btrfs_add_new_free_space(struct btrfs_block_group *block_group,
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index 1043a8142351b2..95c174f9fd4ff7 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -6,7 +6,6 @@
 #include "space-info.h"
 #include "transaction.h"
 #include "block-group.h"
-#include "disk-io.h"
 #include "fs.h"
 #include "accessors.h"

diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index 43a9a6b5a79f46..1f53b967d06919 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -3,8 +3,15 @@
 #ifndef BTRFS_BLOCK_RSV_H
 #define BTRFS_BLOCK_RSV_H

+#include
+#include
+#include
+
 struct btrfs_trans_handle;
 struct btrfs_root;
+struct btrfs_space_info;
+struct btrfs_block_rsv;
+struct btrfs_fs_info;
 enum btrfs_reserve_flush_enum;

 /*
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 7f7c5a92d2b879..100020ca4658ec 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -8,13 +8,32 @@

 #include
 #include
+#include
+#include
+#include
+#include
+#include
+#include
 #include
+#include
+#include
 #include
+#include "block-rsv.h"
+#include "btrfs_inode.h"
 #include "extent_map.h"
 #include "extent_io.h"
+#include "extent-io-tree.h"
 #include "ordered-data.h"
 #include "delayed-inode.h"

+struct extent_state;
+struct posix_acl;
+struct iov_iter;
+struct writeback_control;
+struct btrfs_root;
+struct btrfs_fs_info;
+struct btrfs_trans_handle;
+
 /*
  * Since we search a directory based on f_pos (struct dir_context::pos) we have
  * to start at 2 since '.' and '..' have f_pos of 0 and 1 respectively, so
@@ -41,7 +60,6 @@
         */
        BTRFS_INODE_NEEDS_FULL_SYNC,
        BTRFS_INODE_COPY_EVERYTHING,
-       BTRFS_INODE_IN_DELALLOC_LIST,
        BTRFS_INODE_HAS_PROPS,
        BTRFS_INODE_SNAPSHOT_FLUSH,
        /*
@@ -428,7 +446,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
                              u64 *orig_start, u64 *orig_block_len,
                              u64 *ram_bytes, bool nowait, bool strict);

-void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode);
+void btrfs_del_delalloc_inode(struct btrfs_inode *inode);
 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
 int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index);
 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
@@ -490,8 +508,7 @@ struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
                              struct btrfs_root *root, struct btrfs_path *path);
 struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root);
 struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
-                                   struct page *page, size_t pg_offset,
-                                   u64 start, u64 len);
+                                   struct page *page, u64 start, u64 len);
 int btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode);
 int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 68345f73d429aa..24778d22936869 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -25,8 +25,6 @@
 #include "misc.h"
 #include "ctree.h"
 #include "fs.h"
-#include "disk-io.h"
-#include "transaction.h"
 #include "btrfs_inode.h"
 #include "bio.h"
 #include "ordered-data.h"
@@ -34,8 +32,7 @@
 #include "extent_io.h"
 #include "extent_map.h"
 #include "subpage.h"
-#include "zoned.h"
-#include "file-item.h"
+#include "messages.h"
 #include "super.h"

 static struct bio_set btrfs_compressed_bioset;
@@ -93,20 +90,20 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len)
 }

 static int compression_compress_pages(int type, struct list_head *ws,
-               struct address_space *mapping, u64 start, struct page **pages,
-               unsigned long *out_pages, unsigned long *total_in,
-               unsigned long *total_out)
+               struct address_space *mapping, u64 start, struct folio **folios,
+               unsigned long *out_folios, unsigned long *total_in,
+               unsigned long *total_out)
 {
        switch (type) {
        case BTRFS_COMPRESS_ZLIB:
-               return zlib_compress_pages(ws, mapping, start, pages,
-                                          out_pages, total_in, total_out);
+               return zlib_compress_folios(ws, mapping, start, folios,
+                                           out_folios, total_in, total_out);
        case BTRFS_COMPRESS_LZO:
-               return lzo_compress_pages(ws, mapping, start, pages,
-                                         out_pages, total_in, total_out);
+               return lzo_compress_folios(ws, mapping, start, folios,
+                                          out_folios, total_in, total_out);
        case BTRFS_COMPRESS_ZSTD:
-               return zstd_compress_pages(ws, mapping, start, pages,
-                                          out_pages, total_in, total_out);
+               return zstd_compress_folios(ws, mapping, start, folios,
+                                           out_folios, total_in, total_out);
        case BTRFS_COMPRESS_NONE:
        default:
                /*
@@ -118,7 +115,7 @@ static int compression_compress_pages(int type, struct list_head *ws,
                 * Not a big deal, just need to inform caller that we
                 * haven't allocated any pages yet.
                 */
-               *out_pages = 0;
+               *out_folios = 0;
                return -E2BIG;
        }
 }
@@ -161,11 +158,11 @@ static int compression_decompress(int type, struct list_head *ws,
        }
 }

-static void btrfs_free_compressed_pages(struct compressed_bio *cb)
+static void btrfs_free_compressed_folios(struct compressed_bio *cb)
 {
-       for (unsigned int i = 0; i < cb->nr_pages; i++)
-               btrfs_free_compr_page(cb->compressed_pages[i]);
-       kfree(cb->compressed_pages);
+       for (unsigned int i = 0; i < cb->nr_folios; i++)
+               btrfs_free_compr_folio(cb->compressed_folios[i]);
+       kfree(cb->compressed_folios);
 }

 static int btrfs_decompress_bio(struct compressed_bio *cb);
@@ -226,25 +223,25 @@ static unsigned long btrfs_compr_pool_scan(struct shrinker *sh, struct shrink_co
 /*
  * Common wrappers for page allocation from compression wrappers
  */
-struct page *btrfs_alloc_compr_page(void)
+struct folio *btrfs_alloc_compr_folio(void)
 {
-       struct page *page = NULL;
+       struct folio *folio = NULL;

        spin_lock(&compr_pool.lock);
        if (compr_pool.count > 0) {
-               page = list_first_entry(&compr_pool.list, struct page, lru);
-               list_del_init(&page->lru);
+               folio = list_first_entry(&compr_pool.list, struct folio, lru);
+               list_del_init(&folio->lru);
                compr_pool.count--;
        }
        spin_unlock(&compr_pool.lock);

-       if (page)
-               return page;
+       if (folio)
+               return folio;

-       return alloc_page(GFP_NOFS);
+       return folio_alloc(GFP_NOFS, 0);
 }

-void btrfs_free_compr_page(struct page *page)
+void btrfs_free_compr_folio(struct folio *folio)
 {
        bool do_free = false;

@@ -252,7 +249,7 @@ void btrfs_free_compr_page(struct page *page)
        if (compr_pool.count > compr_pool.thresh) {
                do_free = true;
        } else {
-               list_add(&page->lru, &compr_pool.list);
+               list_add(&folio->lru, &compr_pool.list);
                compr_pool.count++;
        }
        spin_unlock(&compr_pool.lock);
@@ -260,8 +257,8 @@ void btrfs_free_compr_page(struct page *page)
        if (!do_free)
                return;

-       ASSERT(page_ref_count(page) == 1);
-       put_page(page);
+       ASSERT(folio_ref_count(folio) == 1);
+       folio_put(folio);
 }

 static void end_bbio_comprssed_read(struct btrfs_bio *bbio)
@@ -272,7 +269,7 @@ static void end_bbio_comprssed_read(struct btrfs_bio *bbio)
        if (!status)
                status = errno_to_blk_status(btrfs_decompress_bio(cb));

-       btrfs_free_compressed_pages(cb);
+       btrfs_free_compressed_folios(cb);
        btrfs_bio_end_io(cb->orig_bbio, status);
        bio_put(&bbio->bio);
 }
@@ -284,7 +281,7 @@ static void end_bbio_comprssed_read(struct btrfs_bio *bbio)
 static noinline void end_compressed_writeback(const struct compressed_bio *cb)
 {
        struct inode *inode = &cb->bbio.inode->vfs_inode;
-       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+       struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
        unsigned long index = cb->start >> PAGE_SHIFT;
        unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
        struct folio_batch fbatch;
@@ -326,7 +323,7 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work)
        end_compressed_writeback(cb);
        /* Note, our inode could be gone now */

-       btrfs_free_compressed_pages(cb);
+       btrfs_free_compressed_folios(cb);
        bio_put(&cb->bbio.bio);
 }

@@ -345,17 +342,18 @@ static void end_bbio_comprssed_write(struct btrfs_bio *bbio)
        queue_work(fs_info->compressed_write_workers, &cb->write_end_work);
 }

-static void btrfs_add_compressed_bio_pages(struct compressed_bio *cb)
+static void btrfs_add_compressed_bio_folios(struct compressed_bio *cb)
 {
        struct bio *bio = &cb->bbio.bio;
        u32 offset = 0;

        while (offset < cb->compressed_len) {
+               int ret;
                u32 len = min_t(u32, cb->compressed_len - offset, PAGE_SIZE);

                /* Maximum compressed extent is smaller than bio size limit. */
-               __bio_add_page(bio, cb->compressed_pages[offset >> PAGE_SHIFT],
-                              len, 0);
+               ret = bio_add_folio(bio, cb->compressed_folios[offset >> PAGE_SHIFT], len, 0);
+               ASSERT(ret);
                offset += len;
        }
 }
@@ -370,8 +368,8 @@ static void btrfs_add_compressed_bio_pages(struct compressed_bio *cb)
  * the end io hooks.
  */
 void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
-                                  struct page **compressed_pages,
-                                  unsigned int nr_pages,
+                                  struct folio **compressed_folios,
+                                  unsigned int nr_folios,
                                   blk_opf_t write_flags,
                                   bool writeback)
 {
@@ -387,14 +385,14 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
                               end_bbio_comprssed_write);
        cb->start = ordered->file_offset;
        cb->len = ordered->num_bytes;
-       cb->compressed_pages = compressed_pages;
+       cb->compressed_folios = compressed_folios;
        cb->compressed_len = ordered->disk_num_bytes;
        cb->writeback = writeback;
        INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work);
-       cb->nr_pages = nr_pages;
+       cb->nr_folios = nr_folios;
        cb->bbio.bio.bi_iter.bi_sector = ordered->disk_bytenr >> SECTOR_SHIFT;
        cb->bbio.ordered = ordered;
-       btrfs_add_compressed_bio_pages(cb);
+       btrfs_add_compressed_bio_folios(cb);

        btrfs_submit_bio(&cb->bbio, 0);
 }
@@ -415,7 +413,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
                                     struct compressed_bio *cb,
                                     int *memstall, unsigned long *pflags)
 {
-       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+       struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
        unsigned long end_index;
        struct bio *orig_bio = &cb->orig_bbio->bio;
        u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size;
@@ -441,7 +439,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
         * This makes readahead less effective, so here disable readahead for
         * subpage for now, until full compressed write is supported.
         */
-       if (btrfs_sb(inode->i_sb)->sectorsize < PAGE_SIZE)
+       if (fs_info->sectorsize < PAGE_SIZE)
                return 0;

        end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
@@ -602,14 +600,14 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)

        free_extent_map(em);

-       cb->nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
-       cb->compressed_pages = kcalloc(cb->nr_pages, sizeof(struct page *), GFP_NOFS);
-       if (!cb->compressed_pages) {
+       cb->nr_folios = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
+       cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct page *), GFP_NOFS);
+       if (!cb->compressed_folios) {
                ret = BLK_STS_RESOURCE;
                goto out_free_bio;
        }

-       ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages, 0);
+       ret2 = btrfs_alloc_folio_array(cb->nr_folios, cb->compressed_folios, 0);
        if (ret2) {
                ret = BLK_STS_RESOURCE;
                goto out_free_compressed_pages;
@@ -621,7 +619,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
        /* include any pages we added in add_ra-bio_pages */
        cb->len = bbio->bio.bi_iter.bi_size;
        cb->bbio.bio.bi_iter.bi_sector = bbio->bio.bi_iter.bi_sector;
-       btrfs_add_compressed_bio_pages(cb);
+       btrfs_add_compressed_bio_folios(cb);

        if (memstall)
                psi_memstall_leave(&pflags);
@@ -630,7 +628,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
        return;

 out_free_compressed_pages:
-       kfree(cb->compressed_pages);
+       kfree(cb->compressed_folios);
 out_free_bio:
        bio_put(&cb->bbio.bio);
 out:
@@ -977,6 +975,32 @@ static unsigned int btrfs_compress_set_level(int type, unsigned level)
        return level;
 }

+/* A wrapper around filemap_get_folio(), with extra error message. */
+int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start,
+                                    struct folio **in_folio_ret)
+{
+       struct folio *in_folio;
+
+       /*
+        * The compressed write path should have the folio locked already,
+        * thus we only need to grab one reference.
+        */
+       in_folio = filemap_get_folio(mapping, start >> PAGE_SHIFT);
+       if (IS_ERR(in_folio)) {
+               struct btrfs_inode *binode = BTRFS_I(mapping->host);
+               struct btrfs_fs_info *fs_info = binode->root->fs_info;
+
+               btrfs_crit(fs_info,
+               "failed to get page cache, root %lld ino %llu file offset %llu",
+                          binode->root->root_key.objectid, btrfs_ino(binode),
+                          start);
+               ASSERT(0);
+               return -ENOENT;
+       }
+       *in_folio_ret = in_folio;
+       return 0;
+}
+
 /*
  * Given an address space and start and length, compress the bytes into @pages
  * that are allocated on demand.
@@ -997,9 +1021,9 @@ static unsigned int btrfs_compress_set_level(int type, unsigned level)
  * @total_out is an in/out parameter, must be set to the input length and will
  * be also used to return the total number of compressed bytes
  */
-int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
-                        u64 start, struct page **pages,
-                        unsigned long *out_pages,
+int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping,
+                         u64 start, struct folio **folios,
+                         unsigned long *out_folios,
                         unsigned long *total_in,
                         unsigned long *total_out)
 {
@@ -1010,8 +1034,8 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
        level = btrfs_compress_set_level(type, level);
        workspace = get_workspace(type, level);
-       ret = compression_compress_pages(type, workspace, mapping, start, pages,
-                                        out_pages, total_in, total_out);
+       ret = compression_compress_pages(type, workspace, mapping, start, folios,
+                                        out_folios, total_in, total_out);
        put_workspace(type, workspace);
        return ret;
 }
@@ -1039,7 +1063,7 @@ static int btrfs_decompress_bio(struct compressed_bio *cb)
 int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page,
                     unsigned long dest_pgoff, size_t srclen, size_t destlen)
 {
-       struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb);
+       struct btrfs_fs_info *fs_info = page_to_fs_info(dest_page);
        struct list_head *workspace;
        const u32 sectorsize = fs_info->sectorsize;
        int ret;
@@ -1479,11 +1503,6 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
 /*
  * Compression heuristic.
  *
- * For now is's a naive and optimistic 'return true', we'll extend the logic to
- * quickly (compared to direct compression) detect data characteristics
- * (compressible/incompressible) to avoid wasting CPU time on incompressible
- * data.
- *
  * The following types of analysis can be performed:
  * - detect mostly zero data
  * - detect data with low "byte set" size (text, etc)
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index afd7e50d073d4a..a31e8fc938ac55 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -7,10 +7,18 @@
 #define BTRFS_COMPRESSION_H

 #include
+#include
+#include
+#include
+#include
 #include "bio.h"

+struct address_space;
+struct page;
+struct inode;
 struct btrfs_inode;
 struct btrfs_ordered_extent;
+struct btrfs_bio;

 /*
  * We want to make sure that amount of RAM required to uncompress an extent is
@@ -32,14 +40,12 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0);

 #define BTRFS_ZLIB_DEFAULT_LEVEL       3

-struct page;
-
 struct compressed_bio {
-       /* Number of compressed pages in the array */
-       unsigned int nr_pages;
+       /* Number of compressed folios in the array */
+       unsigned int nr_folios;

-       /* the pages with the compressed data on them */
-       struct page **compressed_pages;
+       /* the folios with the compressed data on them */
+       struct folio **compressed_folios;

        /* starting offset in the inode for our pages */
        u64 start;
@@ -79,9 +85,9 @@ static inline unsigned int btrfs_compress_level(unsigned int type_level)
 int __init btrfs_init_compress(void);
 void __cold btrfs_exit_compress(void);

-int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
-                        u64 start, struct page **pages,
-                        unsigned long *out_pages,
+int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping,
+                         u64 start, struct folio **folios,
+                         unsigned long *out_folios,
                         unsigned long *total_in,
                         unsigned long *total_out);
 int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page,
@@ -90,16 +96,16 @@ int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
                              struct compressed_bio *cb, u32 decompressed);

 void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
-                                  struct page **compressed_pages,
-                                  unsigned int nr_pages,
-                                  blk_opf_t write_flags,
-                                  bool writeback);
+                                  struct folio **compressed_folios,
+                                  unsigned int nr_folios,
+                                  blk_opf_t write_flags,
+                                  bool writeback);
 void btrfs_submit_compressed_read(struct btrfs_bio *bbio);

 unsigned int btrfs_compress_str2level(unsigned int type, const char *str);

-struct page *btrfs_alloc_compr_page(void);
-void btrfs_free_compr_page(struct page *page);
+struct folio *btrfs_alloc_compr_folio(void);
+void btrfs_free_compr_folio(struct folio *folio);

 enum btrfs_compression_type {
        BTRFS_COMPRESS_NONE = 0,
@@ -143,8 +149,11 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len);

 int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end);

-int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
-               u64 start, struct page **pages, unsigned long *out_pages,
+int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start,
+                                    struct folio **in_folio_ret);
+
+int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
+               u64 start, struct folio **folios, unsigned long *out_folios,
                unsigned long *total_in, unsigned long *total_out);
 int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
 int zlib_decompress(struct list_head *ws, const u8 *data_in,
@@ -154,8 +163,8 @@ struct list_head *zlib_alloc_workspace(unsigned int level);
 void zlib_free_workspace(struct list_head *ws);
 struct list_head *zlib_get_workspace(unsigned int level);

-int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
-               u64 start, struct page **pages, unsigned long *out_pages,
+int lzo_compress_folios(struct list_head *ws, struct address_space *mapping,
+               u64 start, struct folio **folios, unsigned long *out_folios,
                unsigned long *total_in, unsigned long *total_out);
 int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
 int lzo_decompress(struct list_head *ws, const u8 *data_in,
@@ -164,12 +173,12 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in,
 struct list_head *lzo_alloc_workspace(unsigned int level);
 void lzo_free_workspace(struct list_head *ws);

-int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
-               u64 start, struct page **pages, unsigned long *out_pages,
+int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
+               u64 start, struct folio **folios, unsigned long *out_folios,
                unsigned long *total_in, unsigned long *total_out);
 int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
 int zstd_decompress(struct list_head *ws, const u8 *data_in,
-               struct page *dest_page, unsigned long start_byte, size_t srclen,
+               struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
                size_t destlen);
 void zstd_init_workspace_manager(void);
 void zstd_cleanup_workspace_manager(void);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index e65e012bac5531..aaf53fd84358ea 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -820,7 +820,7 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
        }

        while (low < high) {
-               const int unit_size = folio_size(eb->folios[0]);
+               const int unit_size = eb->folio_size;
                unsigned long oil;
                unsigned long offset;
                struct btrfs_disk_key *tmp;
@@ -4280,6 +4280,10 @@ void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans,
 /*
  * Given a key and some data, insert items into the tree.
  * This does all the path init required, making room in the tree if needed.
+ *
+ * Returns: 0 on success
+ *          -EEXIST if the first key already exists
+ *          < 0 on other errors
  */
 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
@@ -5082,9 +5086,7 @@ int btrfs_previous_extent_item(struct btrfs_root *root,

 int __init btrfs_ctree_init(void)
 {
-       btrfs_path_cachep = kmem_cache_create("btrfs_path",
-                       sizeof(struct btrfs_path), 0,
-                       SLAB_MEM_SPREAD, NULL);
+       btrfs_path_cachep = KMEM_CACHE(btrfs_path, 0);
        if (!btrfs_path_cachep)
                return -ENOMEM;
        return 0;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 70e828d33177d6..c03c58246033bf 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -7,25 +7,24 @@
 #define BTRFS_CTREE_H

 #include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
 #include "locking.h"
 #include "fs.h"
 #include "accessors.h"
+#include "extent-io-tree.h"

+struct extent_buffer;
+struct btrfs_block_rsv;
 struct btrfs_trans_handle;
-struct btrfs_transaction;
-struct btrfs_pending_snapshot;
-struct btrfs_delayed_ref_root;
-struct btrfs_space_info;
 struct btrfs_block_group;
-struct btrfs_ordered_sum;
-struct btrfs_ref;
-struct btrfs_bio;
-struct btrfs_ioctl_encoded_io_args;
-struct btrfs_device;
-struct btrfs_fs_devices;
-struct btrfs_balance_control;
-struct btrfs_delayed_root;
-struct reloc_control;

 /* Read ahead values for struct btrfs_path.reada */
 enum {
@@ -478,8 +477,7 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
        return mapping_gfp_constraint(mapping, ~__GFP_FS);
 }

-int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
-                                  u64 start, u64 end);
+void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end);
 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
                         u64 num_bytes, u64 *actual_bytes);
 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index 5b0b645714183a..ca4cfe4f3f5dcf 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -6,7 +6,6 @@
 #include
 #include "ctree.h"
 #include "disk-io.h"
-#include "print-tree.h"
 #include "transaction.h"
 #include "locking.h"
 #include "accessors.h"
@@ -521,7 +520,7 @@ static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
         * keep_locks set and lowest_level is 1, regardless of the value of
         * path->slots[1].
         */
-       BUG_ON(path->locks[1] == 0);
+       ASSERT(path->locks[1] != 0);
        ret = btrfs_realloc_node(trans, root,
                                 path->nodes[1], 0,
                                 &last_ret,
@@ -810,7 +809,7 @@ static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
 static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
                                     u32 extent_thresh, u64 newer_than, bool locked)
 {
-       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+       struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
        struct extent_map *next;
        bool ret = false;

@@ -861,20 +860,22 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
 * NOTE: Caller should also wait for page writeback after the cluster is
 * prepared, here we don't do writeback wait for each page.
 */
-static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t index)
+static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode,
+                                             u64 folio_start)
 {
+       struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct address_space *mapping = inode->vfs_inode.i_mapping;
        gfp_t mask = btrfs_alloc_write_mask(mapping);
-       u64 page_start = (u64)index << PAGE_SHIFT;
-       u64 page_end = page_start + PAGE_SIZE - 1;
+       u64 folio_end = folio_start + fs_info->folio_size - 1;
        struct extent_state *cached_state = NULL;
-       struct page *page;
+       struct folio *folio;
        int ret;

 again:
-       page = find_or_create_page(mapping, index, mask);
-       if (!page)
-               return ERR_PTR(-ENOMEM);
+       folio = __filemap_get_folio(mapping, folio_start >> PAGE_SHIFT,
+                                   FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
+       if (IS_ERR(folio))
+               return folio;

        /*
         * Since we can defragment files opened read-only, we can encounter
@@ -884,16 +885,16 @@ static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t i
         * executables that explicitly enable them, so this isn't very
         * restrictive.
         */
-       if (PageCompound(page)) {
-               unlock_page(page);
-               put_page(page);
+       if (folio_test_large(folio)) {
+               folio_unlock(folio);
+               folio_put(folio);
                return ERR_PTR(-ETXTBSY);
        }

-       ret = set_page_extent_mapped(page);
+       ret = set_folio_extent_mapped(folio);
        if (ret < 0) {
-               unlock_page(page);
-               put_page(page);
+               folio_unlock(folio);
+               folio_put(folio);
                return ERR_PTR(ret);
        }

@@ -901,24 +902,25 @@ static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t i
        while (1) {
                struct btrfs_ordered_extent *ordered;

-               lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
-               ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
-               unlock_extent(&inode->io_tree, page_start, page_end,
+               lock_extent(&inode->io_tree, folio_start, folio_end, &cached_state);
+               ordered = btrfs_lookup_ordered_range(inode, folio_start,
+                                                    fs_info->folio_size);
+               unlock_extent(&inode->io_tree, folio_start, folio_end,
                              &cached_state);
                if (!ordered)
                        break;

-               unlock_page(page);
+               folio_unlock(folio);
                btrfs_start_ordered_extent(ordered);
                btrfs_put_ordered_extent(ordered);
-               lock_page(page);
+               folio_lock(folio);
                /*
-                * We unlocked the page above, so we need check if it was
+                * We unlocked the folio above, so we need to check if it was
                 * released or not.
                 */
-               if (page->mapping != mapping || !PagePrivate(page)) {
-                       unlock_page(page);
-                       put_page(page);
+               if (folio->mapping != mapping || !folio->private) {
+                       folio_unlock(folio);
+                       folio_put(folio);
                        goto again;
                }
        }
@@ -927,21 +929,21 @@ static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t i
         * Now the page range has no ordered extent any more.  Read the page to
         * make it uptodate.
*/ - if (!PageUptodate(page)) { - btrfs_read_folio(NULL, page_folio(page)); - lock_page(page); - if (page->mapping != mapping || !PagePrivate(page)) { - unlock_page(page); - put_page(page); + if (!folio_test_uptodate(folio)) { + btrfs_read_folio(NULL, folio); + folio_lock(folio); + if (folio->mapping != mapping || !folio->private) { + folio_unlock(folio); + folio_put(folio); goto again; } - if (!PageUptodate(page)) { - unlock_page(page); - put_page(page); + if (!folio_test_uptodate(folio)) { + folio_unlock(folio); + folio_put(folio); return ERR_PTR(-EIO); } } - return page; + return folio; } struct defrag_target_range { @@ -1162,20 +1164,20 @@ static_assert(PAGE_ALIGNED(CLUSTER_SIZE)); */ static int defrag_one_locked_target(struct btrfs_inode *inode, struct defrag_target_range *target, - struct page **pages, int nr_pages, + struct folio **folios, int nr_folios, struct extent_state **cached_state) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_changeset *data_reserved = NULL; const u64 start = target->start; const u64 len = target->len; - unsigned long last_index = (start + len - 1) >> PAGE_SHIFT; - unsigned long start_index = start >> PAGE_SHIFT; - unsigned long first_index = page_index(pages[0]); + unsigned long last_index = (start + len - 1) >> fs_info->folio_shift; + unsigned long start_index = start >> fs_info->folio_shift; + unsigned long first_index = folios[0]->index; int ret = 0; int i; - ASSERT(last_index - first_index + 1 <= nr_pages); + ASSERT(last_index - first_index + 1 <= nr_folios); ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len); if (ret < 0) @@ -1186,10 +1188,10 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, set_extent_bit(&inode->io_tree, start, start + len - 1, EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state); - /* Update the page status */ + /* Update the folio status */ for (i = start_index - first_index; i <= last_index - first_index; i++) { - ClearPageChecked(pages[i]); - btrfs_folio_clamp_set_dirty(fs_info, page_folio(pages[i]), start, len); + folio_clear_checked(folios[i]); + btrfs_folio_clamp_set_dirty(fs_info, folios[i], start, len); } btrfs_delalloc_release_extents(inode, len); extent_changeset_free(data_reserved); @@ -1201,40 +1203,42 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, u32 extent_thresh, u64 newer_than, bool do_compress, u64 *last_scanned_ret) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_state *cached_state = NULL; struct defrag_target_range *entry; struct defrag_target_range *tmp; LIST_HEAD(target_list); - struct page **pages; - const u32 sectorsize = inode->root->fs_info->sectorsize; - u64 last_index = (start + len - 1) >> PAGE_SHIFT; - u64 start_index = start >> PAGE_SHIFT; - unsigned int nr_pages = last_index - start_index + 1; + struct folio **folios; + const u32 sectorsize = fs_info->sectorsize; + u64 last_index = (start + len - 1) >> fs_info->folio_shift; + u64 start_index = start >> fs_info->folio_shift; + unsigned int nr_folios = last_index - start_index + 1; int ret = 0; int i; - ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE); + ASSERT(nr_folios <= (CLUSTER_SIZE >> fs_info->folio_shift)); ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize)); - pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); - if (!pages) + folios = kcalloc(nr_folios, sizeof(struct folio *), GFP_NOFS); + if (!folios) return -ENOMEM; /* Prepare all pages */ - for (i = 0; i < nr_pages; i++) { - pages[i] = 
defrag_prepare_one_page(inode, start_index + i); - if (IS_ERR(pages[i])) { - ret = PTR_ERR(pages[i]); - pages[i] = NULL; - goto free_pages; + for (i = 0; i < nr_folios; i++) { + folios[i] = defrag_prepare_one_folio(inode, + (start_index + i) << fs_info->folio_shift); + if (IS_ERR(folios[i])) { + ret = PTR_ERR(folios[i]); + nr_folios = i; + goto free_folios; } } - for (i = 0; i < nr_pages; i++) - wait_on_page_writeback(pages[i]); + for (i = 0; i < nr_folios; i++) + folio_wait_writeback(folios[i]); /* Lock the pages range */ - lock_extent(&inode->io_tree, start_index << PAGE_SHIFT, - (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, + lock_extent(&inode->io_tree, start_index << fs_info->folio_shift, + (last_index << fs_info->folio_shift) + fs_info->folio_size - 1, &cached_state); /* * Now we have a consistent view about the extent map, re-check @@ -1250,7 +1254,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, goto unlock_extent; list_for_each_entry(entry, &target_list, list) { - ret = defrag_one_locked_target(inode, entry, pages, nr_pages, + ret = defrag_one_locked_target(inode, entry, folios, nr_folios, &cached_state); if (ret < 0) break; @@ -1261,17 +1265,15 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, kfree(entry); } unlock_extent: - unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT, - (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, + unlock_extent(&inode->io_tree, start_index << fs_info->folio_shift, + (last_index << fs_info->folio_shift) + fs_info->folio_size - 1, &cached_state); -free_pages: - for (i = 0; i < nr_pages; i++) { - if (pages[i]) { - unlock_page(pages[i]); - put_page(pages[i]); - } +free_folios: + for (i = 0; i < nr_folios; i++) { + folio_unlock(folios[i]); + folio_put(folios[i]); } - kfree(pages); + kfree(folios); return ret; } @@ -1283,7 +1285,8 @@ static int defrag_one_cluster(struct btrfs_inode *inode, unsigned long max_sectors, u64 *last_scanned_ret) { - const u32 sectorsize = inode->root->fs_info->sectorsize; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u32 sectorsize = fs_info->sectorsize; struct defrag_target_range *entry; struct defrag_target_range *tmp; LIST_HEAD(target_list); @@ -1366,7 +1369,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_to_defrag) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); unsigned long sectors_defragged = 0; u64 isize = i_size_read(inode); u64 cur; @@ -1422,7 +1425,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, * Make writeback start from the beginning of the range, so that the * defrag range can be written sequentially.
*/ - start_index = cur >> PAGE_SHIFT; + start_index = cur >> fs_info->folio_shift; if (start_index < inode->i_mapping->writeback_index) inode->i_mapping->writeback_index = start_index; @@ -1437,8 +1440,8 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, } /* We want the cluster end at page boundary when possible */ - cluster_end = (((cur >> PAGE_SHIFT) + - (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1; + cluster_end = (((cur >> fs_info->folio_shift) + + (SZ_256K >> fs_info->folio_shift)) << fs_info->folio_shift) - 1; cluster_end = min(cluster_end, last_byte); btrfs_inode_lock(BTRFS_I(inode), 0); @@ -1512,9 +1515,7 @@ void __cold btrfs_auto_defrag_exit(void) int __init btrfs_auto_defrag_init(void) { btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", - sizeof(struct inode_defrag), 0, - SLAB_MEM_SPREAD, - NULL); + sizeof(struct inode_defrag), 0, 0, NULL); if (!btrfs_inode_defrag_cachep) return -ENOMEM; diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h index 5a62763528d1b5..878528e086fbe8 100644 --- a/fs/btrfs/defrag.h +++ b/fs/btrfs/defrag.h @@ -3,6 +3,16 @@ #ifndef BTRFS_DEFRAG_H #define BTRFS_DEFRAG_H +#include +#include + +struct inode; +struct file_ra_state; +struct btrfs_fs_info; +struct btrfs_root; +struct btrfs_trans_handle; +struct btrfs_ioctl_defrag_range_args; + int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_to_defrag); diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index acf9f4b6c04402..b3527efd0b4b52 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -6,9 +6,7 @@ #include "block-rsv.h" #include "btrfs_inode.h" #include "space-info.h" -#include "transaction.h" #include "qgroup.h" -#include "block-group.h" #include "fs.h" /* diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h index c5d573f2366e37..ce4f889e4f17b7 100644 --- a/fs/btrfs/delalloc-space.h +++ b/fs/btrfs/delalloc-space.h @@ -3,7 +3,11 @@ #ifndef BTRFS_DELALLOC_SPACE_H #define BTRFS_DELALLOC_SPACE_H +#include + struct extent_changeset; +struct btrfs_inode; +struct btrfs_fs_info; int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); int btrfs_check_data_free_space(struct btrfs_inode *inode, diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 08102883f560a3..dd6f566a383f00 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -28,11 +28,7 @@ static struct kmem_cache *delayed_node_cache; int __init btrfs_delayed_inode_init(void) { - delayed_node_cache = kmem_cache_create("btrfs_delayed_node", - sizeof(struct btrfs_delayed_node), - 0, - SLAB_MEM_SPREAD, - NULL); + delayed_node_cache = KMEM_CACHE(btrfs_delayed_node, 0); if (!delayed_node_cache) return -ENOMEM; return 0; @@ -43,6 +39,17 @@ void __cold btrfs_delayed_inode_exit(void) kmem_cache_destroy(delayed_node_cache); } +void btrfs_init_delayed_root(struct btrfs_delayed_root *delayed_root) +{ + atomic_set(&delayed_root->items, 0); + atomic_set(&delayed_root->items_seq, 0); + delayed_root->nodes = 0; + spin_lock_init(&delayed_root->lock); + init_waitqueue_head(&delayed_root->wait); + INIT_LIST_HEAD(&delayed_root->node_list); + INIT_LIST_HEAD(&delayed_root->prepare_list); +} + static inline void btrfs_init_delayed_node( struct btrfs_delayed_node *delayed_node, struct btrfs_root *root, u64 inode_id) @@ -430,8 +437,6 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) delayed_root = 
delayed_node->root->fs_info->delayed_root; - BUG_ON(!delayed_root); - if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM) root = &delayed_node->ins_root; else @@ -980,7 +985,7 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) if (delayed_node && test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { - BUG_ON(!delayed_node->root); + ASSERT(delayed_node->root); clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); delayed_node->count--; diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 5cceb31bbd16b2..64e115d9749912 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -7,15 +7,23 @@ #ifndef BTRFS_DELAYED_INODE_H #define BTRFS_DELAYED_INODE_H +#include #include #include #include #include #include +#include #include #include #include "ctree.h" +struct btrfs_disk_key; +struct btrfs_fs_info; +struct btrfs_inode; +struct btrfs_root; +struct btrfs_trans_handle; + enum btrfs_delayed_item_type { BTRFS_DELAYED_INSERTION_ITEM, BTRFS_DELAYED_DELETION_ITEM @@ -98,18 +106,7 @@ struct btrfs_delayed_item { char data[] __counted_by(data_len); }; -static inline void btrfs_init_delayed_root( - struct btrfs_delayed_root *delayed_root) -{ - atomic_set(&delayed_root->items, 0); - atomic_set(&delayed_root->items_seq, 0); - delayed_root->nodes = 0; - spin_lock_init(&delayed_root->lock); - init_waitqueue_head(&delayed_root->wait); - INIT_LIST_HEAD(&delayed_root->node_list); - INIT_LIST_HEAD(&delayed_root->prepare_list); -} - +void btrfs_init_delayed_root(struct btrfs_delayed_root *delayed_root); int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, const char *name, int name_len, struct btrfs_inode *dir, diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 891ea2fa263c93..e44e62cf76bc9d 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -1004,6 +1004,52 @@ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info, INIT_LIST_HEAD(&ref->add_list); } +void btrfs_init_generic_ref(struct btrfs_ref *generic_ref, int action, u64 bytenr, + u64 len, u64 parent, u64 owning_root) +{ + generic_ref->action = action; + generic_ref->bytenr = bytenr; + generic_ref->len = len; + generic_ref->parent = parent; + generic_ref->owning_root = owning_root; +} + +void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 root, + u64 mod_root, bool skip_qgroup) +{ +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + /* If @real_root not set, use @root as fallback */ + generic_ref->real_root = mod_root ?: root; +#endif + generic_ref->tree_ref.level = level; + generic_ref->tree_ref.ref_root = root; + generic_ref->type = BTRFS_REF_METADATA; + if (skip_qgroup || !(is_fstree(root) && + (!mod_root || is_fstree(mod_root)))) + generic_ref->skip_qgroup = true; + else + generic_ref->skip_qgroup = false; + +} + +void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ref_root, u64 ino, + u64 offset, u64 mod_root, bool skip_qgroup) +{ +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + /* If @real_root not set, use @root as fallback */ + generic_ref->real_root = mod_root ?: ref_root; +#endif + generic_ref->data_ref.ref_root = ref_root; + generic_ref->data_ref.ino = ino; + generic_ref->data_ref.offset = offset; + generic_ref->type = BTRFS_REF_DATA; + if (skip_qgroup || !(is_fstree(ref_root) && + (!mod_root || is_fstree(mod_root)))) + generic_ref->skip_qgroup = true; + else + generic_ref->skip_qgroup = false; +} + /* * add a delayed tree ref. 
This does all of the accounting required * to make sure the delayed ref is eventually processed before this @@ -1220,6 +1266,25 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, return 0; } +void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) +{ + if (refcount_dec_and_test(&ref->refs)) { + WARN_ON(!RB_EMPTY_NODE(&ref->ref_node)); + switch (ref->type) { + case BTRFS_TREE_BLOCK_REF_KEY: + case BTRFS_SHARED_BLOCK_REF_KEY: + kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + case BTRFS_SHARED_DATA_REF_KEY: + kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); + break; + default: + BUG(); + } + } +} + /* * This does a simple search for the head node for a given extent. Returns the * head node if found, or NULL if not. @@ -1242,31 +1307,19 @@ void __cold btrfs_delayed_ref_exit(void) int __init btrfs_delayed_ref_init(void) { - btrfs_delayed_ref_head_cachep = kmem_cache_create( - "btrfs_delayed_ref_head", - sizeof(struct btrfs_delayed_ref_head), 0, - SLAB_MEM_SPREAD, NULL); + btrfs_delayed_ref_head_cachep = KMEM_CACHE(btrfs_delayed_ref_head, 0); if (!btrfs_delayed_ref_head_cachep) goto fail; - btrfs_delayed_tree_ref_cachep = kmem_cache_create( - "btrfs_delayed_tree_ref", - sizeof(struct btrfs_delayed_tree_ref), 0, - SLAB_MEM_SPREAD, NULL); + btrfs_delayed_tree_ref_cachep = KMEM_CACHE(btrfs_delayed_tree_ref, 0); if (!btrfs_delayed_tree_ref_cachep) goto fail; - btrfs_delayed_data_ref_cachep = kmem_cache_create( - "btrfs_delayed_data_ref", - sizeof(struct btrfs_delayed_data_ref), 0, - SLAB_MEM_SPREAD, NULL); + btrfs_delayed_data_ref_cachep = KMEM_CACHE(btrfs_delayed_data_ref, 0); if (!btrfs_delayed_data_ref_cachep) goto fail; - btrfs_delayed_extent_op_cachep = kmem_cache_create( - "btrfs_delayed_extent_op", - sizeof(struct btrfs_delayed_extent_op), 0, - SLAB_MEM_SPREAD, NULL); + btrfs_delayed_extent_op_cachep = KMEM_CACHE(btrfs_delayed_extent_op, 0); if (!btrfs_delayed_extent_op_cachep) goto fail; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 62d679d40f4f91..b291147cb8ab94 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -6,7 +6,17 @@ #ifndef BTRFS_DELAYED_REF_H #define BTRFS_DELAYED_REF_H +#include #include +#include +#include +#include +#include +#include +#include + +struct btrfs_trans_handle; +struct btrfs_fs_info; /* these are the possible values of struct btrfs_delayed_ref_node->action */ enum btrfs_delayed_ref_action { @@ -308,53 +318,12 @@ static inline u64 btrfs_calc_delayed_ref_csum_bytes(const struct btrfs_fs_info * return btrfs_calc_metadata_size(fs_info, num_csum_items); } -static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref, - int action, u64 bytenr, u64 len, - u64 parent, u64 owning_root) -{ - generic_ref->action = action; - generic_ref->bytenr = bytenr; - generic_ref->len = len; - generic_ref->parent = parent; - generic_ref->owning_root = owning_root; -} - -static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, - u64 root, u64 mod_root, bool skip_qgroup) -{ -#ifdef CONFIG_BTRFS_FS_REF_VERIFY - /* If @real_root not set, use @root as fallback */ - generic_ref->real_root = mod_root ?: root; -#endif - generic_ref->tree_ref.level = level; - generic_ref->tree_ref.ref_root = root; - generic_ref->type = BTRFS_REF_METADATA; - if (skip_qgroup || !(is_fstree(root) && - (!mod_root || is_fstree(mod_root)))) - generic_ref->skip_qgroup = true; - else - generic_ref->skip_qgroup = false; - -} - -static inline void btrfs_init_data_ref(struct 
btrfs_ref *generic_ref, - u64 ref_root, u64 ino, u64 offset, u64 mod_root, - bool skip_qgroup) -{ -#ifdef CONFIG_BTRFS_FS_REF_VERIFY - /* If @real_root not set, use @root as fallback */ - generic_ref->real_root = mod_root ?: ref_root; -#endif - generic_ref->data_ref.ref_root = ref_root; - generic_ref->data_ref.ino = ino; - generic_ref->data_ref.offset = offset; - generic_ref->type = BTRFS_REF_DATA; - if (skip_qgroup || !(is_fstree(ref_root) && - (!mod_root || is_fstree(mod_root)))) - generic_ref->skip_qgroup = true; - else - generic_ref->skip_qgroup = false; -} +void btrfs_init_generic_ref(struct btrfs_ref *generic_ref, int action, u64 bytenr, + u64 len, u64 parent, u64 owning_root); +void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 root, + u64 mod_root, bool skip_qgroup); +void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ref_root, u64 ino, + u64 offset, u64 mod_root, bool skip_qgroup); static inline struct btrfs_delayed_extent_op * btrfs_alloc_delayed_extent_op(void) @@ -369,24 +338,7 @@ btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op) kmem_cache_free(btrfs_delayed_extent_op_cachep, op); } -static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) -{ - if (refcount_dec_and_test(&ref->refs)) { - WARN_ON(!RB_EMPTY_NODE(&ref->ref_node)); - switch (ref->type) { - case BTRFS_TREE_BLOCK_REF_KEY: - case BTRFS_SHARED_BLOCK_REF_KEY: - kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); - break; - case BTRFS_EXTENT_DATA_REF_KEY: - case BTRFS_SHARED_DATA_REF_KEY: - kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); - break; - default: - BUG(); - } - } -} +void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref); static inline u64 btrfs_ref_head_to_space_flags( struct btrfs_delayed_ref_head *head_ref) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 79c4293ddf373f..e24d784898fcd9 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -11,10 +11,8 @@ #include #include "misc.h" #include "ctree.h" -#include "extent_map.h" #include "disk-io.h" #include "transaction.h" -#include "print-tree.h" #include "volumes.h" #include "async-thread.h" #include "dev-replace.h" @@ -1000,8 +998,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, btrfs_sysfs_remove_device(src_device); btrfs_sysfs_update_devid(tgt_device); if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state)) - btrfs_scratch_superblocks(fs_info, src_device->bdev, - src_device->name->str); + btrfs_scratch_superblocks(fs_info, src_device); /* write back the superblocks */ trans = btrfs_start_transaction(root, 0); diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 675082ccec89f8..23e480efe5e6e4 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -6,11 +6,15 @@ #ifndef BTRFS_DEV_REPLACE_H #define BTRFS_DEV_REPLACE_H +#include +#include + struct btrfs_ioctl_dev_replace_args; struct btrfs_fs_info; struct btrfs_trans_handle; struct btrfs_dev_replace; struct btrfs_block_group; +struct btrfs_device; int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info); int btrfs_run_dev_replace(struct btrfs_trans_handle *trans); diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h index e40a226373d7ec..00b3d83d7569e5 100644 --- a/fs/btrfs/dir-item.h +++ b/fs/btrfs/dir-item.h @@ -3,9 +3,15 @@ #ifndef BTRFS_DIR_ITEM_H #define BTRFS_DIR_ITEM_H +#include #include struct fscrypt_str; +struct btrfs_fs_info; +struct btrfs_key; +struct btrfs_path; +struct btrfs_root; +struct btrfs_trans_handle; int 
btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, const struct fscrypt_str *name); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c843563914cad0..32c900433f5d12 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -29,7 +29,6 @@ #include "tree-log.h" #include "free-space-cache.h" #include "free-space-tree.h" -#include "rcu-string.h" #include "dev-replace.h" #include "raid56.h" #include "sysfs.h" @@ -193,7 +192,7 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, struct folio *folio = eb->folios[i]; u64 start = max_t(u64, eb->start, folio_pos(folio)); u64 end = min_t(u64, eb->start + eb->len, - folio_pos(folio) + folio_size(folio)); + folio_pos(folio) + eb->folio_size); u32 len = end - start; ret = btrfs_repair_io_failure(fs_info, 0, start, len, @@ -498,15 +497,15 @@ static int btree_migrate_folio(struct address_space *mapping, static int btree_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct btrfs_fs_info *fs_info; int ret; if (wbc->sync_mode == WB_SYNC_NONE) { + struct btrfs_fs_info *fs_info; if (wbc->for_kupdate) return 0; - fs_info = BTRFS_I(mapping->host)->root->fs_info; + fs_info = inode_to_fs_info(mapping->host); /* this is a bit racy, but that's ok */ ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes, BTRFS_DIRTY_METADATA_THRESH, @@ -529,11 +528,12 @@ static void btree_invalidate_folio(struct folio *folio, size_t offset, size_t length) { struct extent_io_tree *tree; - tree = &BTRFS_I(folio->mapping->host)->io_tree; + + tree = &folio_to_inode(folio)->io_tree; extent_invalidate_folio(tree, folio, offset); btree_release_folio(folio, GFP_NOFS); if (folio_get_private(folio)) { - btrfs_warn(BTRFS_I(folio->mapping->host)->root->fs_info, + btrfs_warn(folio_to_fs_info(folio), "folio private not zero on folio %llu", (unsigned long long)folio_pos(folio)); folio_detach_private(folio); @@ -544,7 +544,7 @@ static void btree_invalidate_folio(struct folio *folio, size_t offset, static bool btree_dirty_folio(struct address_space *mapping, struct folio *folio) { - struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); struct btrfs_subpage_info *spi = fs_info->subpage_info; struct btrfs_subpage *subpage; struct extent_buffer *eb; @@ -1244,6 +1244,7 @@ void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info) btrfs_err(fs_info, "leaked root %s refcount %d", btrfs_root_name(&root->root_key, buf), refcount_read(&root->refs)); + WARN_ON_ONCE(1); while (refcount_read(&root->refs) > 1) btrfs_put_root(root); btrfs_put_root(root); @@ -2239,7 +2240,7 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) struct btrfs_key location; int ret; - BUG_ON(!fs_info->tree_root); + ASSERT(fs_info->tree_root); ret = load_global_roots(tree_root); if (ret) @@ -2821,6 +2822,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) fs_info->sectorsize = 4096; fs_info->sectorsize_bits = ilog2(4096); fs_info->stripesize = 4096; + fs_info->folio_size = PAGE_SIZE; + fs_info->folio_shift = PAGE_SHIFT; /* Default compress algorithm when user does -o compress */ fs_info->compress_type = BTRFS_COMPRESS_ZLIB; @@ -2839,6 +2842,7 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block int ret; fs_info->sb = sb; + /* Temporary fixed values for block size until we read the superblock. 
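+	 * (BTRFS_BDEV_BLOCKSIZE is 4K; these are corrected to the real sector
+	 * size in open_ctree() once the super block has been read.)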
*/ sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE; sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE); @@ -3313,6 +3317,15 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; fs_info->stripesize = stripesize; + if (sectorsize > PAGE_SIZE) { + /* For future multi-page sectorsize support */ + fs_info->folio_size = sectorsize; + fs_info->folio_shift = fs_info->sectorsize_bits; + } else { + fs_info->folio_size = PAGE_SIZE; + fs_info->folio_shift = PAGE_SHIFT; + } + /* * Handle the space caching options appropriately now that we have the * super block loaded and validated. @@ -3356,6 +3369,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super); sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE); + /* Update the values for the current filesystem. */ sb->s_blocksize = sectorsize; sb->s_blocksize_bits = blksize_bits(sectorsize); memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE); @@ -4626,7 +4640,7 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) struct inode *inode = NULL; btrfs_inode = list_first_entry(&splice, struct btrfs_inode, delalloc_inodes); - __btrfs_del_delalloc_inode(root, btrfs_inode); + btrfs_del_delalloc_inode(btrfs_inode); spin_unlock(&root->delalloc_lock); /* @@ -4925,7 +4939,14 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret < 0) goto error; - BUG_ON(ret == 0); /* Corruption */ + if (ret == 0) { + /* + * Key with offset -1 found, there would have to exist a root + * with such id, but this is out of valid range. + */ + ret = -EUCLEAN; + goto error; + } if (path->slots[0] > 0) { slot = path->slots[0] - 1; l = path->nodes[0]; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index eb3473d1c1ac1b..76eb53fe7a1145 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -6,6 +6,22 @@ #ifndef BTRFS_DISK_IO_H #define BTRFS_DISK_IO_H +#include +#include +#include "ctree.h" +#include "fs.h" + +struct block_device; +struct super_block; +struct extent_buffer; +struct btrfs_device; +struct btrfs_fs_devices; +struct btrfs_fs_info; +struct btrfs_super_block; +struct btrfs_trans_handle; +struct btrfs_tree_parent_check; +struct btrfs_transaction; + #define BTRFS_SUPER_MIRROR_MAX 3 #define BTRFS_SUPER_MIRROR_SHIFT 12 @@ -25,10 +41,6 @@ static inline u64 btrfs_sb_offset(int mirror) return BTRFS_SUPER_INFO_OFFSET; } -struct btrfs_device; -struct btrfs_fs_devices; -struct btrfs_tree_parent_check; - void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info); void btrfs_init_fs_info(struct btrfs_fs_info *fs_info); struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 744a02b7fd6717..8398d345ec5b91 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -5,7 +5,6 @@ #include "ctree.h" #include "disk-io.h" #include "btrfs_inode.h" -#include "print-tree.h" #include "export.h" #include "accessors.h" #include "super.h" @@ -174,8 +173,15 @@ struct dentry *btrfs_get_parent(struct dentry *child) ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto fail; + if (ret == 0) { + /* + * Key with offset of -1 found, there would have to exist an + * inode with such number or a root with such id.
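+	 * (The search key uses an offset of (u64)-1, which can never match an
+	 * existing item exactly, so btrfs_search_slot() returning 0 here can
+	 * only mean the tree is corrupted.)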
+ */ + ret = -EUCLEAN; + goto fail; + } - BUG_ON(ret == 0); /* Key with offset of -1 found */ if (path->slots[0] == 0) { ret = -ENOENT; goto fail; @@ -215,7 +221,7 @@ static int btrfs_get_name(struct dentry *parent, char *name, { struct inode *inode = d_inode(child); struct inode *dir = d_inode(parent); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_path *path; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_inode_ref *iref; diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h index eba6bc4f5a619f..464582273af926 100644 --- a/fs/btrfs/export.h +++ b/fs/btrfs/export.h @@ -4,6 +4,10 @@ #define BTRFS_EXPORT_H #include +#include + +struct dentry; +struct super_block; extern const struct export_operations btrfs_export_ops; diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index e3ee5449cc4af7..c09b428823d76d 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -6,7 +6,6 @@ #include "ctree.h" #include "extent-io-tree.h" #include "btrfs_inode.h" -#include "misc.h" static struct kmem_cache *extent_state_cache; @@ -48,6 +47,7 @@ static inline void btrfs_extent_state_leak_debug_check(void) extent_state_in_tree(state), refcount_read(&state->refs)); list_del(&state->leak_list); + WARN_ON_ONCE(1); kmem_cache_free(extent_state_cache, state); } } @@ -1883,8 +1883,8 @@ void __cold extent_state_free_cachep(void) int __init extent_state_init_cachep(void) { extent_state_cache = kmem_cache_create("btrfs_extent_state", - sizeof(struct extent_state), 0, - SLAB_MEM_SPREAD, NULL); + sizeof(struct extent_state), 0, 0, + NULL); if (!extent_state_cache) return -ENOMEM; diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index ebe6390d65e9dd..9d3a52d8f59a80 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -3,9 +3,16 @@ #ifndef BTRFS_EXTENT_IO_TREE_H #define BTRFS_EXTENT_IO_TREE_H +#include +#include +#include +#include +#include #include "misc.h" struct extent_changeset; +struct btrfs_fs_info; +struct btrfs_inode; /* Bits for the extent state */ enum { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8e8cc11112772d..d4c70cdd3caf20 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -18,7 +18,7 @@ #include #include "ctree.h" #include "extent-tree.h" -#include "tree-log.h" +#include "transaction.h" #include "disk-io.h" #include "print-tree.h" #include "volumes.h" @@ -26,14 +26,11 @@ #include "locking.h" #include "free-space-cache.h" #include "free-space-tree.h" -#include "sysfs.h" #include "qgroup.h" #include "ref-verify.h" #include "space-info.h" #include "block-rsv.h" -#include "delalloc-space.h" #include "discard.h" -#include "rcu-string.h" #include "zoned.h" #include "dev-replace.h" #include "fs.h" @@ -2399,7 +2396,14 @@ static noinline int check_committed_ref(struct btrfs_root *root, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) goto out; - BUG_ON(ret == 0); /* Corruption */ + if (ret == 0) { + /* + * Key with offset -1 found, there would have to exist an extent + * item with such offset, but this is out of the valid range. 
+ */ + ret = -EUCLEAN; + goto out; + } ret = -ENOENT; if (path->slots[0] == 0) @@ -2780,6 +2784,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, u64 total_unpinned = 0; u64 empty_cluster = 0; bool readonly; + int ret = 0; while (start <= end) { readonly = false; @@ -2789,7 +2794,11 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, btrfs_put_block_group(cache); total_unpinned = 0; cache = btrfs_lookup_block_group(fs_info, start); - BUG_ON(!cache); /* Logic error */ + if (cache == NULL) { + /* Logic error, something removed the block group. */ + ret = -EUCLEAN; + goto out; + } cluster = fetch_cluster_info(fs_info, cache->space_info, @@ -2858,7 +2867,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, if (cache) btrfs_put_block_group(cache); - return 0; +out: + return ret; } int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) @@ -2888,7 +2898,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) end + 1 - start, NULL); clear_extent_dirty(unpin, start, end, &cached_state); - unpin_extent_range(fs_info, start, end, true); + ret = unpin_extent_range(fs_info, start, end, true); + BUG_ON(ret); mutex_unlock(&fs_info->unused_bg_unpin_mutex); free_extent_state(cached_state); cond_resched(); @@ -3447,16 +3458,17 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, u64 parent, int last_ref) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_ref generic_ref = { 0 }; struct btrfs_block_group *bg; int ret; - btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, - buf->start, buf->len, parent, btrfs_header_owner(buf)); - btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), - root_id, 0, false); - if (root_id != BTRFS_TREE_LOG_OBJECTID) { + struct btrfs_ref generic_ref = { 0 }; + + btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, + buf->start, buf->len, parent, + btrfs_header_owner(buf)); + btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), + root_id, 0, false); btrfs_ref_tree_mod(fs_info, &generic_ref); ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL); BUG_ON(ret); /* -ENOMEM */ @@ -4950,7 +4962,7 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 root_objectid = root->root_key.objectid; u64 owning_root = root_objectid; - BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); + ASSERT(root_objectid != BTRFS_TREE_LOG_OBJECTID); if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root)) owning_root = root->relocation_src_root; @@ -5176,8 +5188,16 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, parent = ins.objectid; flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; owning_root = reloc_src_root; - } else - BUG_ON(parent > 0); + } else { + if (unlikely(parent > 0)) { + /* + * Roots other than the reloc tree don't expect a start + * offset of a parent block. + */ + ret = -EUCLEAN; + goto out_free_reserved; + } + } if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { extent_op = btrfs_alloc_delayed_extent_op(); @@ -6167,10 +6187,13 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, return ret; } -int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, - u64 start, u64 end) +/* + * Unpin the extent range in an error context and don't add the space back. + * Errors are not propagated further.
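+ * (The return type is void because this runs on error/abort cleanup paths
+ * where the caller has no reasonable way to act on a failure anyway.)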
+ */ +void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end) { - return unpin_extent_range(fs_info, start, end, false); + unpin_extent_range(fs_info, start, end, false); } /* diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index 2e066035cceeea..af9f8800d5aca5 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -3,11 +3,21 @@ #ifndef BTRFS_EXTENT_TREE_H #define BTRFS_EXTENT_TREE_H +#include #include "misc.h" #include "block-group.h" +#include "locking.h" +struct extent_buffer; struct btrfs_free_cluster; +struct btrfs_fs_info; +struct btrfs_root; +struct btrfs_path; +struct btrfs_ref; +struct btrfs_disk_key; struct btrfs_delayed_ref_head; +struct btrfs_delayed_ref_root; +struct btrfs_extent_inline_ref; enum btrfs_extent_allocation_policy { BTRFS_EXTENT_ALLOC_CLUSTERED, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8b4bef05e22217..87e3af4a3847dd 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -14,7 +14,6 @@ #include #include #include -#include "misc.h" #include "extent_io.h" #include "extent-io-tree.h" #include "extent_map.h" @@ -22,7 +21,6 @@ #include "btrfs_inode.h" #include "bio.h" #include "locking.h" -#include "rcu-string.h" #include "backref.h" #include "disk-io.h" #include "subpage.h" @@ -78,10 +76,11 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) eb = list_first_entry(&fs_info->allocated_ebs, struct extent_buffer, leak_list); pr_err( - "BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n", + "BTRFS: buffer leak start %llu len %u refs %d bflags %lu owner %llu\n", eb->start, eb->len, atomic_read(&eb->refs), eb->bflags, btrfs_header_owner(eb)); list_del(&eb->leak_list); + WARN_ON_ONCE(1); kmem_cache_free(extent_buffer_cache, eb); } spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); @@ -147,8 +146,8 @@ static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret) int __init extent_buffer_init_cachep(void) { extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", - sizeof(struct extent_buffer), 0, - SLAB_MEM_SPREAD, NULL); + sizeof(struct extent_buffer), 0, 0, + NULL); if (!extent_buffer_cache) return -ENOMEM; @@ -207,7 +206,7 @@ static void __process_pages_contig(struct address_space *mapping, struct page *locked_page, u64 start, u64 end, unsigned long page_ops) { - struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); pgoff_t start_index = start >> PAGE_SHIFT; pgoff_t end_index = end >> PAGE_SHIFT; pgoff_t index = start_index; @@ -251,7 +250,7 @@ static noinline int lock_delalloc_pages(struct inode *inode, u64 start, u64 end) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct address_space *mapping = inode->i_mapping; pgoff_t start_index = start >> PAGE_SHIFT; pgoff_t end_index = end >> PAGE_SHIFT; @@ -323,7 +322,7 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, struct page *locked_page, u64 *start, u64 *end) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; const u64 orig_start = *start; const u64 orig_end = *end; @@ -433,7 +432,7 @@ static bool btrfs_verify_page(struct page *page, u64 start) static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) { - struct btrfs_fs_info *fs_info = 
btrfs_sb(page->mapping->host->i_sb); + struct btrfs_fs_info *fs_info = page_to_fs_info(page); struct folio *folio = page_folio(page); ASSERT(page_offset(page) <= start && @@ -462,16 +461,15 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) */ static void end_bbio_data_write(struct btrfs_bio *bbio) { + struct btrfs_fs_info *fs_info = bbio->fs_info; struct bio *bio = &bbio->bio; int error = blk_status_to_errno(bio->bi_status); struct folio_iter fi; + const u32 sectorsize = fs_info->sectorsize; ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_folio_all(fi, bio) { struct folio *folio = fi.folio; - struct inode *inode = folio->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - const u32 sectorsize = fs_info->sectorsize; u64 start = folio_pos(folio) + fi.offset; u32 len = fi.length; @@ -593,22 +591,17 @@ static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) */ static void end_bbio_data_read(struct btrfs_bio *bbio) { + struct btrfs_fs_info *fs_info = bbio->fs_info; struct bio *bio = &bbio->bio; struct processed_extent processed = { 0 }; struct folio_iter fi; - /* - * The offset to the beginning of a bio, since one bio can never be - * larger than UINT_MAX, u32 here is enough. - */ - u32 bio_offset = 0; + const u32 sectorsize = fs_info->sectorsize; ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_folio_all(fi, &bbio->bio) { bool uptodate = !bio->bi_status; struct folio *folio = fi.folio; struct inode *inode = folio->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - const u32 sectorsize = fs_info->sectorsize; u64 start; u64 end; u32 len; @@ -667,16 +660,43 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) end_page_read(folio_page(folio, 0), uptodate, start, len); endio_readpage_release_extent(&processed, BTRFS_I(inode), start, end, uptodate); - - ASSERT(bio_offset + len > bio_offset); - bio_offset += len; - } /* Release the last extent */ endio_readpage_release_extent(&processed, NULL, 0, 0, false); bio_put(bio); } +/* + * Populate every free slot in a provided array with folios. + * + * @nr_folios: number of folios to allocate + * @folio_array: the array to fill with folios; any existing non-null entries in + * the array will be skipped + * @extra_gfp: the extra GFP flags for the allocation. + * + * Return: 0 if all folios were able to be allocated; + * -ENOMEM otherwise, the partially allocated folios will be freed and + * the array slots zeroed + */ +int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array, + gfp_t extra_gfp) +{ + for (int i = 0; i < nr_folios; i++) { + if (folio_array[i]) + continue; + folio_array[i] = folio_alloc(GFP_NOFS | extra_gfp, 0); + if (!folio_array[i]) + goto error; + } + return 0; +error: + for (int i = 0; i < nr_folios; i++) { + if (folio_array[i]) + folio_put(folio_array[i]); + } + return -ENOMEM; +} + /* * Populate every free slot in a provided array with pages. * @@ -726,18 +746,40 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, * * For now, the folios populated are always in order 0 (aka, single page).
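 *
 * With the order parameter introduced below that is no longer strictly
 * true; a rough sketch of the order aware strategy (assuming a non-zero
 * order is passed in) is:
 *
 *	folio = folio_alloc(GFP_NOFS | __GFP_NOWARN | __GFP_NORETRY, order);
 *	if (!folio)
 *		fall back to order 0 pages via btrfs_alloc_page_array()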
*/ -static int alloc_eb_folio_array(struct extent_buffer *eb, gfp_t extra_gfp) +static int alloc_eb_folio_array(struct extent_buffer *eb, gfp_t extra_gfp, + int order) { struct page *page_array[INLINE_EXTENT_BUFFER_PAGES] = { 0 }; int num_pages = num_extent_pages(eb); int ret; + if (order) { + /* + * For higher order folio allocation we discard the extra_gfp + * (it should only be __GFP_NOFAIL, which conflicts with higher + * order folios). + * + * Instead we want no warning if the allocation fails and no + * extra retries (to get a faster allocation), as we're + * completely fine falling back to lower order folios. + */ + eb->folios[0] = folio_alloc(GFP_NOFS | __GFP_NOWARN | + __GFP_NORETRY, order); + if (eb->folios[0]) { + eb->folio_size = folio_size(eb->folios[0]); + eb->folio_shift = folio_shift(eb->folios[0]); + return 0; + } + /* Fallback to 0 order (single page) folios. */ + } ret = btrfs_alloc_page_array(num_pages, page_array, extra_gfp); if (ret < 0) return ret; for (int i = 0; i < num_pages; i++) eb->folios[i] = page_folio(page_array[i]); + eb->folio_size = PAGE_SIZE; + eb->folio_shift = PAGE_SHIFT; return 0; } @@ -827,7 +869,7 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, u64 disk_bytenr, struct page *page, size_t size, unsigned long pg_offset) { - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct btrfs_inode *inode = page_to_inode(page); ASSERT(pg_offset + size <= PAGE_SIZE); ASSERT(bio_ctrl->end_io_func); @@ -936,17 +978,21 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb, int set_page_extent_mapped(struct page *page) { - struct folio *folio = page_folio(page); + return set_folio_extent_mapped(page_folio(page)); +} + +int set_folio_extent_mapped(struct folio *folio) +{ struct btrfs_fs_info *fs_info; - ASSERT(page->mapping); + ASSERT(folio->mapping); if (folio_test_private(folio)) return 0; - fs_info = btrfs_sb(page->mapping->host->i_sb); + fs_info = folio_to_fs_info(folio); - if (btrfs_is_subpage(fs_info, page->mapping)) + if (btrfs_is_subpage(fs_info, folio->mapping)) return btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA); folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE); @@ -963,20 +1009,21 @@ void clear_page_extent_mapped(struct page *page) if (!folio_test_private(folio)) return; - fs_info = btrfs_sb(page->mapping->host->i_sb); + fs_info = page_to_fs_info(page); if (btrfs_is_subpage(fs_info, page->mapping)) return btrfs_detach_subpage(fs_info, folio); folio_detach_private(folio); } -static struct extent_map * -__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, +static struct extent_map *__get_extent_map(struct inode *inode, struct page *page, u64 start, u64 len, struct extent_map **em_cached) { struct extent_map *em; - if (em_cached && *em_cached) { + ASSERT(em_cached); + + if (*em_cached) { em = *em_cached; if (extent_map_in_tree(em) && start >= em->start && start < extent_map_end(em)) { @@ -988,8 +1035,8 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, *em_cached = NULL; } - em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len); - if (em_cached && !IS_ERR(em)) { + em = btrfs_get_extent(BTRFS_I(inode), page, start, len); + if (!IS_ERR(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); *em_cached = em; @@ -1007,7 +1054,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start) { struct inode *inode = page->mapping->host; - struct btrfs_fs_info *fs_info =
btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); u64 start = page_offset(page); const u64 end = start + PAGE_SIZE - 1; u64 cur = start; @@ -1018,7 +1065,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, int ret = 0; size_t pg_offset = 0; size_t iosize; - size_t blocksize = inode->i_sb->s_blocksize; + size_t blocksize = fs_info->sectorsize; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; ret = set_page_extent_mapped(page); @@ -1051,8 +1098,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, end_page_read(page, true, cur, iosize); break; } - em = __get_extent_map(inode, page, pg_offset, cur, - end - cur + 1, em_cached); + em = __get_extent_map(inode, page, cur, end - cur + 1, em_cached); if (IS_ERR(em)) { unlock_extent(tree, cur, end, NULL); end_page_read(page, false, cur, end + 1 - cur); @@ -1157,15 +1203,18 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, int btrfs_read_folio(struct file *file, struct folio *folio) { struct page *page = &folio->page; - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct btrfs_inode *inode = page_to_inode(page); u64 start = page_offset(page); u64 end = start + PAGE_SIZE - 1; struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ }; + struct extent_map *em_cached = NULL; int ret; btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - ret = btrfs_do_readpage(page, NULL, &bio_ctrl, NULL); + ret = btrfs_do_readpage(page, &em_cached, &bio_ctrl, NULL); + free_extent_map(em_cached); + /* * If btrfs_do_readpage() failed we will want to submit the assembled * bio to do the cleanup. @@ -1180,9 +1229,11 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start) { - struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host); + struct btrfs_inode *inode = page_to_inode(pages[0]); int index; + ASSERT(em_cached); + btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); for (index = 0; index < nr_pages; index++) { @@ -1371,7 +1422,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, continue; } - em = btrfs_get_extent(inode, NULL, 0, cur, len); + em = btrfs_get_extent(inode, NULL, cur, len); if (IS_ERR(em)) { ret = PTR_ERR_OR_ZERO(em); goto out_error; @@ -1739,10 +1790,10 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, folio_lock(folio); folio_clear_dirty_for_io(folio); folio_start_writeback(folio); - ret = bio_add_folio(&bbio->bio, folio, folio_size(folio), 0); + ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0); ASSERT(ret); wbc_account_cgroup_owner(wbc, folio_page(folio, 0), - folio_size(folio)); + eb->folio_size); wbc->nr_to_write -= folio_nr_pages(folio); folio_unlock(folio); } @@ -1766,7 +1817,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, */ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) { - struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + struct btrfs_fs_info *fs_info = page_to_fs_info(page); struct folio *folio = page_folio(page); int submitted = 0; u64 page_start = page_offset(page); @@ -1857,7 +1908,7 @@ static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx) if (!folio_test_private(folio)) return 0; - if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) + if (page_to_fs_info(page)->nodesize < PAGE_SIZE) return submit_eb_subpage(page, wbc); 
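	/*
	 * (For nodesize >= PAGE_SIZE the folio's private pointer holds the
	 * extent buffer, and mapping->i_private_lock serializes attaching and
	 * detaching it, hence the locking that follows.)
	 */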
spin_lock(&mapping->i_private_lock); @@ -1915,7 +1966,7 @@ int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc) { struct btrfs_eb_write_context ctx = { .wbc = wbc }; - struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; + struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); int ret = 0; int done = 0; int nr_to_write_done = 0; @@ -2203,7 +2254,7 @@ void extent_write_locked_range(struct inode *inode, struct page *locked_page, bool found_error = false; int ret = 0; struct address_space *mapping = inode->i_mapping; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); const u32 sectorsize = fs_info->sectorsize; loff_t i_size = i_size_read(inode); u64 cur = start; @@ -2309,7 +2360,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree, struct extent_state *cached_state = NULL; u64 start = folio_pos(folio); u64 end = start + folio_size(folio) - 1; - size_t blocksize = folio->mapping->host->i_sb->s_blocksize; + size_t blocksize = folio_to_fs_info(folio)->sectorsize; /* This function is only called for the btree inode */ ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO); @@ -2378,7 +2429,7 @@ int try_release_extent_mapping(struct page *page, gfp_t mask) struct extent_map *em; u64 start = page_offset(page); u64 end = start + PAGE_SIZE - 1; - struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host); + struct btrfs_inode *btrfs_inode = page_to_inode(page); struct extent_io_tree *tree = &btrfs_inode->io_tree; struct extent_map_tree *map = &btrfs_inode->extent_tree; @@ -2453,19 +2504,96 @@ int try_release_extent_mapping(struct page *page, gfp_t mask) return try_release_extent_state(tree, page, mask); } +struct btrfs_fiemap_entry { + u64 offset; + u64 phys; + u64 log_len; + u64 phys_len; + u32 flags; +}; + /* - * To cache previous fiemap extent + * Indicate to the caller of emit_fiemap_extent() that it needs to unlock the file + * range from the inode's io tree, unlock the subvolume tree search path, flush + * the fiemap cache and relock the file range and re-search the subvolume tree. + * The value here is something negative that can't be confused with a valid + * errno value and different from 1 because that's also a return value from + * fiemap_fill_next_extent() and also it's often used to mean some btree search + * did not find a key, so make it some distinct negative value. + */ +#define BTRFS_FIEMAP_FLUSH_CACHE (-(MAX_ERRNO + 1)) + +/* + * Used to: + * + * - Cache the next entry to be emitted to the fiemap buffer, so that we can + * merge extents that are contiguous and can be grouped as a single one; * - * Will be used for merging fiemap extent + * - Store extents ready to be written to the fiemap buffer in an intermediary + * buffer. This intermediary buffer is to ensure that in case the fiemap + * buffer is memory mapped to the fiemap target file, we don't deadlock + * during btrfs_page_mkwrite(). This is because during fiemap we are locking + * an extent range in order to prevent races with delalloc flushing and + * ordered extent completion, which is needed in order to reliably detect + * delalloc in holes and prealloc extents. And this can lead to a deadlock + * if the fiemap buffer is memory mapped to the file we are running fiemap + * against (a silly, useless in practice scenario, but possible) because + * btrfs_page_mkwrite() will try to lock the same extent range. */ struct fiemap_cache { + /* An array of ready fiemap entries.
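+	 * (Sized so that the array fills one page; extent_fiemap() below sets
+	 * entries_size to PAGE_SIZE / sizeof(struct btrfs_fiemap_entry), i.e.
+	 * roughly a hundred entries with 4K pages.)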
*/ + struct btrfs_fiemap_entry *entries; + /* Number of entries in the entries array. */ + int entries_size; + /* Index of the next entry in the entries array to write to. */ + int entries_pos; + /* + * Once the entries array is full, this indicates the offset of the + * next file extent item we must search for in the inode's subvolume + * tree after unlocking the extent range in the inode's io tree and + * releasing the search path. + */ + u64 next_search_offset; + /* + * This matches struct fiemap_extent_info::fi_mapped_extents, we use it + * to count the extents we have emitted ourselves and stop instead of relying on + * fiemap_fill_next_extent() because we buffer ready fiemap entries in + * the @entries array, and we want to stop as soon as we hit the max + * amount of extents to map, not just to save time but also to make the + * logic at extent_fiemap() simpler. + */ + unsigned int extents_mapped; + /* Fields for the cached extent (the unsubmitted, not yet ready, extent). */ u64 offset; u64 phys; - u64 len; + u64 log_len; + u64 phys_len; u32 flags; bool cached; }; +static int flush_fiemap_cache(struct fiemap_extent_info *fieinfo, + struct fiemap_cache *cache) +{ + for (int i = 0; i < cache->entries_pos; i++) { + struct btrfs_fiemap_entry *entry = &cache->entries[i]; + int ret; + + ret = fiemap_fill_next_extent(fieinfo, entry->offset, + entry->phys, entry->log_len, + entry->phys_len, entry->flags); + /* + * Ignore 1 (reached max entries) because we keep track of that + * ourselves in emit_fiemap_extent(). + */ + if (ret < 0) + return ret; + } + cache->entries_pos = 0; + + return 0; +} + /* * Helper to submit fiemap extent. * @@ -2478,10 +2606,11 @@ struct fiemap_cache { */ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, struct fiemap_cache *cache, - u64 offset, u64 phys, u64 len, u32 flags) + u64 offset, u64 phys, u64 log_len, + u64 phys_len, u32 flags) { + struct btrfs_fiemap_entry *entry; u64 cache_end; - int ret = 0; /* Set at the end of extent_fiemap(). */ ASSERT((flags & FIEMAP_EXTENT_LAST) == 0); @@ -2494,7 +2623,9 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, * find an extent that starts at an offset behind the end offset of the * previous extent we processed. This happens if fiemap is called * without FIEMAP_FLAG_SYNC and there are ordered extents completing - * while we call btrfs_next_leaf() (through fiemap_next_leaf_item()). + * after we had to unlock the file range, release the search path, emit + * the fiemap extents stored in the buffer (cache->entries array) and + * then lock the remainder of the range and re-search the btree. * * For example we are in leaf X processing its last item, which is the * file extent item for file range [512K, 1M[, and after @@ -2519,7 +2650,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, * or equals to what we have in cache->offset. We deal with this as * described below. */ - cache_end = cache->offset + cache->len; + cache_end = cache->offset + cache->log_len; if (cache_end > offset) { if (offset == cache->offset) { /* @@ -2543,10 +2674,10 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, * where a previously found file extent item was split * due to an ordered extent completing.
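 *
 * As a concrete example of the adjustment below: with a cached entry for
 * the range [512K, 1M[ and a new file extent item starting at offset
 * 768K, the cached entry is trimmed to a log_len of 768K - 512K = 256K
 * and emitted, and the new extent is cached afterwards.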
*/ - cache->len = offset - cache->offset; + cache->log_len = offset - cache->offset; goto emit; } else { - const u64 range_end = offset + len; + const u64 range_end = offset + log_len; /* * The offset of the file extent item we have just found @@ -2583,7 +2714,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, phys += cache_end - offset; offset = cache_end; - len = range_end - cache_end; + log_len = range_end - cache_end; goto emit; } } @@ -2593,30 +2724,58 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, * 1) Their logical addresses are continuous * * 2) Their physical addresses are continuous - * So truly compressed (physical size smaller than logical size) - * extents won't get merged with each other * * 3) Share same flags * + * 4) Not compressed */ - if (cache->offset + cache->len == offset && - cache->phys + cache->len == phys && - cache->flags == flags) { - cache->len += len; + if (cache->offset + cache->log_len == offset && + cache->phys + cache->log_len == phys && + cache->flags == flags && + !(flags & FIEMAP_EXTENT_ENCODED)) { + cache->log_len += log_len; + cache->phys_len += phys_len; return 0; } emit: /* Not mergeable, need to submit cached one */ - ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, - cache->len, cache->flags); - cache->cached = false; - if (ret) - return ret; + + if (cache->entries_pos == cache->entries_size) { + /* + * We will need to re-search from the end offset of the last + * stored extent and not from the current offset, because after + * unlocking the range and releasing the path, if there's a hole + * between that end offset and this current offset, a new extent + * may have been inserted due to a new write, so we don't want + * to miss it. + */ + entry = &cache->entries[cache->entries_size - 1]; + cache->next_search_offset = entry->offset + entry->log_len; + cache->cached = false; + + return BTRFS_FIEMAP_FLUSH_CACHE; + } + + entry = &cache->entries[cache->entries_pos]; + entry->offset = cache->offset; + entry->phys = cache->phys; + entry->log_len = cache->log_len; + entry->phys_len = cache->phys_len; + entry->flags = cache->flags; + cache->entries_pos++; + cache->extents_mapped++; + + if (cache->extents_mapped == fieinfo->fi_extents_max) { + cache->cached = false; + return 1; + } assign: cache->cached = true; cache->offset = offset; cache->phys = phys; - cache->len = len; + cache->log_len = log_len; + cache->phys_len = phys_len; cache->flags = flags; return 0; @@ -2642,7 +2801,8 @@ static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo, return 0; ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, - cache->len, cache->flags); + cache->log_len, cache->phys_len, + cache->flags); cache->cached = false; if (ret > 0) ret = 0; @@ -2651,7 +2811,7 @@ static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *p { - struct extent_buffer *clone; + struct extent_buffer *clone = path->nodes[0]; struct btrfs_key key; int slot; int ret; @@ -2660,29 +2820,45 @@ static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *p if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) return 0; + /* + * Add a temporary extra ref to an already cloned extent buffer to + * prevent btrfs_next_leaf() freeing it, as we want to reuse it to avoid + * the cost of allocating a new one.
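+	 * On failure the extra reference is dropped at the out label below; on
+	 * success it is handed back to the path when the clone is reinstalled
+	 * as path->nodes[0].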
+ */ + ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &clone->bflags)); + atomic_inc(&clone->refs); + ret = btrfs_next_leaf(inode->root, path); if (ret != 0) - return ret; + goto out; /* * Don't bother with cloning if there are no more file extent items for * our inode. */ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) - return 1; + if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) { + ret = 1; + goto out; + } /* See the comment at fiemap_search_slot() about why we clone. */ - clone = btrfs_clone_extent_buffer(path->nodes[0]); - if (!clone) - return -ENOMEM; + copy_extent_buffer_full(clone, path->nodes[0]); + /* + * Important to preserve the start field, for the optimizations when + * checking if extents are shared (see extent_fiemap()). + */ + clone->start = path->nodes[0]->start; slot = path->slots[0]; btrfs_release_path(path); path->nodes[0] = clone; path->slots[0] = slot; +out: + if (ret) + free_extent_buffer(clone); - return 0; + return ret; } /* @@ -2737,8 +2913,8 @@ static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path * neighbour leaf). * We also need the private clone because holding a read lock on an * extent buffer of the subvolume's b+tree will make lockdep unhappy - * when we call fiemap_fill_next_extent(), because that may cause a page - * fault when filling the user space buffer with fiemap data. + * when we check if extents are shared, as backref walking may need to + * lock the same leaf we are processing. */ clone = btrfs_clone_extent_buffer(path->nodes[0]); if (!clone) @@ -2778,34 +2954,16 @@ static int fiemap_process_hole(struct btrfs_inode *inode, * it beyond i_size. */ while (cur_offset < end && cur_offset < i_size) { - struct extent_state *cached_state = NULL; u64 delalloc_start; u64 delalloc_end; u64 prealloc_start; - u64 lockstart; - u64 lockend; u64 prealloc_len = 0; bool delalloc; - lockstart = round_down(cur_offset, inode->root->fs_info->sectorsize); - lockend = round_up(end, inode->root->fs_info->sectorsize); - - /* - * We are only locking for the delalloc range because that's the - * only thing that can change here. With fiemap we have a lock - * on the inode, so no buffered or direct writes can happen. - * - * However mmaps and normal page writeback will cause this to - * change arbitrarily. We have to lock the extent lock here to - * make sure that nobody messes with the tree while we're doing - * btrfs_find_delalloc_in_range. 
- */ - lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end, delalloc_cached_state, &delalloc_start, &delalloc_end); - unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); if (!delalloc) break; @@ -2838,13 +2996,15 @@ static int fiemap_process_hole(struct btrfs_inode *inode, } ret = emit_fiemap_extent(fieinfo, cache, prealloc_start, disk_bytenr + extent_offset, - prealloc_len, prealloc_flags); + prealloc_len, prealloc_len, + prealloc_flags); if (ret) return ret; extent_offset += prealloc_len; } ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0, + delalloc_end + 1 - delalloc_start, delalloc_end + 1 - delalloc_start, FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN); @@ -2885,7 +3045,8 @@ static int fiemap_process_hole(struct btrfs_inode *inode, } ret = emit_fiemap_extent(fieinfo, cache, prealloc_start, disk_bytenr + extent_offset, - prealloc_len, prealloc_flags); + prealloc_len, prealloc_len, + prealloc_flags); if (ret) return ret; } @@ -2973,6 +3134,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { const u64 ino = btrfs_ino(inode); + struct extent_state *cached_state = NULL; struct extent_state *delalloc_cached_state = NULL; struct btrfs_path *path; struct fiemap_cache cache = { 0 }; @@ -2985,26 +3147,33 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, bool stopped = false; int ret; + cache.entries_size = PAGE_SIZE / sizeof(struct btrfs_fiemap_entry); + cache.entries = kmalloc_array(cache.entries_size, + sizeof(struct btrfs_fiemap_entry), + GFP_KERNEL); backref_ctx = btrfs_alloc_backref_share_check_ctx(); path = btrfs_alloc_path(); - if (!backref_ctx || !path) { + if (!cache.entries || !backref_ctx || !path) { ret = -ENOMEM; goto out; } +restart: range_start = round_down(start, sectorsize); range_end = round_up(start + len, sectorsize); prev_extent_end = range_start; + lock_extent(&inode->io_tree, range_start, range_end, &cached_state); + ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end); if (ret < 0) - goto out; + goto out_unlock; btrfs_release_path(path); path->reada = READA_FORWARD; ret = fiemap_search_slot(inode, path, range_start); if (ret < 0) { - goto out; + goto out_unlock; } else if (ret > 0) { /* * No file extent item found, but we may have delalloc between @@ -3023,6 +3192,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, u64 extent_offset = 0; u64 extent_gen; u64 disk_bytenr = 0; + u64 disk_size = 0; u64 flags = 0; int extent_type; u8 compression; @@ -3051,7 +3221,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, backref_ctx, 0, 0, 0, prev_extent_end, hole_end); if (ret < 0) { - goto out; + goto out_unlock; } else if (ret > 0) { /* fiemap_fill_next_extent() told us to stop. 
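 * (Return convention in this loop: 0 keeps going, 1 stops because the
 * maximum number of extents was reached, BTRFS_FIEMAP_FLUSH_CACHE makes
 * extent_fiemap() flush the buffered entries and restart the search, and
 * any other negative value is an error.)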
*/ stopped = true; @@ -3074,8 +3244,9 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, if (extent_type != BTRFS_FILE_EXTENT_INLINE) { disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); - if (compression == BTRFS_COMPRESS_NONE) + if (compression == BTRFS_COMPRESS_NONE) { extent_offset = btrfs_file_extent_offset(leaf, ei); + } } if (compression != BTRFS_COMPRESS_NONE) @@ -3085,7 +3256,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, flags |= FIEMAP_EXTENT_DATA_INLINE; flags |= FIEMAP_EXTENT_NOT_ALIGNED; ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0, - extent_len, flags); + extent_len, extent_len, flags); } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { ret = fiemap_process_hole(inode, fieinfo, &cache, &delalloc_cached_state, @@ -3100,6 +3271,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, backref_ctx, 0, 0, 0, key.offset, extent_end - 1); } else { + disk_size = btrfs_file_extent_disk_num_bytes(leaf, ei); /* We have a regular extent. */ if (fieinfo->fi_extents_max) { ret = btrfs_is_data_extent_shared(inode, @@ -3107,20 +3279,22 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, extent_gen, backref_ctx); if (ret < 0) - goto out; + goto out_unlock; else if (ret > 0) flags |= FIEMAP_EXTENT_SHARED; } ret = emit_fiemap_extent(fieinfo, &cache, key.offset, disk_bytenr + extent_offset, - extent_len, flags); + extent_len, + disk_size - extent_offset, + flags); } if (ret < 0) { - goto out; + goto out_unlock; } else if (ret > 0) { - /* fiemap_fill_next_extent() told us to stop. */ + /* emit_fiemap_extent() told us to stop. */ stopped = true; break; } @@ -3129,12 +3303,12 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, next_item: if (fatal_signal_pending(current)) { ret = -EINTR; - goto out; + goto out_unlock; } ret = fiemap_next_leaf_item(inode, path); if (ret < 0) { - goto out; + goto out_unlock; } else if (ret > 0) { /* No more file extent items for this inode. */ break; @@ -3143,51 +3317,29 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, } check_eof_delalloc: - /* - * Release (and free) the path before emitting any final entries to - * fiemap_fill_next_extent() to keep lockdep happy. This is because - * once we find no more file extent items exist, we may have a - * non-cloned leaf, and fiemap_fill_next_extent() can trigger page - * faults when copying data to the user space buffer. - */ - btrfs_free_path(path); - path = NULL; - if (!stopped && prev_extent_end < range_end) { ret = fiemap_process_hole(inode, fieinfo, &cache, &delalloc_cached_state, backref_ctx, 0, 0, 0, prev_extent_end, range_end - 1); if (ret < 0) - goto out; + goto out_unlock; prev_extent_end = range_end; } - if (cache.cached && cache.offset + cache.len >= last_extent_end) { + if (cache.cached && cache.offset + cache.log_len >= last_extent_end) { const u64 i_size = i_size_read(&inode->vfs_inode); if (prev_extent_end < i_size) { - struct extent_state *cached_state = NULL; u64 delalloc_start; u64 delalloc_end; - u64 lockstart; - u64 lockend; bool delalloc; - lockstart = round_down(prev_extent_end, sectorsize); - lockend = round_up(i_size, sectorsize); - - /* - * See the comment in fiemap_process_hole as to why - * we're doing the locking here. 
- */ - lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); delalloc = btrfs_find_delalloc_in_range(inode, prev_extent_end, i_size - 1, &delalloc_cached_state, &delalloc_start, &delalloc_end); - unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); if (!delalloc) cache.flags |= FIEMAP_EXTENT_LAST; } else { @@ -3195,9 +3347,39 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, } } +out_unlock: + unlock_extent(&inode->io_tree, range_start, range_end, &cached_state); + + if (ret == BTRFS_FIEMAP_FLUSH_CACHE) { + btrfs_release_path(path); + ret = flush_fiemap_cache(fieinfo, &cache); + if (ret) + goto out; + len -= cache.next_search_offset - start; + start = cache.next_search_offset; + goto restart; + } else if (ret < 0) { + goto out; + } + + /* + * Must free the path before emitting to the fiemap buffer because we + * may have a non-cloned leaf and if the fiemap buffer is memory mapped + * to a file, a write into it (through btrfs_page_mkwrite()) may trigger + * waiting for an ordered extent that, in order to complete, needs to + * modify that leaf, therefore leading to a deadlock. + */ + btrfs_free_path(path); + path = NULL; + + ret = flush_fiemap_cache(fieinfo, &cache); + if (ret) + goto out; + ret = emit_last_fiemap_cache(fieinfo, &cache); out: free_extent_state(delalloc_cached_state); + kfree(cache.entries); btrfs_free_backref_share_ctx(backref_ctx); btrfs_free_path(path); return ret; @@ -3361,7 +3543,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) */ set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); - ret = alloc_eb_folio_array(new, 0); + ret = alloc_eb_folio_array(new, 0, folio_order(src->folios[0])); if (ret) { btrfs_release_extent_buffer(new); return NULL; @@ -3395,7 +3577,7 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, if (!eb) return NULL; - ret = alloc_eb_folio_array(eb, 0); + ret = alloc_eb_folio_array(eb, 0, 0); if (ret) goto err; @@ -3608,6 +3790,18 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) return 0; } +/* + * A helper to free all eb folios; it should only be used in the eb allocation + * path, where we know all the folios are safe to be dropped. + */ +static void free_all_eb_folios(struct extent_buffer *eb) +{ + for (int i = 0; i < INLINE_EXTENT_BUFFER_PAGES; i++) { + if (eb->folios[i]) + folio_put(eb->folios[i]); + eb->folios[i] = NULL; + } +} /* * Return 0 if eb->folios[i] is attached to btree inode successfully. @@ -3624,7 +3818,10 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, struct btrfs_fs_info *fs_info = eb->fs_info; struct address_space *mapping = fs_info->btree_inode->i_mapping; const unsigned long index = eb->start >> PAGE_SHIFT; + struct extent_buffer *existing_eb; struct folio *existing_folio; + int eb_order = folio_order(eb->folios[0]); + int existing_order; int ret; ASSERT(found_eb_ret); if (IS_ERR(existing_folio)) goto retry; - /* For now, we should only have single-page folios for btree inode. */ - ASSERT(folio_nr_pages(existing_folio) == 1); + existing_order = folio_order(existing_folio); + if (fs_info->nodesize < PAGE_SIZE) { + /* + * We're going to reuse the existing page, can drop our page + * and subpage structure now.
+ */ + folio_put(eb->folios[i]); + eb->folios[i] = existing_folio; + return 0; + } - if (folio_size(existing_folio) != folio_size(eb->folios[0])) { + /* Non-subpage case, check if we can grab the eb from the existing folio. */ + existing_eb = grab_extent_buffer(fs_info, + folio_page(existing_folio, 0)); + if (existing_eb) { + /* + * The extent buffer still exists, we can use + * it directly. + */ + *found_eb_ret = existing_eb; folio_unlock(existing_folio); folio_put(existing_folio); - return -EAGAIN; + return 1; } - - if (fs_info->nodesize < PAGE_SIZE) { + if (existing_order > eb_order) { /* - * We're going to reuse the existing page, can drop our page - * and subpage structure now. + * The existing one has a higher order, so we need to drop + * ALL eb folios before reusing it. + * This can only happen for the first folio. */ - __free_page(folio_page(eb->folios[i], 0)); + ASSERT(i == 0); + free_all_eb_folios(eb); eb->folios[i] = existing_folio; - } else { - struct extent_buffer *existing_eb; - - existing_eb = grab_extent_buffer(fs_info, - folio_page(existing_folio, 0)); - if (existing_eb) { - /* The extent buffer still exists, we can use it directly. */ - *found_eb_ret = existing_eb; - folio_unlock(existing_folio); - folio_put(existing_folio); - return 1; - } - /* The extent buffer no longer exists, we can reuse the folio. */ - __free_page(folio_page(eb->folios[i], 0)); + } else if (existing_order == eb_order) { + /* + * Can safely reuse the filemap folio, just + * release the eb one. + */ + folio_put(eb->folios[i]); eb->folios[i] = existing_folio; + } else if (existing_order < eb_order) { + /* + * The existing one has a lower order (page based) + * while we have a better, higher order eb. + * + * In theory we should be able to drop all the + * lower order folios in filemap and replace them + * with our better one. + * But we cannot, as the existing one still has + * private set. + * So here we force a fallback to an order-0 folio + * and retry. + */ + ASSERT(i == 0); + folio_unlock(existing_folio); + folio_put(existing_folio); + return -EAGAIN; } return 0; } @@ -3690,6 +3913,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, struct btrfs_subpage *prealloc = NULL; u64 lockdep_owner = owner_root; bool page_contig = true; + int order = 0; int uptodate = 1; int ret; @@ -3707,6 +3931,10 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, btrfs_warn_32bit_limit(fs_info); #endif + if (fs_info->nodesize > PAGE_SIZE && + IS_ALIGNED(start, fs_info->nodesize)) + order = ilog2(fs_info->nodesize >> PAGE_SHIFT); + eb = find_extent_buffer(fs_info, start); if (eb) return eb; @@ -3741,7 +3969,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, reallocate: /* Allocate all pages first. */ - ret = alloc_eb_folio_array(eb, __GFP_NOFAIL); + ret = alloc_eb_folio_array(eb, __GFP_NOFAIL, order); if (ret < 0) { btrfs_free_subpage(prealloc); goto out; @@ -3759,26 +3987,14 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, } /* - * TODO: Special handling for a corner case where the order of - * folios mismatch between the new eb and filemap. - * - * This happens when: - * - * - the new eb is using higher order folio - * - * - the filemap is still using 0-order folios for the range - * This can happen at the previous eb allocation, and we don't - * have higher order folio for the call. - * - * - the existing eb has already been freed - * - * In this case, we have to free the existing folios first, and - * re-allocate using the same order. - * Thankfully this is not going to happen yet, as we're still - * using 0-order folios. + * This happens when we get a higher order (better) folio, but + * the filemap still has lower order (single page) folios. + * We don't have a good way to replace them yet, thus we have + * to retry with a lower order (0) folio. */ if (unlikely(ret == -EAGAIN)) { - ASSERT(0); + order = 0; + free_all_eb_folios(eb); goto reallocate; } attached++; @@ -3789,6 +4005,16 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, * and free the allocated page. */ folio = eb->folios[i]; + eb->folio_size = folio_size(folio); + eb->folio_shift = folio_shift(folio); + + /* + * We may have changed from single page folios to larger + * folios from the filemap. + * Re-calculate num_folios. + */ + num_folios = num_extent_folios(eb); + spin_lock(&mapping->i_private_lock); /* Should not fail, as we have preallocated the memory */ ret = attach_extent_buffer_folio(eb, folio, prealloc); @@ -4238,7 +4464,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, for (int i = 0; i < num_folios; i++) { struct folio *folio = eb->folios[i]; - ret = bio_add_folio(&bbio->bio, folio, folio_size(folio), 0); + ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0); ASSERT(ret); } } @@ -4258,7 +4484,7 @@ static bool report_eb_range(const struct extent_buffer *eb, unsigned long start, unsigned long len) { btrfs_warn(eb->fs_info, - "access to eb bytenr %llu len %lu out of range start %lu len %lu", + "access to eb bytenr %llu len %u out of range start %lu len %lu", eb->start, eb->len, start, len); WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); @@ -4287,7 +4513,7 @@ static inline int check_eb_range(const struct extent_buffer *eb, void read_extent_buffer(const struct extent_buffer *eb, void *dstv, unsigned long start, unsigned long len) { - const int unit_size = folio_size(eb->folios[0]); + const int unit_size = eb->folio_size; size_t cur; size_t offset; char *dst = (char *)dstv; @@ -4327,7 +4553,7 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, void __user *dstv, unsigned long start, unsigned long len) { - const int unit_size = folio_size(eb->folios[0]); + const int unit_size = eb->folio_size; size_t cur; size_t offset; char __user *dst = (char __user *)dstv; @@ -4367,7 +4593,7 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, unsigned long start, unsigned long len) { - const int unit_size = folio_size(eb->folios[0]); + const int unit_size = eb->folio_size; size_t cur; size_t offset; char *kaddr; @@ -4438,7 +4664,7 @@ static void __write_extent_buffer(const struct extent_buffer *eb, const void *srcv, unsigned long start, unsigned long len, bool use_memmove) { - const int unit_size = folio_size(eb->folios[0]); + const int unit_size = eb->folio_size; size_t cur; size_t offset; char *kaddr; @@ -4487,7 +4713,7 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, static void memset_extent_buffer(const struct extent_buffer *eb, int c, unsigned long start, unsigned long len) { - const int unit_size = folio_size(eb->folios[0]); + const int unit_size = eb->folio_size; unsigned long cur = start; if (eb->addr) { @@ -4518,7 +4744,7 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
void copy_extent_buffer_full(const struct extent_buffer *dst, const struct extent_buffer *src) { - const int unit_size = folio_size(src->folios[0]); + const int unit_size = src->folio_size; unsigned long cur = 0; ASSERT(dst->len == src->len); @@ -4540,7 +4766,7 @@ void copy_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { - const int unit_size = folio_size(dst->folios[0]); + const int unit_size = dst->folio_size; u64 dst_len = dst->len; size_t cur; size_t offset; @@ -4596,10 +4822,10 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb, * the bitmap item in the extent buffer + the offset of the byte in the * bitmap item. */ - offset = start + offset_in_folio(eb->folios[0], eb->start) + byte_offset; + offset = start + offset_in_eb_folio(eb, eb->start) + byte_offset; - *folio_index = offset >> folio_shift(eb->folios[0]); - *folio_offset = offset_in_folio(eb->folios[0], offset); + *folio_index = offset >> eb->folio_shift; + *folio_offset = offset_in_eb_folio(eb, offset); } /* @@ -4713,7 +4939,7 @@ void memcpy_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { - const int unit_size = folio_size(dst->folios[0]); + const int unit_size = dst->folio_size; unsigned long cur_off = 0; if (check_eb_range(dst, dst_offset, len) || @@ -4837,7 +5063,7 @@ static struct extent_buffer *get_next_extent_buffer( static int try_release_subpage_extent_buffer(struct page *page) { - struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + struct btrfs_fs_info *fs_info = page_to_fs_info(page); u64 cur = page_offset(page); const u64 end = page_offset(page) + PAGE_SIZE; int ret; @@ -4910,7 +5136,7 @@ int try_release_extent_buffer(struct page *page) struct folio *folio = page_folio(page); struct extent_buffer *eb; - if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) + if (page_to_fs_info(page)->nodesize < PAGE_SIZE) return try_release_subpage_extent_buffer(page); /* diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 46050500529bff..ab5f7df29f120d 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -7,11 +7,32 @@ #include #include #include +#include +#include +#include +#include +#include #include "compression.h" +#include "messages.h" #include "ulist.h" #include "misc.h" +struct page; +struct file; +struct folio; +struct inode; +struct fiemap_extent_info; +struct readahead_control; +struct address_space; +struct writeback_control; +struct extent_io_tree; +struct extent_map_tree; +struct btrfs_block_group; +struct btrfs_fs_info; +struct btrfs_inode; +struct btrfs_root; struct btrfs_trans_handle; +struct btrfs_tree_parent_check; enum { EXTENT_BUFFER_UPTODATE, @@ -63,11 +84,6 @@ enum { #define BITMAP_LAST_BYTE_MASK(nbits) \ (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) -struct btrfs_root; -struct btrfs_inode; -struct btrfs_fs_info; -struct extent_io_tree; -struct btrfs_tree_parent_check; int __init extent_buffer_init_cachep(void); void __cold extent_buffer_free_cachep(void); @@ -75,7 +91,8 @@ void __cold extent_buffer_free_cachep(void); #define INLINE_EXTENT_BUFFER_PAGES (BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE) struct extent_buffer { u64 start; - unsigned long len; + u32 len; + u32 folio_size; unsigned long bflags; struct btrfs_fs_info *fs_info; @@ -90,6 +107,7 @@ struct extent_buffer { int read_mirror; /* >= 0 if eb belongs to a log tree, -1 otherwise */ s8 log_index; + u8 folio_shift; struct rcu_head rcu_head; 
struct rw_semaphore lock; @@ -113,6 +131,13 @@ struct btrfs_eb_write_context { struct btrfs_block_group *zoned_bg; }; +static inline unsigned long offset_in_eb_folio(const struct extent_buffer *eb, + u64 start) +{ + ASSERT(eb->folio_size); + return start & (eb->folio_size - 1); +} + /* * Get the correct offset inside the page of extent buffer. * @@ -151,13 +176,13 @@ static inline unsigned long get_eb_folio_index(const struct extent_buffer *eb, * the folio_shift would be large enough to always make us * return 0 as index. * 1.2) Several page sized folios - * The folio_shift() would be PAGE_SHIFT, giving us the correct + * The folio_shift would be PAGE_SHIFT, giving us the correct * index. * * 2) sectorsize < PAGE_SIZE and nodesize < PAGE_SIZE case * The folio would only be page sized, and always give us 0 as index. */ - return offset >> folio_shift(eb->folios[0]); + return offset >> eb->folio_shift; } /* @@ -205,8 +230,6 @@ static inline void extent_changeset_free(struct extent_changeset *changeset) kfree(changeset); } -struct extent_map_tree; - int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); @@ -221,6 +244,7 @@ int btree_write_cache_pages(struct address_space *mapping, void extent_readahead(struct readahead_control *rac); int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); +int set_folio_extent_mapped(struct folio *folio); int set_page_extent_mapped(struct page *page); void clear_page_extent_mapped(struct page *page); @@ -337,6 +361,8 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, gfp_t extra_gfp); +int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array, + gfp_t extra_gfp); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS bool find_lock_delalloc_range(struct inode *inode, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index b61099bf97a824..347ca13d15a975 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -5,7 +5,6 @@ #include #include "messages.h" #include "ctree.h" -#include "volumes.h" #include "extent_map.h" #include "compression.h" #include "btrfs_inode.h" @@ -16,8 +15,7 @@ static struct kmem_cache *extent_map_cache; int __init extent_map_init(void) { extent_map_cache = kmem_cache_create("btrfs_extent_map", - sizeof(struct extent_map), 0, - SLAB_MEM_SPREAD, NULL); + sizeof(struct extent_map), 0, 0, NULL); if (!extent_map_cache) return -ENOMEM; return 0; @@ -291,6 +289,10 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) * Called after an extent has been written to disk properly. Set the generation * to the generation that actually added the file item to the inode so we know * we need to sync this extent when we call fsync(). 
+ * + * Returns: 0 on success + * -ENOENT when the extent is not found in the tree + * -EUCLEAN if the found extent does not match the expected start */ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) { @@ -308,14 +310,18 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) "no extent map found for inode %llu (root %lld) when unpinning extent range [%llu, %llu), generation %llu", btrfs_ino(inode), btrfs_root_id(inode->root), start, len, gen); + ret = -ENOENT; goto out; } - if (WARN_ON(em->start != start)) + if (WARN_ON(em->start != start)) { btrfs_warn(fs_info, "found extent map for inode %llu (root %lld) with unexpected start offset %llu when unpinning extent range [%llu, %llu), generation %llu", btrfs_ino(inode), btrfs_root_id(inode->root), em->start, start, len, gen); + ret = -EUCLEAN; + goto out; + } em->generation = gen; em->flags &= ~EXTENT_FLAG_PINNED; @@ -531,7 +537,8 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, u64 end; u64 start_diff; - BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); + if (map_start < em->start || map_start >= extent_map_end(em)) + return -EINVAL; if (existing->start > map_start) { next = existing; @@ -626,9 +633,9 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, free_extent_map(em); *em_in = NULL; WARN_ONCE(ret, -"unexpected error %d: merge existing(start %llu len %llu) with em(start %llu len %llu)\n", - ret, existing->start, existing->len, - orig_start, orig_len); +"extent map merge error existing [%llu, %llu) with em [%llu, %llu) start %llu\n", + existing->start, existing->len, + orig_start, orig_len, start); } free_extent_map(existing); } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index e380fc08bbe453..c5a098c99cc6e2 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -3,10 +3,17 @@ #ifndef BTRFS_EXTENT_MAP_H #define BTRFS_EXTENT_MAP_H +#include +#include #include +#include #include +#include "misc.h" #include "compression.h" +struct btrfs_inode; +struct btrfs_fs_info; + #define EXTENT_MAP_LAST_BYTE ((u64)-4) #define EXTENT_MAP_HOLE ((u64)-3) #define EXTENT_MAP_INLINE ((u64)-2) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 81ac1d474bf183..e58fb5347e65ee 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -10,17 +10,14 @@ #include #include #include "messages.h" -#include "misc.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "bio.h" -#include "print-tree.h" #include "compression.h" #include "fs.h" #include "accessors.h" #include "file-item.h" -#include "super.h" #define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) * 2) / \ @@ -179,7 +176,6 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, sizeof(*item)); if (ret < 0) goto out; - BUG_ON(ret); /* Can't happen */ leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -1229,8 +1225,6 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, ins_size); if (ret < 0) goto out; - if (WARN_ON(ret != 0)) - goto out; leaf = path->nodes[0]; csum: item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 04bd2d34efb14b..15c05cc0fce60e 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -3,8 +3,20 @@ #ifndef BTRFS_FILE_ITEM_H #define BTRFS_FILE_ITEM_H +#include +#include #include "accessors.h" +struct extent_map; +struct btrfs_file_extent_item; +struct btrfs_fs_info; +struct btrfs_path; +struct btrfs_bio; +struct btrfs_trans_handle; +struct btrfs_root; +struct btrfs_ordered_sum; +struct btrfs_inode; + #define BTRFS_FILE_EXTENT_INLINE_DATA_START \ (offsetof(struct btrfs_file_extent_item, disk_bytenr)) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 38dfcac4760990..f9d76072398da5 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -22,10 +22,8 @@ #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "print-tree.h" #include "tree-log.h" #include "locking.h" -#include "volumes.h" #include "qgroup.h" #include "compression.h" #include "delalloc-space.h" @@ -1137,7 +1135,7 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); loff_t pos = iocb->ki_pos; int ret; loff_t oldsize; @@ -1185,7 +1183,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, struct file *file = iocb->ki_filp; loff_t pos; struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct page **pages = NULL; struct extent_changeset *data_reserved = NULL; u64 release_bytes = 0; @@ -1461,7 +1459,7 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); loff_t pos; ssize_t written = 0; ssize_t written_buffered; @@ -1787,7 +1785,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { struct dentry *dentry = file_dentry(file); struct inode *inode = d_inode(dentry); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; struct btrfs_log_ctx ctx; @@ -1912,6 +1910,8 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out_release_extents; } + btrfs_init_log_ctx_scratch_eb(&ctx); + /* * We use start here because we will need to wait on the IO to complete * in btrfs_sync_log, which could require joining a transaction (for @@ -1931,6 +1931,15 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trans->in_fsync = true; ret = btrfs_log_dentry_safe(trans, dentry, &ctx); + /* + * Scratch eb no longer needed, release it before syncing the log or + * committing the transaction, to avoid holding unnecessary memory + * during such long operations. + */ + if (ctx.scratch_eb) { + free_extent_buffer(ctx.scratch_eb); + ctx.scratch_eb = NULL; + } btrfs_release_log_ctx_extents(&ctx); if (ret < 0) { /* Fallthrough and commit/free transaction.
*/ @@ -2006,6 +2015,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = btrfs_commit_transaction(trans); out: + free_extent_buffer(ctx.scratch_eb); ASSERT(list_empty(&ctx.list)); ASSERT(list_empty(&ctx.conflict_inodes)); err = file_check_and_advance_wb_err(file); @@ -2176,7 +2186,7 @@ static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len) struct extent_map *em; int ret = 0; - em = btrfs_get_extent(inode, NULL, 0, + em = btrfs_get_extent(inode, NULL, round_down(*start, fs_info->sectorsize), round_up(*len, fs_info->sectorsize)); if (IS_ERR(em)) @@ -2593,7 +2603,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_state *cached_state = NULL; struct btrfs_path *path; @@ -2835,7 +2845,7 @@ static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode, int ret; offset = round_down(offset, sectorsize); - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize); + em = btrfs_get_extent(inode, NULL, offset, sectorsize); if (IS_ERR(em)) return PTR_ERR(em); @@ -2866,7 +2876,7 @@ static int btrfs_zero_range(struct inode *inode, u64 bytes_to_reserve = 0; bool space_reserved = false; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start, + em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, alloc_end - alloc_start); if (IS_ERR(em)) { ret = PTR_ERR(em); @@ -2909,8 +2919,7 @@ static int btrfs_zero_range(struct inode *inode, if (BTRFS_BYTES_TO_BLKS(fs_info, offset) == BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) { - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start, - sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, sectorsize); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; @@ -3005,7 +3014,7 @@ static int btrfs_zero_range(struct inode *inode, } ret = btrfs_prealloc_file_range(inode, mode, alloc_start, alloc_end - alloc_start, - i_blocksize(inode), + fs_info->sectorsize, offset + len, &alloc_hint); unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); @@ -3049,7 +3058,7 @@ static long btrfs_fallocate(struct file *file, int mode, int ret; /* Do not allow fallocate in ZONED mode */ - if (btrfs_is_zoned(btrfs_sb(inode->i_sb))) + if (btrfs_is_zoned(inode_to_fs_info(inode))) return -EOPNOTSUPP; alloc_start = round_down(offset, blocksize); @@ -3126,7 +3135,7 @@ static long btrfs_fallocate(struct file *file, int mode, /* First, check if we exceed the qgroup limit */ while (cur_offset < alloc_end) { - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, + em = btrfs_get_extent(BTRFS_I(inode), NULL, cur_offset, alloc_end - cur_offset); if (IS_ERR(em)) { ret = PTR_ERR(em); @@ -3177,7 +3186,7 @@ static long btrfs_fallocate(struct file *file, int mode, if (!ret) { ret = btrfs_prealloc_file_range(inode, mode, range->start, - range->len, i_blocksize(inode), + range->len, blocksize, offset + len, &alloc_hint); /* * btrfs_prealloc_file_range() releases space even @@ -3754,7 +3763,7 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) if (fsverity_active(inode)) return 0; - if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos)) + if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos)) return 0; btrfs_inode_lock(BTRFS_I(inode), 
BTRFS_ILOCK_SHARED); diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h index 82b34fbb295f27..77aaca208c7bce 100644 --- a/fs/btrfs/file.h +++ b/fs/btrfs/file.h @@ -3,6 +3,21 @@ #ifndef BTRFS_FILE_H #define BTRFS_FILE_H +#include + +struct file; +struct extent_state; +struct kiocb; +struct iov_iter; +struct page; +struct btrfs_ioctl_encoded_io_args; +struct btrfs_drop_extents_args; +struct btrfs_inode; +struct btrfs_root; +struct btrfs_path; +struct btrfs_replace_extent_info; +struct btrfs_trans_handle; + extern const struct file_operations btrfs_file_operations; int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index d372c7ce0e6b43..c8a05d5eb9cbc5 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -19,9 +19,7 @@ #include "transaction.h" #include "disk-io.h" #include "extent_io.h" -#include "volumes.h" #include "space-info.h" -#include "delalloc-space.h" #include "block-group.h" #include "discard.h" #include "subpage.h" @@ -399,7 +397,7 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, return -ENOMEM; io_ctl->num_pages = num_pages; - io_ctl->fs_info = btrfs_sb(inode->i_sb); + io_ctl->fs_info = inode_to_fs_info(inode); io_ctl->inode = inode; return 0; @@ -2621,7 +2619,7 @@ static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl, } } -int __btrfs_add_free_space(struct btrfs_block_group *block_group, +static int __btrfs_add_free_space(struct btrfs_block_group *block_group, u64 offset, u64 bytes, enum btrfs_trim_state trim_state) { @@ -4156,15 +4154,13 @@ int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool act int __init btrfs_free_space_init(void) { - btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", - sizeof(struct btrfs_free_space), 0, - SLAB_MEM_SPREAD, NULL); + btrfs_free_space_cachep = KMEM_CACHE(btrfs_free_space, 0); if (!btrfs_free_space_cachep) return -ENOMEM; btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", PAGE_SIZE, PAGE_SIZE, - SLAB_MEM_SPREAD, NULL); + 0, NULL); if (!btrfs_free_space_bitmap_cachep) { kmem_cache_destroy(btrfs_free_space_cachep); return -ENOMEM; diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 33b4da3271b1be..83774bfd7b3bb0 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -6,6 +6,19 @@ #ifndef BTRFS_FREE_SPACE_CACHE_H #define BTRFS_FREE_SPACE_CACHE_H +#include +#include +#include +#include +#include "fs.h" + +struct inode; +struct page; +struct btrfs_fs_info; +struct btrfs_path; +struct btrfs_trans_handle; +struct btrfs_trim_block_group; + /* * This is the trim state of an extent or bitmap. 
* @@ -114,8 +127,6 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans, void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, struct btrfs_free_space_ctl *ctl); -int __btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, - u64 size, enum btrfs_trim_state trim_state); int btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size); int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 7b598b070700e7..90f2938bd743d3 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1176,12 +1176,16 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) BTRFS_FREE_SPACE_TREE_OBJECTID); if (IS_ERR(free_space_root)) { ret = PTR_ERR(free_space_root); - goto abort; + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + goto out_clear; } ret = btrfs_global_root_insert(free_space_root); if (ret) { btrfs_put_root(free_space_root); - goto abort; + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + goto out_clear; } node = rb_first_cached(&fs_info->block_group_cache_tree); @@ -1189,8 +1193,11 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) block_group = rb_entry(node, struct btrfs_block_group, cache_node); ret = populate_free_space_tree(trans, block_group); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + goto out_clear; + } node = rb_next(node); } @@ -1206,11 +1213,9 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); return ret; -abort: +out_clear: clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); return ret; } @@ -1273,12 +1278,18 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info) btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); ret = clear_free_space_tree(trans, free_space_root); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } ret = btrfs_del_root(trans, &free_space_root->root_key); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } btrfs_global_root_delete(free_space_root); @@ -1295,11 +1306,6 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info) btrfs_put_root(free_space_root); return btrfs_commit_transaction(trans); - -abort: - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - return ret; } int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) @@ -1322,8 +1328,11 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); ret = clear_free_space_tree(trans, free_space_root); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } node = rb_first_cached(&fs_info->block_group_cache_tree); while (node) { @@ -1332,8 +1341,11 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) block_group = rb_entry(node, struct btrfs_block_group, cache_node); ret = populate_free_space_tree(trans, block_group); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + 
return ret; + } node = rb_next(node); } @@ -1344,10 +1356,6 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) ret = btrfs_commit_transaction(trans); clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); return ret; -abort: - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - return ret; } static int __add_block_group_free_space(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h index 6d5551d0ced810..e6c6d6f4f2210a 100644 --- a/fs/btrfs/free-space-tree.h +++ b/fs/btrfs/free-space-tree.h @@ -6,7 +6,13 @@ #ifndef BTRFS_FREE_SPACE_TREE_H #define BTRFS_FREE_SPACE_TREE_H +#include + struct btrfs_caching_control; +struct btrfs_fs_info; +struct btrfs_path; +struct btrfs_block_group; +struct btrfs_trans_handle; /* * The default size for new free space bitmap items. The last bitmap in a block diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index f8bb73d6ab68c4..72e7e516844fa1 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -4,13 +4,49 @@ #define BTRFS_FS_H #include -#include -#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include "extent-io-tree.h" -#include "extent_map.h" #include "async-thread.h" #include "block-rsv.h" + +struct inode; +struct super_block; +struct kobject; +struct reloc_control; +struct crypto_shash; +struct ulist; +struct btrfs_device; +struct btrfs_block_group; +struct btrfs_root; +struct btrfs_fs_devices; +struct btrfs_transaction; +struct btrfs_delayed_root; +struct btrfs_balance_control; +struct btrfs_subpage_info; +struct btrfs_stripe_hash_table; +struct btrfs_space_info; #define BTRFS_MAX_EXTENT_SIZE SZ_128M @@ -732,10 +769,13 @@ struct btrfs_fs_info { /* Reclaim partially filled block groups in the background */ struct work_struct reclaim_bgs_work; + /* Protected by unused_bgs_lock. */ struct list_head reclaim_bgs; int bg_reclaim_threshold; + /* Protects the lists unused_bgs and reclaim_bgs. */ spinlock_t unused_bgs_lock; + /* Protected by unused_bgs_lock. */ struct list_head unused_bgs; struct mutex unused_bg_unpin_mutex; /* Protect block groups that are going to be deleted */ @@ -750,6 +790,16 @@ struct btrfs_fs_info { u32 csums_per_leaf; u32 stripesize; + /* + * For future subpage and multipage sectorsize support. + * + * For subpage, all of our data folios would still be PAGE_SIZE. + * But for multipage, those data folios would be sector sized. + * This is the cached result for the read/write path to utilize. + */ + u32 folio_size; + u32 folio_shift; + /* * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular * filesystem, on zoned it depends on the device constraints.
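The cached folio_size and folio_shift above follow the same pattern this series applies to struct extent_buffer: compute the folio geometry once, so hot paths do not have to call folio_size()/folio_shift() on a folio pointer each time. A minimal sketch of the kind of consumer these fields would enable, with a hypothetical helper name (the series itself only introduces the fields at this point):

	/*
	 * Hypothetical consumer of the cached geometry: map a file offset to
	 * a data folio index without dereferencing any struct folio.
	 */
	static unsigned long btrfs_data_folio_index(const struct btrfs_fs_info *fs_info,
						    u64 file_offset)
	{
		return file_offset >> fs_info->folio_shift;
	}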
@@ -829,6 +879,17 @@ struct btrfs_fs_info { #endif }; +#define page_to_inode(_page) (BTRFS_I(_Generic((_page), \ + struct page *: (_page))->mapping->host)) +#define folio_to_inode(_folio) (BTRFS_I(_Generic((_folio), \ + struct folio *: (_folio))->mapping->host)) + +#define page_to_fs_info(_page) (page_to_inode(_page)->root->fs_info) +#define folio_to_fs_info(_folio) (folio_to_inode(_folio)->root->fs_info) + +#define inode_to_fs_info(_inode) (BTRFS_I(_Generic((_inode), \ + struct inode *: (_inode)))->root->fs_info) + static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info) { return READ_ONCE(fs_info->generation); @@ -922,6 +983,8 @@ void btrfs_exclop_finish(struct btrfs_fs_info *fs_info); void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation op); +int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args); + /* Compatibility and incompatibility defines */ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, const char *name); diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 7d734830e514eb..9c1394c0a6d72d 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -9,7 +9,6 @@ #include "inode-item.h" #include "disk-io.h" #include "transaction.h" -#include "print-tree.h" #include "space-info.h" #include "accessors.h" #include "extent-tree.h" diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h index 4337bb26f419b7..c4aded82709b1a 100644 --- a/fs/btrfs/inode-item.h +++ b/fs/btrfs/inode-item.h @@ -6,14 +6,15 @@ #include #include +struct fscrypt_str; +struct extent_buffer; struct btrfs_trans_handle; struct btrfs_root; struct btrfs_path; struct btrfs_key; struct btrfs_inode_extref; struct btrfs_inode; -struct extent_buffer; -struct fscrypt_str; +struct btrfs_truncate_control; /* * Return this if we need to call truncate_block for the last bit of the diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4795738d5785bc..7ca476210fab7e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -39,14 +39,12 @@ #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "print-tree.h" #include "ordered-data.h" #include "xattr.h" #include "tree-log.h" #include "bio.h" #include "compression.h" #include "locking.h" -#include "free-space-cache.h" #include "props.h" #include "qgroup.h" #include "delalloc-space.h" @@ -514,12 +512,13 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, bool extent_inserted, size_t size, size_t compressed_size, int compress_type, - struct page **compressed_pages, + struct folio *compressed_folio, bool update_i_size) { struct btrfs_root *root = inode->root; struct extent_buffer *leaf; struct page *page = NULL; + const u32 sectorsize = trans->fs_info->sectorsize; char *kaddr; unsigned long ptr; struct btrfs_file_extent_item *ei; @@ -527,10 +526,23 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, size_t cur_size = size; u64 i_size; - ASSERT((compressed_size > 0 && compressed_pages) || - (compressed_size == 0 && !compressed_pages)); + /* + * The decompressed size must still be no larger than a sector. + * Under a heavy race, we can have size == 0 passed in, but that + * shouldn't be a big deal and we can continue the insertion. + */ + ASSERT(size <= sectorsize); - if (compressed_size && compressed_pages) + /* + * The compressed size also needs to be no larger than a sector. + * That's also why we only need one page as the parameter.
+ */ + if (compressed_folio) + ASSERT(compressed_size <= sectorsize); + else + ASSERT(compressed_size == 0); + + if (compressed_size && compressed_folio) cur_size = compressed_size; if (!extent_inserted) { @@ -558,21 +570,10 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, ptr = btrfs_file_extent_inline_start(ei); if (compress_type != BTRFS_COMPRESS_NONE) { - struct page *cpage; - int i = 0; - while (compressed_size > 0) { - cpage = compressed_pages[i]; - cur_size = min_t(unsigned long, compressed_size, - PAGE_SIZE); - - kaddr = kmap_local_page(cpage); - write_extent_buffer(leaf, kaddr, ptr, cur_size); - kunmap_local(kaddr); + kaddr = kmap_local_folio(compressed_folio, 0); + write_extent_buffer(leaf, kaddr, ptr, compressed_size); + kunmap_local(kaddr); - i++; - ptr += cur_size; - compressed_size -= cur_size; - } btrfs_set_file_extent_compression(leaf, ei, compress_type); } else { @@ -622,7 +623,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size, size_t compressed_size, int compress_type, - struct page **compressed_pages, + struct folio *compressed_folio, bool update_i_size) { struct btrfs_drop_extents_args drop_args = { 0 }; @@ -670,7 +671,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size, ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted, size, compressed_size, compress_type, - compressed_pages, update_i_size); + compressed_folio, update_i_size); if (ret && ret != -ENOSPC) { btrfs_abort_transaction(trans, ret); goto out; @@ -707,8 +708,8 @@ struct async_extent { u64 start; u64 ram_size; u64 compressed_size; - struct page **pages; - unsigned long nr_pages; + struct folio **folios; + unsigned long nr_folios; int compress_type; struct list_head list; }; @@ -733,19 +734,20 @@ struct async_cow { static noinline int add_async_extent(struct async_chunk *cow, u64 start, u64 ram_size, u64 compressed_size, - struct page **pages, - unsigned long nr_pages, + struct folio **folios, + unsigned long nr_folios, int compress_type) { struct async_extent *async_extent; async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); - BUG_ON(!async_extent); /* -ENOMEM */ + if (!async_extent) + return -ENOMEM; async_extent->start = start; async_extent->ram_size = ram_size; async_extent->compressed_size = compressed_size; - async_extent->pages = pages; - async_extent->nr_pages = nr_pages; + async_extent->folios = folios; + async_extent->nr_folios = nr_folios; async_extent->compress_type = compress_type; list_add_tail(&async_extent->list, &cow->extents); return 0; @@ -849,8 +851,8 @@ static void compress_file_range(struct btrfs_work *work) u64 actual_end; u64 i_size; int ret = 0; - struct page **pages; - unsigned long nr_pages; + struct folio **folios; + unsigned long nr_folios; unsigned long total_compressed = 0; unsigned long total_in = 0; unsigned int poff; @@ -880,9 +882,9 @@ static void compress_file_range(struct btrfs_work *work) barrier(); actual_end = min_t(u64, i_size, end + 1); again: - pages = NULL; - nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; - nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES); + folios = NULL; + nr_folios = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; + nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED_PAGES); /* * we don't want to send crud past the end of i_size through @@ -931,8 +933,8 @@ static void compress_file_range(struct btrfs_work *work) if 
(!inode_need_compress(inode, start, end)) goto cleanup_and_bail_uncompressed; - pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); - if (!pages) { + folios = kcalloc(nr_folios, sizeof(struct folio *), GFP_NOFS); + if (!folios) { /* * Memory allocation failure is not a fatal error, we can fall * back to uncompressed code. @@ -946,8 +948,8 @@ static void compress_file_range(struct btrfs_work *work) compress_type = inode->prop_compress; /* Compression level is applied here. */ - ret = btrfs_compress_pages(compress_type | (fs_info->compress_level << 4), - mapping, start, pages, &nr_pages, &total_in, + ret = btrfs_compress_folios(compress_type | (fs_info->compress_level << 4), + mapping, start, folios, &nr_folios, &total_in, &total_compressed); if (ret) goto mark_incompressible; @@ -958,7 +960,7 @@ static void compress_file_range(struct btrfs_work *work) */ poff = offset_in_page(total_compressed); if (poff) - memzero_page(pages[nr_pages - 1], poff, PAGE_SIZE - poff); + folio_zero_range(folios[nr_folios - 1], poff, PAGE_SIZE - poff); /* * Try to create an inline extent. @@ -977,7 +979,7 @@ static void compress_file_range(struct btrfs_work *work) } else { ret = cow_file_range_inline(inode, actual_end, total_compressed, - compress_type, pages, + compress_type, folios[0], false); } if (ret <= 0) { @@ -1027,8 +1029,9 @@ static void compress_file_range(struct btrfs_work *work) * The async work queues will take care of doing actual allocation on * disk for these compressed pages, and will submit the bios. */ - add_async_extent(async_chunk, start, total_in, total_compressed, pages, - nr_pages, compress_type); + ret = add_async_extent(async_chunk, start, total_in, total_compressed, folios, + nr_folios, compress_type); + BUG_ON(ret); if (start + total_in < end) { start += total_in; cond_resched(); @@ -1040,15 +1043,16 @@ static void compress_file_range(struct btrfs_work *work) if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress) inode->flags |= BTRFS_INODE_NOCOMPRESS; cleanup_and_bail_uncompressed: - add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, - BTRFS_COMPRESS_NONE); + ret = add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, + BTRFS_COMPRESS_NONE); + BUG_ON(ret); free_pages: - if (pages) { - for (i = 0; i < nr_pages; i++) { - WARN_ON(pages[i]->mapping); - btrfs_free_compr_page(pages[i]); + if (folios) { + for (i = 0; i < nr_folios; i++) { + WARN_ON(folios[i]->mapping); + btrfs_free_compr_folio(folios[i]); } - kfree(pages); + kfree(folios); } } @@ -1056,16 +1060,16 @@ static void free_async_extent_pages(struct async_extent *async_extent) { int i; - if (!async_extent->pages) + if (!async_extent->folios) return; - for (i = 0; i < async_extent->nr_pages; i++) { - WARN_ON(async_extent->pages[i]->mapping); - btrfs_free_compr_page(async_extent->pages[i]); + for (i = 0; i < async_extent->nr_folios; i++) { + WARN_ON(async_extent->folios[i]->mapping); + btrfs_free_compr_folio(async_extent->folios[i]); } - kfree(async_extent->pages); - async_extent->nr_pages = 0; - async_extent->pages = NULL; + kfree(async_extent->folios); + async_extent->nr_folios = 0; + async_extent->folios = NULL; } static void submit_uncompressed_range(struct btrfs_inode *inode, @@ -1189,8 +1193,8 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, NULL, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_START_WRITEBACK); btrfs_submit_compressed_write(ordered, - async_extent->pages, /* compressed_pages */ - async_extent->nr_pages, + async_extent->folios, /* 
compressed_folios */ + async_extent->nr_folios, async_chunk->write_flags, true); *alloc_hint = ins.objectid + ins.offset; done: @@ -2302,6 +2306,8 @@ void btrfs_split_delalloc_extent(struct btrfs_inode *inode, struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 size; + lockdep_assert_held(&inode->io_tree.lock); + /* not delalloc, ignore it */ if (!(orig->state & EXTENT_DELALLOC)) return; @@ -2340,6 +2346,8 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state u64 new_size, old_size; u32 num_extents; + lockdep_assert_held(&inode->io_tree.lock); + /* not delalloc, ignore it */ if (!(other->state & EXTENT_DELALLOC)) return; @@ -2387,55 +2395,50 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state spin_unlock(&inode->lock); } -static void btrfs_add_delalloc_inodes(struct btrfs_root *root, - struct btrfs_inode *inode) +static void btrfs_add_delalloc_inode(struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; spin_lock(&root->delalloc_lock); - if (list_empty(&inode->delalloc_inodes)) { - list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes); - set_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags); - root->nr_delalloc_inodes++; - if (root->nr_delalloc_inodes == 1) { - spin_lock(&fs_info->delalloc_root_lock); - BUG_ON(!list_empty(&root->delalloc_root)); - list_add_tail(&root->delalloc_root, - &fs_info->delalloc_roots); - spin_unlock(&fs_info->delalloc_root_lock); - } + ASSERT(list_empty(&inode->delalloc_inodes)); + list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes); + root->nr_delalloc_inodes++; + if (root->nr_delalloc_inodes == 1) { + spin_lock(&fs_info->delalloc_root_lock); + ASSERT(list_empty(&root->delalloc_root)); + list_add_tail(&root->delalloc_root, &fs_info->delalloc_roots); + spin_unlock(&fs_info->delalloc_root_lock); } spin_unlock(&root->delalloc_lock); } -void __btrfs_del_delalloc_inode(struct btrfs_root *root, - struct btrfs_inode *inode) +void btrfs_del_delalloc_inode(struct btrfs_inode *inode) { + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; + lockdep_assert_held(&root->delalloc_lock); + + /* + * We may be called after the inode was already deleted from the list, + * namely in the transaction abort path btrfs_destroy_delalloc_inodes(), + * and then later through btrfs_clear_delalloc_extent() while the inode + * still has ->delalloc_bytes > 0. + */ if (!list_empty(&inode->delalloc_inodes)) { list_del_init(&inode->delalloc_inodes); - clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &inode->runtime_flags); root->nr_delalloc_inodes--; if (!root->nr_delalloc_inodes) { ASSERT(list_empty(&root->delalloc_inodes)); spin_lock(&fs_info->delalloc_root_lock); - BUG_ON(list_empty(&root->delalloc_root)); + ASSERT(!list_empty(&root->delalloc_root)); list_del_init(&root->delalloc_root); spin_unlock(&fs_info->delalloc_root_lock); } } } -static void btrfs_del_delalloc_inode(struct btrfs_root *root, - struct btrfs_inode *inode) -{ - spin_lock(&root->delalloc_lock); - __btrfs_del_delalloc_inode(root, inode); - spin_unlock(&root->delalloc_lock); -} - /* * Properly track delayed allocation bytes in the inode and to maintain the * list of inodes that have pending delalloc work to be done. 
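The rework above drops the BTRFS_INODE_IN_DELALLOC_LIST runtime flag: membership in the per-root delalloc list is now implied by inode->delalloc_bytes transitioning between zero and non-zero, and those transitions are serialized by the io_tree lock that all callers hold (see the lockdep_assert_held() annotations added below). A condensed sketch of the resulting pattern, simplified from the btrfs_set_delalloc_extent() hunk that follows:

	/* Sample the old byte count under the inode spinlock... */
	spin_lock(&inode->lock);
	prev_delalloc_bytes = inode->delalloc_bytes;
	inode->delalloc_bytes += len;
	spin_unlock(&inode->lock);

	/*
	 * ...and only the zero to non-zero edge adds the inode to the
	 * per-root list; the io_tree lock serializes this against the
	 * clearing side in btrfs_clear_delalloc_extent().
	 */
	if (!btrfs_is_free_space_inode(inode) && prev_delalloc_bytes == 0)
		btrfs_add_delalloc_inode(inode);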
@@ -2445,6 +2448,8 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s { struct btrfs_fs_info *fs_info = inode->root->fs_info; + lockdep_assert_held(&inode->io_tree.lock); + if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC)) WARN_ON(1); /* @@ -2453,10 +2458,9 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s * bit, which is only set or cleared with irqs on */ if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { - struct btrfs_root *root = inode->root; u64 len = state->end + 1 - state->start; + u64 prev_delalloc_bytes; u32 num_extents = count_max_extents(fs_info, len); - bool do_list = !btrfs_is_free_space_inode(inode); spin_lock(&inode->lock); btrfs_mod_outstanding_extents(inode, num_extents); @@ -2469,13 +2473,20 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s percpu_counter_add_batch(&fs_info->delalloc_bytes, len, fs_info->delalloc_batch); spin_lock(&inode->lock); + prev_delalloc_bytes = inode->delalloc_bytes; inode->delalloc_bytes += len; if (bits & EXTENT_DEFRAG) inode->defrag_bytes += len; - if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &inode->runtime_flags)) - btrfs_add_delalloc_inodes(root, inode); spin_unlock(&inode->lock); + + /* + * We don't need to be under the protection of the inode's lock, + * because we are called while holding the inode's io_tree lock + * and are therefore protected against concurrent calls of this + * function and btrfs_clear_delalloc_extent(). + */ + if (!btrfs_is_free_space_inode(inode) && prev_delalloc_bytes == 0) + btrfs_add_delalloc_inode(inode); } if (!(state->state & EXTENT_DELALLOC_NEW) && @@ -2497,6 +2508,8 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, u64 len = state->end + 1 - state->start; u32 num_extents = count_max_extents(fs_info, len); + lockdep_assert_held(&inode->io_tree.lock); + if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) { spin_lock(&inode->lock); inode->defrag_bytes -= len; @@ -2510,7 +2523,7 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, */ if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = inode->root; - bool do_list = !btrfs_is_free_space_inode(inode); + u64 new_delalloc_bytes; spin_lock(&inode->lock); btrfs_mod_outstanding_extents(inode, -num_extents); @@ -2530,7 +2543,8 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, return; if (!btrfs_is_data_reloc_root(root) && - do_list && !(state->state & EXTENT_NORESERVE) && + !btrfs_is_free_space_inode(inode) && + !(state->state & EXTENT_NORESERVE) && (bits & EXTENT_CLEAR_DATA_RESV)) btrfs_free_reserved_data_space_noquota(fs_info, len); @@ -2538,11 +2552,20 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, fs_info->delalloc_batch); spin_lock(&inode->lock); inode->delalloc_bytes -= len; - if (do_list && inode->delalloc_bytes == 0 && - test_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &inode->runtime_flags)) - btrfs_del_delalloc_inode(root, inode); + new_delalloc_bytes = inode->delalloc_bytes; spin_unlock(&inode->lock); + + /* + * We don't need to be under the protection of the inode's lock, + * because we are called while holding the inode's io_tree lock + * and are therefore protected against concurrent calls of this + * function and btrfs_set_delalloc_extent(). 
+ */ + if (!btrfs_is_free_space_inode(inode) && new_delalloc_bytes == 0) { + spin_lock(&root->delalloc_lock); + btrfs_del_delalloc_inode(inode); + spin_unlock(&root->delalloc_lock); + } } if ((state->state & EXTENT_DELALLOC_NEW) && @@ -2632,7 +2655,7 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, u64 em_len; int ret = 0; - em = btrfs_get_extent(inode, NULL, 0, search_start, search_len); + em = btrfs_get_extent(inode, NULL, search_start, search_len); if (IS_ERR(em)) return PTR_ERR(em); @@ -2829,7 +2852,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) int btrfs_writepage_cow_fixup(struct page *page) { struct inode *inode = page->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_writepage_fixup *fixup; /* This page has ordered extent covering it already */ @@ -3127,8 +3150,13 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) ordered_extent->disk_num_bytes); } } - unpin_extent_cache(inode, ordered_extent->file_offset, - ordered_extent->num_bytes, trans->transid); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out; + } + + ret = unpin_extent_cache(inode, ordered_extent->file_offset, + ordered_extent->num_bytes, trans->transid); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out; @@ -3254,7 +3282,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) { - if (btrfs_is_zoned(btrfs_sb(ordered->inode->i_sb)) && + if (btrfs_is_zoned(inode_to_fs_info(ordered->inode)) && !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && list_empty(&ordered->bioc_list)) btrfs_finish_ordered_zoned(ordered); @@ -3739,7 +3767,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *in_path) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_path *path = in_path; struct extent_buffer *leaf; struct btrfs_inode_item *inode_item; @@ -4383,7 +4411,14 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) goto out; - BUG_ON(ret == 0); + if (ret == 0) { + /* + * Key with offset -1 found, there would have to exist a root + * with such an id, but this is out of the valid range.
+ */ + ret = -EUCLEAN; + goto out; + } ret = 0; if (path->slots[0] > 0) { @@ -4464,8 +4499,8 @@ static void btrfs_prune_dentries(struct btrfs_root *root) int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) { - struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); struct btrfs_root *root = dir->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct inode *inode = d_inode(dentry); struct btrfs_root *dest = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; @@ -4695,7 +4730,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, u32 blocksize = fs_info->sectorsize; pgoff_t index = from >> PAGE_SHIFT; unsigned offset = from & (blocksize - 1); - struct page *page; + struct folio *folio; gfp_t mask = btrfs_alloc_write_mask(mapping); size_t write_bytes = blocksize; int ret = 0; @@ -4727,8 +4762,9 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, goto out; } again: - page = find_or_create_page(mapping, index, mask); - if (!page) { + folio = __filemap_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); + if (IS_ERR(folio)) { btrfs_delalloc_release_space(inode, data_reserved, block_start, blocksize, true); btrfs_delalloc_release_extents(inode, blocksize); @@ -4736,15 +4772,15 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, goto out; } - if (!PageUptodate(page)) { - ret = btrfs_read_folio(NULL, page_folio(page)); - lock_page(page); - if (page->mapping != mapping) { - unlock_page(page); - put_page(page); + if (!folio_test_uptodate(folio)) { + ret = btrfs_read_folio(NULL, folio); + folio_lock(folio); + if (folio->mapping != mapping) { + folio_unlock(folio); + folio_put(folio); goto again; } - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { ret = -EIO; goto out_unlock; } @@ -4756,19 +4792,19 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, * folio private, but left the page in the mapping. Set the page mapped * here to make sure it's properly set for the subpage stuff. 
*/ - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) goto out_unlock; - wait_on_page_writeback(page); + folio_wait_writeback(folio); lock_extent(io_tree, block_start, block_end, &cached_state); ordered = btrfs_lookup_ordered_extent(inode, block_start); if (ordered) { unlock_extent(io_tree, block_start, block_end, &cached_state); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; @@ -4789,15 +4825,16 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, if (!len) len = blocksize - offset; if (front) - memzero_page(page, (block_start - page_offset(page)), - offset); + folio_zero_range(folio, block_start - folio_pos(folio), + offset); else - memzero_page(page, (block_start - page_offset(page)) + offset, - len); + folio_zero_range(folio, + (block_start - folio_pos(folio)) + offset, + len); } - btrfs_folio_clear_checked(fs_info, page_folio(page), block_start, + btrfs_folio_clear_checked(fs_info, folio, block_start, block_end + 1 - block_start); - btrfs_folio_set_dirty(fs_info, page_folio(page), block_start, + btrfs_folio_set_dirty(fs_info, folio, block_start, block_end + 1 - block_start); unlock_extent(io_tree, block_start, block_end, &cached_state); @@ -4814,8 +4851,8 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, block_start, blocksize, true); } btrfs_delalloc_release_extents(inode, blocksize); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); out: if (only_release_metadata) btrfs_check_nocow_unlock(inode); @@ -4907,8 +4944,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) &cached_state); cur_offset = hole_start; while (1) { - em = btrfs_get_extent(inode, NULL, 0, cur_offset, - block_end - cur_offset); + em = btrfs_get_extent(inode, NULL, cur_offset, block_end - cur_offset); if (IS_ERR(em)) { err = PTR_ERR(em); em = NULL; @@ -5019,7 +5055,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_end_transaction(trans); } else { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); if (btrfs_is_zoned(fs_info)) { ret = btrfs_wait_ordered_range(inode, @@ -5222,7 +5258,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, void btrfs_evict_inode(struct inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info; struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv = NULL; @@ -5236,6 +5272,7 @@ void btrfs_evict_inode(struct inode *inode) return; } + fs_info = inode_to_fs_info(inode); evict_inode_truncate_pages(inode); if (inode->i_nlink && @@ -5533,7 +5570,6 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p) BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; BTRFS_I(inode)->location.offset = 0; BTRFS_I(inode)->root = btrfs_grab_root(args->root); - BUG_ON(args->root && !BTRFS_I(inode)->root); if (args->root && args->root == args->root->fs_info->tree_root && args->ino != BTRFS_BTREE_INODE_OBJECTID) @@ -5661,7 +5697,7 @@ static inline u8 btrfs_inode_type(struct inode *inode) struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = 
inode_to_fs_info(dir); struct inode *inode; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *sub_root = root; @@ -6200,7 +6236,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, struct inode *dir = args->dir; struct inode *inode = args->inode; const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name; - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_root *root; struct btrfs_inode_item *inode_item; struct btrfs_key *location; @@ -6522,7 +6558,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, static int btrfs_create_common(struct inode *dir, struct dentry *dentry, struct inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_new_inode_args new_inode_args = { .dir = dir, @@ -6592,7 +6628,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct btrfs_trans_handle *trans = NULL; struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = d_inode(old_dentry); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct fscrypt_name fname; u64 index; int err; @@ -6756,7 +6792,6 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path * * @inode: file to search in * @page: page to read extent data into if the extent is inline - * @pg_offset: offset into @page to copy to * @start: file offset * @len: length of range starting at @start * @@ -6770,8 +6805,7 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path * Return: ERR_PTR on error, non-NULL extent_map on success. */ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 len) + struct page *page, u64 start, u64 len) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret = 0; @@ -6914,7 +6948,6 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, * ensured by tree-checker and inline extent creation path. * Thus all members representing file offsets should be zero. 
*/ - ASSERT(pg_offset == 0); ASSERT(extent_start == 0); ASSERT(em->start == 0); @@ -7078,7 +7111,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, u64 *ram_bytes, bool nowait, bool strict) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct can_nocow_file_extent_args nocow_args = { 0 }; struct btrfs_path *path; int ret; @@ -7317,7 +7350,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, unsigned int iomap_flags) { const bool nowait = (iomap_flags & IOMAP_NOWAIT); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_map *em = *map; int type; u64 block_start, orig_start, orig_block_len, ram_bytes; @@ -7457,7 +7490,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, struct iomap *srcmap) { struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_map *em; struct extent_state *cached_state = NULL; struct btrfs_dio_data *dio_data = iter->private; @@ -7555,7 +7588,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, if (ret < 0) goto err; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); + em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto unlock_err; @@ -7903,7 +7936,7 @@ static void btrfs_readahead(struct readahead_control *rac) */ static void wait_subpage_spinlock(struct page *page) { - struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + struct btrfs_fs_info *fs_info = page_to_fs_info(page); struct folio *folio = page_folio(page); struct btrfs_subpage *subpage; @@ -7970,7 +8003,7 @@ static int btrfs_migrate_folio(struct address_space *mapping, static void btrfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) { - struct btrfs_inode *inode = BTRFS_I(folio->mapping->host); + struct btrfs_inode *inode = folio_to_inode(folio); struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_io_tree *tree = &inode->io_tree; struct extent_state *cached_state = NULL; @@ -8154,7 +8187,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) struct page *page = vmf->page; struct folio *folio = page_folio(page); struct inode *inode = file_inode(vmf->vma->vm_file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; @@ -8700,7 +8733,7 @@ int __init btrfs_init_cachep(void) { btrfs_inode_cachep = kmem_cache_create("btrfs_inode", sizeof(struct btrfs_inode), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT, + SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, init_once); if (!btrfs_inode_cachep) goto fail; @@ -8723,7 +8756,7 @@ static int btrfs_getattr(struct mnt_idmap *idmap, u64 delalloc_bytes; u64 inode_bytes; struct inode *inode = d_inode(path->dentry); - u32 blocksize = inode->i_sb->s_blocksize; + u32 blocksize = btrfs_sb(inode->i_sb)->sectorsize; u32 bi_flags = BTRFS_I(inode)->flags; u32 bi_ro_flags = BTRFS_I(inode)->ro_flags; @@ -8763,7 +8796,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, struct inode *new_dir, struct dentry *new_dentry) { - struct btrfs_fs_info *fs_info = 
btrfs_sb(old_dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir); struct btrfs_trans_handle *trans; unsigned int trans_num_items; struct btrfs_root *root = BTRFS_I(old_dir)->root; @@ -9015,7 +9048,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { - struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir); struct btrfs_new_inode_args whiteout_args = { .dir = old_dir, .dentry = old_dentry, @@ -9457,7 +9490,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_path *path; @@ -9638,7 +9671,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, loff_t actual_len, u64 *alloc_hint, struct btrfs_trans_handle *trans) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_map *em; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; @@ -9790,7 +9823,7 @@ static int btrfs_permission(struct mnt_idmap *idmap, static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode; @@ -10164,7 +10197,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, cond_resched(); } - em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1); + em = btrfs_get_extent(inode, NULL, start, lockend - start + 1); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out_unlock_extent; @@ -10276,8 +10309,8 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, size_t orig_count; u64 start, end; u64 num_bytes, ram_bytes, disk_num_bytes; - unsigned long nr_pages, i; - struct page **pages; + unsigned long nr_folios, i; + struct folio **folios; struct btrfs_key ins; bool extent_reserved = false; struct extent_map *em; @@ -10366,24 +10399,24 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, * isn't. 
*/ disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize); - nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE); - pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT); - if (!pages) + nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE); + folios = kvcalloc(nr_folios, sizeof(struct page *), GFP_KERNEL_ACCOUNT); + if (!folios) return -ENOMEM; - for (i = 0; i < nr_pages; i++) { + for (i = 0; i < nr_folios; i++) { size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from)); char *kaddr; - pages[i] = alloc_page(GFP_KERNEL_ACCOUNT); - if (!pages[i]) { + folios[i] = folio_alloc(GFP_KERNEL_ACCOUNT, 0); + if (!folios[i]) { ret = -ENOMEM; - goto out_pages; + goto out_folios; } - kaddr = kmap_local_page(pages[i]); + kaddr = kmap_local_folio(folios[i], 0); if (copy_from_iter(kaddr, bytes, from) != bytes) { kunmap_local(kaddr); ret = -EFAULT; - goto out_pages; + goto out_folios; } if (bytes < PAGE_SIZE) memset(kaddr + bytes, 0, PAGE_SIZE - bytes); @@ -10395,12 +10428,12 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes); if (ret) - goto out_pages; + goto out_folios; ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping, start >> PAGE_SHIFT, end >> PAGE_SHIFT); if (ret) - goto out_pages; + goto out_folios; lock_extent(io_tree, start, end, &cached_state); ordered = btrfs_lookup_ordered_range(inode, start, num_bytes); if (!ordered && @@ -10431,7 +10464,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, if (start == 0 && encoded->unencoded_len == encoded->len && encoded->unencoded_offset == 0) { ret = cow_file_range_inline(inode, encoded->len, orig_count, - compression, pages, true); + compression, folios[0], true); if (ret <= 0) { if (ret == 0) ret = orig_count; @@ -10475,7 +10508,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, btrfs_delalloc_release_extents(inode, num_bytes); - btrfs_submit_compressed_write(ordered, pages, nr_pages, 0, false); + btrfs_submit_compressed_write(ordered, folios, nr_folios, 0, false); ret = orig_count; goto out; @@ -10497,12 +10530,12 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes); out_unlock: unlock_extent(io_tree, start, end, &cached_state); -out_pages: - for (i = 0; i < nr_pages; i++) { - if (pages[i]) - __free_page(pages[i]); +out_folios: + for (i = 0; i < nr_folios; i++) { + if (folios[i]) + __folio_put(folios[i]); } - kvfree(pages); + kvfree(folios); out: if (ret >= 0) iocb->ki_pos += encoded->len; @@ -10744,7 +10777,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, struct btrfs_block_group *bg; u64 len = isize - start; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); + em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9d1eac15e09e14..38459a89b27c4d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -34,11 +34,9 @@ #include "export.h" #include "transaction.h" #include "btrfs_inode.h" -#include "print-tree.h" #include "volumes.h" #include "locking.h" #include "backref.h" -#include "rcu-string.h" #include "send.h" #include "dev-replace.h" #include "props.h" @@ -47,9 +45,7 @@ #include "tree-log.h" #include "compression.h" #include "space-info.h" -#include "delalloc-space.h" #include "block-group.h" -#include "subpage.h" #include "fs.h" 
#include "accessors.h" #include "extent-tree.h" @@ -231,6 +227,20 @@ static int check_fsflags_compatible(struct btrfs_fs_info *fs_info, return 0; } +int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args) +{ + if (memchr(vol_args->name, 0, sizeof(vol_args->name)) == NULL) + return -ENAMETOOLONG; + return 0; +} + +static int btrfs_check_ioctl_vol_args2_subvol_name(const struct btrfs_ioctl_vol_args_v2 *vol_args2) +{ + if (memchr(vol_args2->name, 0, sizeof(vol_args2->name)) == NULL) + return -ENAMETOOLONG; + return 0; +} + /* * Set flags/xflags from the internal inode flags. The remaining items of * fsxattr are zeroed. @@ -247,7 +257,7 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_inode *binode = BTRFS_I(inode); struct btrfs_root *root = binode->root; struct btrfs_trans_handle *trans; @@ -528,7 +538,7 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, * block group is in the logical address space, which can be any * sectorsize aligned bytenr in the range [0, U64_MAX]. */ - if (range.len < fs_info->sb->s_blocksize) + if (range.len < fs_info->sectorsize) return -EINVAL; range.minlen = max(range.minlen, minlen); @@ -584,7 +594,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, struct btrfs_qgroup_inherit *inherit) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_trans_handle *trans; struct btrfs_key key; struct btrfs_root_item *root_item; @@ -776,7 +786,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, struct dentry *dentry, bool readonly, struct btrfs_qgroup_inherit *inherit) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct inode *inode; struct btrfs_pending_snapshot *pending_snapshot; unsigned int trans_num_items; @@ -910,7 +920,9 @@ static int btrfs_may_delete(struct mnt_idmap *idmap, if (d_really_is_negative(victim)) return -ENOENT; - BUG_ON(d_inode(victim->d_parent) != dir); + /* The @victim is not inside @dir. 
*/ + if (d_inode(victim->d_parent) != dir) + return -EINVAL; audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); @@ -962,7 +974,7 @@ static noinline int btrfs_mksubvol(const struct path *parent, struct btrfs_qgroup_inherit *inherit) { struct inode *dir = d_inode(parent->dentry); - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct dentry *dentry; struct fscrypt_str name_str = FSTR_INIT((char *)name, namelen); int error; @@ -1097,7 +1109,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, { BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); u64 new_size; u64 old_size; u64 devid = 1; @@ -1128,7 +1140,10 @@ static noinline int btrfs_ioctl_resize(struct file *file, ret = PTR_ERR(vol_args); goto out_drop; } - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args_path(vol_args); + if (ret < 0) + goto out_free; + sizestr = vol_args->name; cancel = (strcmp("cancel", sizestr) == 0); ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_RESIZE, cancel); @@ -1328,12 +1343,15 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args_path(vol_args); + if (ret < 0) + goto out; ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file), vol_args->name, vol_args->fd, subvol, false, NULL); +out: kfree(vol_args); return ret; } @@ -1352,7 +1370,9 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); - vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args); + if (ret < 0) + goto free_args; if (vol_args->flags & ~BTRFS_SUBVOL_CREATE_ARGS_MASK) { ret = -EOPNOTSUPP; @@ -1362,7 +1382,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, if (vol_args->flags & BTRFS_SUBVOL_RDONLY) readonly = true; if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) { - u64 nums; + struct btrfs_fs_info *fs_info = inode_to_fs_info(file_inode(file)); if (vol_args->size < sizeof(*inherit) || vol_args->size > PAGE_SIZE) { @@ -1375,19 +1395,9 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, goto free_args; } - if (inherit->num_qgroups > PAGE_SIZE || - inherit->num_ref_copies > PAGE_SIZE || - inherit->num_excl_copies > PAGE_SIZE) { - ret = -EINVAL; - goto free_inherit; - } - - nums = inherit->num_qgroups + 2 * inherit->num_ref_copies + - 2 * inherit->num_excl_copies; - if (vol_args->size != struct_size(inherit, qgroups, nums)) { - ret = -EINVAL; + ret = btrfs_qgroup_check_inherit(fs_info, inherit, vol_args->size); + if (ret < 0) goto free_inherit; - } } ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file), @@ -1405,7 +1415,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, static noinline int btrfs_ioctl_subvol_getflags(struct inode *inode, void __user *arg) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; u64 flags = 0; @@ -1428,7 +1438,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, void 
__user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; u64 root_flags; @@ -1675,7 +1685,7 @@ static noinline int search_ioctl(struct inode *inode, u64 *buf_size, char __user *ubuf) { - struct btrfs_fs_info *info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *info = inode_to_fs_info(inode); struct btrfs_root *root; struct btrfs_key key; struct btrfs_path *path; @@ -2346,9 +2356,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, bool destroy_v2) { struct dentry *parent = file->f_path.dentry; - struct btrfs_fs_info *fs_info = btrfs_sb(parent->d_sb); struct dentry *dentry; struct inode *dir = d_inode(parent); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct inode *inode; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *dest = NULL; @@ -2382,7 +2392,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, * name, same as v1 currently does. */ if (!(vol_args2->flags & BTRFS_SUBVOL_SPEC_BY_ID)) { - vol_args2->name[BTRFS_SUBVOL_NAME_MAX] = 0; + err = btrfs_check_ioctl_vol_args2_subvol_name(vol_args2); + if (err < 0) + goto out; subvol_name = vol_args2->name; err = mnt_want_write_file(file); @@ -2466,7 +2478,10 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (IS_ERR(vol_args)) return PTR_ERR(vol_args); - vol_args->name[BTRFS_PATH_NAME_MAX] = 0; + err = btrfs_check_ioctl_vol_args_path(vol_args); + if (err < 0) + goto out; + subvol_name = vol_args->name; err = mnt_want_write_file(file); @@ -2677,12 +2692,16 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) goto out; } - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args_path(vol_args); + if (ret < 0) + goto out_free; + ret = btrfs_init_new_device(fs_info, vol_args->name); if (!ret) btrfs_info(fs_info, "disk added %s", vol_args->name); +out_free: kfree(vol_args); out: if (restore_op) @@ -2696,7 +2715,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) { BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_ioctl_vol_args_v2 *vol_args; struct bdev_handle *bdev_handle = NULL; int ret; @@ -2714,7 +2733,10 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) goto out; } - vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args); + if (ret < 0) + goto out; + if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) { args.devid = vol_args->devid; } else if (!strcmp("cancel", vol_args->name)) { @@ -2761,7 +2783,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) { BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_ioctl_vol_args *vol_args; struct bdev_handle *bdev_handle = NULL; int ret; @@ -2774,7 +2796,10 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) if (IS_ERR(vol_args)) return PTR_ERR(vol_args); - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args_path(vol_args); + if (ret < 0) + goto out_free; + if (!strcmp("cancel", vol_args->name)) { cancel = true; } else { @@ -2801,6 
+2826,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) bdev_release(bdev_handle); out: btrfs_put_dev_args_from_path(&args); +out_free: kfree(vol_args); return ret; } @@ -2904,7 +2930,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root *new_root; struct btrfs_dir_item *di; @@ -3178,7 +3204,7 @@ static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info, static long btrfs_ioctl_scrub(struct file *file, void __user *arg) { - struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(file_inode(file)); struct btrfs_ioctl_scrub_args *sa; int ret; @@ -3696,7 +3722,7 @@ static long btrfs_ioctl_balance_progress(struct btrfs_fs_info *fs_info, static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_ioctl_quota_ctl_args *sa; int ret; @@ -3738,7 +3764,7 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ioctl_qgroup_assign_args *sa; struct btrfs_trans_handle *trans; @@ -3894,7 +3920,7 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_ioctl_quota_rescan_args *qsa; int ret; @@ -3958,7 +3984,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, struct btrfs_ioctl_received_subvol_args *sa) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root_item *root_item = &root->root_item; struct btrfs_trans_handle *trans; @@ -4146,7 +4172,7 @@ static int btrfs_ioctl_get_fslabel(struct btrfs_fs_info *fs_info, static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_super_block *super_block = fs_info->super_copy; struct btrfs_trans_handle *trans; @@ -4289,7 +4315,7 @@ check_feature_bits(fs_info, FEAT_##mask_base, change_mask, flags, \ static int btrfs_ioctl_set_features(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_super_block *super_block = fs_info->super_copy; struct btrfs_ioctl_feature_flags flags[2]; @@ -4580,7 +4606,7 @@ long 
btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; void __user *argp = (void __user *)arg; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index d51b9a2f2f6e88..2c5dc25ec67011 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -3,6 +3,15 @@ #ifndef BTRFS_IOCTL_H #define BTRFS_IOCTL_H +#include + +struct file; +struct dentry; +struct mnt_idmap; +struct fileattr; +struct btrfs_fs_info; +struct btrfs_ioctl_balance_args; + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 74d8e2003f58c0..99ccab86bb8656 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -13,7 +13,6 @@ #include "ctree.h" #include "extent_io.h" #include "locking.h" -#include "accessors.h" /* * Lockdep class keys for extent_buffer->lock's in this root. For a given @@ -85,7 +84,7 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int { struct btrfs_lockdep_keyset *ks; - BUG_ON(level >= ARRAY_SIZE(ks->keys)); + ASSERT(level < ARRAY_SIZE(ks->keys)); /* Find the matching keyset, id 0 is the default entry */ for (ks = btrfs_lockdep_keysets; ks->id; ks++) diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 7d6ee1e609bf2b..9576f485a300b1 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -8,8 +8,14 @@ #include #include +#include #include #include "extent_io.h" +#include "locking.h" + +struct extent_buffer; +struct btrfs_path; +struct btrfs_root; #define BTRFS_WRITE_LOCK 1 #define BTRFS_READ_LOCK 2 @@ -157,8 +163,6 @@ enum btrfs_lockdep_trans_states { static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES, "too many lock subclasses defined"); -struct btrfs_path; - void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest); void btrfs_tree_lock(struct extent_buffer *eb); void btrfs_tree_unlock(struct extent_buffer *eb); diff --git a/fs/btrfs/lru_cache.h b/fs/btrfs/lru_cache.h index 00328c856be6ef..e32906ab6faaab 100644 --- a/fs/btrfs/lru_cache.h +++ b/fs/btrfs/lru_cache.h @@ -3,8 +3,10 @@ #ifndef BTRFS_LRU_CACHE_H #define BTRFS_LRU_CACHE_H +#include #include #include +#include "lru_cache.h" /* * A cache entry. 
This is meant to be embedded in a structure of a user of @@ -50,11 +52,6 @@ struct btrfs_lru_cache { #define btrfs_lru_cache_for_each_entry_safe(cache, entry, tmp) \ list_for_each_entry_safe_reverse((entry), (tmp), &(cache)->lru_list, lru_list) -static inline unsigned int btrfs_lru_cache_size(const struct btrfs_lru_cache *cache) -{ - return cache->size; -} - static inline struct btrfs_lru_cache_entry *btrfs_lru_cache_lru_entry( struct btrfs_lru_cache *cache) { diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index e43bc0fdc74ec9..e0fe2bf27bc71f 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -130,17 +130,17 @@ static inline size_t read_compress_length(const char *buf) */ static int copy_compressed_data_to_page(char *compressed_data, size_t compressed_size, - struct page **out_pages, - unsigned long max_nr_page, + struct folio **out_folios, + unsigned long max_nr_folio, u32 *cur_out, const u32 sectorsize) { u32 sector_bytes_left; u32 orig_out; - struct page *cur_page; + struct folio *cur_folio; char *kaddr; - if ((*cur_out / PAGE_SIZE) >= max_nr_page) + if ((*cur_out / PAGE_SIZE) >= max_nr_folio) return -E2BIG; /* @@ -149,16 +149,16 @@ static int copy_compressed_data_to_page(char *compressed_data, */ ASSERT((*cur_out / sectorsize) == (*cur_out + LZO_LEN - 1) / sectorsize); - cur_page = out_pages[*cur_out / PAGE_SIZE]; + cur_folio = out_folios[*cur_out / PAGE_SIZE]; /* Allocate a new page */ - if (!cur_page) { - cur_page = btrfs_alloc_compr_page(); - if (!cur_page) + if (!cur_folio) { + cur_folio = btrfs_alloc_compr_folio(); + if (!cur_folio) return -ENOMEM; - out_pages[*cur_out / PAGE_SIZE] = cur_page; + out_folios[*cur_out / PAGE_SIZE] = cur_folio; } - kaddr = kmap_local_page(cur_page); + kaddr = kmap_local_folio(cur_folio, 0); write_compress_length(kaddr + offset_in_page(*cur_out), compressed_size); *cur_out += LZO_LEN; @@ -172,18 +172,18 @@ static int copy_compressed_data_to_page(char *compressed_data, kunmap_local(kaddr); - if ((*cur_out / PAGE_SIZE) >= max_nr_page) + if ((*cur_out / PAGE_SIZE) >= max_nr_folio) return -E2BIG; - cur_page = out_pages[*cur_out / PAGE_SIZE]; + cur_folio = out_folios[*cur_out / PAGE_SIZE]; /* Allocate a new page */ - if (!cur_page) { - cur_page = btrfs_alloc_compr_page(); - if (!cur_page) + if (!cur_folio) { + cur_folio = btrfs_alloc_compr_folio(); + if (!cur_folio) return -ENOMEM; - out_pages[*cur_out / PAGE_SIZE] = cur_page; + out_folios[*cur_out / PAGE_SIZE] = cur_folio; } - kaddr = kmap_local_page(cur_page); + kaddr = kmap_local_folio(cur_folio, 0); memcpy(kaddr + offset_in_page(*cur_out), compressed_data + *cur_out - orig_out, copy_len); @@ -209,15 +209,15 @@ static int copy_compressed_data_to_page(char *compressed_data, return 0; } -int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, +int lzo_compress_folios(struct list_head *ws, struct address_space *mapping, + u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); - const u32 sectorsize = btrfs_sb(mapping->host->i_sb)->sectorsize; - struct page *page_in = NULL; + const u32 sectorsize = inode_to_fs_info(mapping->host)->sectorsize; + struct folio *folio_in = NULL; char *sizes_ptr; - const unsigned long max_nr_page = *out_pages; + const unsigned long max_nr_folio = *out_folios; int ret = 0; /* Points to the file offset of input data */ u64 cur_in = start; @@ -225,8 +225,8 @@ int 
lzo_compress_pages(struct list_head *ws, struct address_space *mapping, u32 cur_out = 0; u32 len = *total_out; - ASSERT(max_nr_page > 0); - *out_pages = 0; + ASSERT(max_nr_folio > 0); + *out_folios = 0; *total_out = 0; *total_in = 0; @@ -243,15 +243,16 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, size_t out_len; /* Get the input page first */ - if (!page_in) { - page_in = find_get_page(mapping, cur_in >> PAGE_SHIFT); - ASSERT(page_in); + if (!folio_in) { + ret = btrfs_compress_filemap_get_folio(mapping, cur_in, &folio_in); + if (ret < 0) + goto out; } /* Compress at most one sector of data each time */ in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off); ASSERT(in_len); - data_in = kmap_local_page(page_in); + data_in = kmap_local_folio(folio_in, 0); ret = lzo1x_1_compress(data_in + offset_in_page(cur_in), in_len, workspace->cbuf, &out_len, @@ -264,7 +265,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, } ret = copy_compressed_data_to_page(workspace->cbuf, out_len, - pages, max_nr_page, + folios, max_nr_folio, &cur_out, sectorsize); if (ret < 0) goto out; @@ -282,13 +283,13 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, /* Check if we have reached page boundary */ if (PAGE_ALIGNED(cur_in)) { - put_page(page_in); - page_in = NULL; + folio_put(folio_in); + folio_in = NULL; } } /* Store the size of all chunks of compressed data */ - sizes_ptr = kmap_local_page(pages[0]); + sizes_ptr = kmap_local_folio(folios[0], 0); write_compress_length(sizes_ptr, cur_out); kunmap_local(sizes_ptr); @@ -296,9 +297,9 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, *total_out = cur_out; *total_in = cur_in - start; out: - if (page_in) - put_page(page_in); - *out_pages = DIV_ROUND_UP(cur_out, PAGE_SIZE); + if (folio_in) + folio_put(folio_in); + *out_folios = DIV_ROUND_UP(cur_out, PAGE_SIZE); return ret; } @@ -313,15 +314,15 @@ static void copy_compressed_segment(struct compressed_bio *cb, u32 orig_in = *cur_in; while (*cur_in < orig_in + len) { - struct page *cur_page; + struct folio *cur_folio; u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in), orig_in + len - *cur_in); ASSERT(copy_len); - cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE]; + cur_folio = cb->compressed_folios[*cur_in / PAGE_SIZE]; - memcpy_from_page(dest + *cur_in - orig_in, cur_page, - offset_in_page(*cur_in), copy_len); + memcpy_from_folio(dest + *cur_in - orig_in, cur_folio, + offset_in_folio(cur_folio, *cur_in), copy_len); *cur_in += copy_len; } @@ -341,7 +342,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) /* Bytes decompressed so far */ u32 cur_out = 0; - kaddr = kmap_local_page(cb->compressed_pages[0]); + kaddr = kmap_local_folio(cb->compressed_folios[0], 0); len_in = read_compress_length(kaddr); kunmap_local(kaddr); cur_in += LZO_LEN; @@ -363,7 +364,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) /* Go through each lzo segment */ while (cur_in < len_in) { - struct page *cur_page; + struct folio *cur_folio; /* Length of the compressed segment */ u32 seg_len; u32 sector_bytes_left; @@ -375,9 +376,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) */ ASSERT(cur_in / sectorsize == (cur_in + LZO_LEN - 1) / sectorsize); - cur_page = cb->compressed_pages[cur_in / PAGE_SIZE]; - ASSERT(cur_page); - kaddr = kmap_local_page(cur_page); + cur_folio = cb->compressed_folios[cur_in / PAGE_SIZE]; + ASSERT(cur_folio); 
+ kaddr = kmap_local_folio(cur_folio, 0); seg_len = read_compress_length(kaddr + offset_in_page(cur_in)); kunmap_local(kaddr); cur_in += LZO_LEN; @@ -429,7 +430,7 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); - struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb); + struct btrfs_fs_info *fs_info = page_to_fs_info(dest_page); const u32 sectorsize = fs_info->sectorsize; size_t in_len; size_t out_len; diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index cdada4865837fc..c96dd66fd0f722 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -3,8 +3,6 @@ #include "fs.h" #include "messages.h" #include "discard.h" -#include "transaction.h" -#include "space-info.h" #include "super.h" #ifdef CONFIG_PRINTK diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index 40f2d9f1a17a9c..dde4904aead9bc 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -3,6 +3,8 @@ #ifndef BTRFS_MISC_H #define BTRFS_MISC_H +#include +#include #include #include #include diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 59850dc17b22f0..b749ba45da2ba2 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -19,7 +19,6 @@ #include "qgroup.h" #include "subpage.h" #include "file.h" -#include "super.h" static struct kmem_cache *btrfs_ordered_extent_cache; @@ -1236,10 +1235,7 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( int __init ordered_data_init(void) { - btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", - sizeof(struct btrfs_ordered_extent), 0, - SLAB_MEM_SPREAD, - NULL); + btrfs_ordered_extent_cache = KMEM_CACHE(btrfs_ordered_extent, 0); if (!btrfs_ordered_extent_cache) return -ENOMEM; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 127ef8bf0ffd78..34413fc5b4bd2b 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -6,6 +6,21 @@ #ifndef BTRFS_ORDERED_DATA_H #define BTRFS_ORDERED_DATA_H +#include +#include +#include +#include +#include +#include +#include "async-thread.h" + +struct inode; +struct page; +struct extent_state; +struct btrfs_inode; +struct btrfs_root; +struct btrfs_fs_info; + struct btrfs_ordered_sum { /* * Logical start address and length for of the blocks covered by diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c index 7a1b021b5669d2..6195a2215b8fee 100644 --- a/fs/btrfs/orphan.c +++ b/fs/btrfs/orphan.c @@ -4,7 +4,6 @@ */ #include "ctree.h" -#include "disk-io.h" #include "orphan.h" int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/orphan.h b/fs/btrfs/orphan.h index 3faab5cbb59ac9..aa54a88a60de1e 100644 --- a/fs/btrfs/orphan.h +++ b/fs/btrfs/orphan.h @@ -3,6 +3,11 @@ #ifndef BTRFS_ORPHAN_H #define BTRFS_ORPHAN_H +#include + +struct btrfs_trans_handle; +struct btrfs_root; + int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h index c42bc666d5eeab..8504bf1702c7a2 100644 --- a/fs/btrfs/print-tree.h +++ b/fs/btrfs/print-tree.h @@ -9,6 +9,9 @@ /* Buffer size to contain tree name and possibly additional data (offset) */ #define BTRFS_ROOT_NAME_BUF_LEN 48 +struct extent_buffer; +struct btrfs_key; + void btrfs_print_leaf(const struct extent_buffer *l); void btrfs_print_tree(const struct extent_buffer *c, bool follow); const char *btrfs_root_name(const struct btrfs_key *key, char 
*buf); diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index f9bf591a07187a..2a9b7b029eeba3 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -4,6 +4,7 @@ */ #include +#include #include "messages.h" #include "props.h" #include "btrfs_inode.h" @@ -302,7 +303,7 @@ static int prop_compression_validate(const struct btrfs_inode *inode, static int prop_compression_apply(struct inode *inode, const char *value, size_t len) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); int type; /* Reset to defaults */ diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h index 6e283196e38aba..f60cd89feb2930 100644 --- a/fs/btrfs/props.h +++ b/fs/btrfs/props.h @@ -6,7 +6,12 @@ #ifndef BTRFS_PROPS_H #define BTRFS_PROPS_H -#include "ctree.h" +#include + +struct inode; +struct btrfs_inode; +struct btrfs_path; +struct btrfs_trans_handle; int __init btrfs_props_init(void); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5470e1cdf10c53..5f90f0605b12f7 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1324,7 +1324,7 @@ static int flush_reservations(struct btrfs_fs_info *fs_info) trans = btrfs_join_transaction(fs_info->tree_root); if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_commit_transaction(trans); + ret = btrfs_commit_transaction(trans); return ret; } @@ -2505,8 +2505,8 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *eb = root_eb; struct btrfs_path *path = NULL; - BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL); - BUG_ON(root_eb == NULL); + ASSERT(0 <= root_level && root_level < BTRFS_MAX_LEVEL); + ASSERT(root_eb != NULL); if (!btrfs_qgroup_full_accounting(fs_info)) return 0; @@ -2861,8 +2861,6 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, if (nr_old_roots == 0 && nr_new_roots == 0) goto out_free; - BUG_ON(!fs_info->quota_root); - trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr, num_bytes, nr_old_roots, nr_new_roots); @@ -2959,11 +2957,6 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) ctx.roots = NULL; } - /* Free the reserved data space */ - btrfs_qgroup_free_refroot(fs_info, - record->data_rsv_refroot, - record->data_rsv, - BTRFS_QGROUP_RSV_DATA); /* * Use BTRFS_SEQ_LAST as time_seq to do special search, * which doesn't lock tree or delayed_refs and search @@ -2987,6 +2980,11 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) record->old_roots = NULL; new_roots = NULL; } + /* Free the reserved data space */ + btrfs_qgroup_free_refroot(fs_info, + record->data_rsv_refroot, + record->data_rsv, + BTRFS_QGROUP_RSV_DATA); cleanup: ulist_free(record->old_roots); ulist_free(new_roots); @@ -3048,6 +3046,57 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans) return ret; } +int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_inherit *inherit, + size_t size) +{ + if (inherit->flags & ~BTRFS_QGROUP_INHERIT_FLAGS_SUPP) + return -EOPNOTSUPP; + if (size < sizeof(*inherit) || size > PAGE_SIZE) + return -EINVAL; + + /* + * In the past we allowed btrfs_qgroup_inherit to specify to copy + * rfer/excl numbers directly from other qgroups. This behavior has + * been disabled in userspace for a very long time, but here we should + * also disable it in kernel, as this behavior is known to mark qgroup + * inconsistent, and a rescan would wipe out the changes anyway. + * + * Reject any btrfs_qgroup_inherit with num_ref_copies or num_excl_copies. 
+ */ + if (inherit->num_ref_copies > 0 || inherit->num_excl_copies > 0) + return -EINVAL; + + if (inherit->num_qgroups > PAGE_SIZE) + return -EINVAL; + + if (size != struct_size(inherit, qgroups, inherit->num_qgroups)) + return -EINVAL; + + /* + * Now check all the remaining qgroups, they should all: + * + * - Exist + * - Be higher level qgroups. + */ + for (int i = 0; i < inherit->num_qgroups; i++) { + struct btrfs_qgroup *qgroup; + u64 qgroupid = inherit->qgroups[i]; + + if (btrfs_qgroup_level(qgroupid) == 0) + return -EINVAL; + + spin_lock(&fs_info->qgroup_lock); + qgroup = find_qgroup_rb(fs_info, qgroupid); + if (!qgroup) { + spin_unlock(&fs_info->qgroup_lock); + return -ENOENT; + } + spin_unlock(&fs_info->qgroup_lock); + } + return 0; +} + static int qgroup_auto_inherit(struct btrfs_fs_info *fs_info, u64 inode_rootid, struct btrfs_qgroup_inherit **inherit) @@ -3089,6 +3138,62 @@ static int qgroup_auto_inherit(struct btrfs_fs_info *fs_info, return 0; } +/* + * Check if we can skip rescan when inheriting qgroups. If @src has a single + * @parent, and that @parent is owning all its bytes exclusively, we can skip + * the full rescan, by just adding nodesize to the @parent's excl/rfer. + * + * Return <0 for fatal errors (like srcid/parentid has no qgroup). + * Return 0 if a quick inherit is done. + * Return >0 if a quick inherit is not possible, and a full rescan is needed. + */ +static int qgroup_snapshot_quick_inherit(struct btrfs_fs_info *fs_info, + u64 srcid, u64 parentid) +{ + struct btrfs_qgroup *src; + struct btrfs_qgroup *parent; + struct btrfs_qgroup_list *list; + int nr_parents = 0; + + src = find_qgroup_rb(fs_info, srcid); + if (!src) + return -ENOENT; + parent = find_qgroup_rb(fs_info, parentid); + if (!parent) + return -ENOENT; + + /* + * Source has no parent qgroup, but our new qgroup would have one. + * Qgroup numbers would become inconsistent. + */ + if (list_empty(&src->groups)) + return 1; + + list_for_each_entry(list, &src->groups, next_group) { + /* The parent is not the same, quick update is not possible. */ + if (list->group->qgroupid != parentid) + return 1; + nr_parents++; + /* + * More than one parent qgroup, we can't be sure about accounting + * consistency. + */ + if (nr_parents > 1) + return 1; + } + + /* + * The parent is not exclusively owning all its bytes. We're not sure + * if the source has any bytes not fully owned by the parent. + */ + if (parent->excl != parent->rfer) + return 1; + + parent->excl += fs_info->nodesize; + parent->rfer += fs_info->nodesize; + return 0; +} + /* * Copy the accounting information between qgroups. This is necessary * when a snapshot or a subvolume is created. Throwing an error will @@ -3257,6 +3362,13 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, qgroup_dirty(fs_info, dstgroup); qgroup_dirty(fs_info, srcgroup); + + /* + * If the source qgroup has parent but the new one doesn't, + * we need a full rescan. + */ + if (!inherit && !list_empty(&srcgroup->groups)) + need_rescan = true; } if (!inherit) @@ -3271,14 +3383,16 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, if (ret) goto unlock; } + if (srcid) { + /* Check if we can do a quick inherit. */ + ret = qgroup_snapshot_quick_inherit(fs_info, srcid, *i_qgroups); + if (ret < 0) + goto unlock; + if (ret > 0) + need_rescan = true; + ret = 0; + } ++i_qgroups; - - /* - * If we're doing a snapshot, and adding the snapshot to a new - * qgroup, the numbers are guaranteed to be incorrect. 
- */ - if (srcid) - need_rescan = true; } for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) { diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index be18c862e64ede..706640be0ec24a 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -6,12 +6,22 @@ #ifndef BTRFS_QGROUP_H #define BTRFS_QGROUP_H +#include #include #include #include -#include "ulist.h" -#include "delayed-ref.h" -#include "misc.h" +#include +#include + +struct extent_buffer; +struct extent_changeset; +struct btrfs_delayed_extent_op; +struct btrfs_fs_info; +struct btrfs_root; +struct btrfs_ioctl_quota_ctl_args; +struct btrfs_trans_handle; +struct btrfs_delayed_ref_root; +struct btrfs_inode; /* * Btrfs qgroup overview @@ -321,7 +331,6 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, struct btrfs_qgroup_limit *limit); int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); -struct btrfs_delayed_extent_op; int btrfs_qgroup_trace_extent_nolock( struct btrfs_fs_info *fs_info, @@ -341,6 +350,9 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, struct ulist *new_roots); int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans); int btrfs_run_qgroups(struct btrfs_trans_handle *trans); +int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_inherit *inherit, + size_t size); int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, u64 objectid, u64 inode_rootid, struct btrfs_qgroup_inherit *inherit); diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 9589362acfbf9e..6af6b4b9a32ef9 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -11,7 +11,6 @@ #include "disk-io.h" #include "raid-stripe-tree.h" #include "volumes.h" -#include "misc.h" #include "print-tree.h" int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length) diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h index cdb58b38fcb5ea..c9c258f8490374 100644 --- a/fs/btrfs/raid-stripe-tree.h +++ b/fs/btrfs/raid-stripe-tree.h @@ -6,6 +6,10 @@ #ifndef BTRFS_RAID_STRIPE_TREE_H #define BTRFS_RAID_STRIPE_TREE_H +#include +#include +#include "fs.h" + #define BTRFS_RST_SUPP_BLOCK_GROUP_MASK (BTRFS_BLOCK_GROUP_DUP | \ BTRFS_BLOCK_GROUP_RAID1_MASK | \ BTRFS_BLOCK_GROUP_RAID0 | \ @@ -13,6 +17,7 @@ struct btrfs_io_context; struct btrfs_io_stripe; +struct btrfs_fs_info; struct btrfs_ordered_extent; struct btrfs_trans_handle; diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 792c8e17c31d76..6f4a9cfeea44a3 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -14,7 +14,6 @@ #include #include #include "messages.h" -#include "misc.h" #include "ctree.h" #include "disk-io.h" #include "volumes.h" @@ -918,6 +917,13 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, */ ASSERT(stripe_nsectors <= BITS_PER_LONG); + /* + * Real stripes must be between 2 (2 disks RAID5, aka RAID1) and 256 + * (limited by u8). 
+ */ + ASSERT(real_stripes >= 2); + ASSERT(real_stripes <= U8_MAX); + rbio = kzalloc(sizeof(*rbio), GFP_NOFS); if (!rbio) return ERR_PTR(-ENOMEM); @@ -955,6 +961,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, ASSERT(btrfs_nr_parity_stripes(bioc->map_type)); rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type); + ASSERT(rbio->nr_data > 0); return rbio; } @@ -1181,6 +1188,26 @@ static inline void bio_list_put(struct bio_list *bio_list) bio_put(bio); } +static void assert_rbio(struct btrfs_raid_bio *rbio) +{ + if (!IS_ENABLED(CONFIG_BTRFS_DEBUG) || + !IS_ENABLED(CONFIG_BTRFS_ASSERT)) + return; + + /* + * At least two stripes (2 disks RAID5), and since real_stripes is U8, + * we won't go beyond 256 disks anyway. + */ + ASSERT(rbio->real_stripes >= 2); + ASSERT(rbio->nr_data > 0); + + /* + * This is another check to make sure nr data stripes is smaller + * than total stripes. + */ + ASSERT(rbio->nr_data < rbio->real_stripes); +} + /* Generate PQ for one vertical stripe. */ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) { @@ -1212,6 +1239,7 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; + assert_rbio(rbio); raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, pointers); } else { @@ -2473,6 +2501,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) } if (has_qstripe) { + assert_rbio(rbio); /* RAID6, call the library function to fill in our P/Q */ raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, pointers); diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 470213688872ec..0d7b4c2fb6ae80 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -7,9 +7,18 @@ #ifndef BTRFS_RAID56_H #define BTRFS_RAID56_H +#include +#include +#include +#include +#include #include #include "volumes.h" +struct page; +struct sector_ptr; +struct btrfs_fs_info; + enum btrfs_rbio_ops { BTRFS_RBIO_WRITE, BTRFS_RBIO_READ_REBUILD, diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h index 5c2b66d155ef72..1c2d7cb1fe6f63 100644 --- a/fs/btrfs/rcu-string.h +++ b/fs/btrfs/rcu-string.h @@ -6,6 +6,12 @@ #ifndef BTRFS_RCU_STRING_H #define BTRFS_RCU_STRING_H +#include +#include +#include +#include +#include + struct rcu_string { struct rcu_head rcu; char str[]; diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h index 855de37719b546..3511e1a5c96ba9 100644 --- a/fs/btrfs/ref-verify.h +++ b/fs/btrfs/ref-verify.h @@ -6,7 +6,16 @@ #ifndef BTRFS_REF_VERIFY_H #define BTRFS_REF_VERIFY_H +#include +#include + +struct btrfs_fs_info; +struct btrfs_ref; + #ifdef CONFIG_BTRFS_FS_REF_VERIFY + +#include + int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info); void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info); int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index ae90894dc7dc7d..08d0fb46ceec4d 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -174,7 +174,7 @@ static int clone_copy_inline_extent(struct inode *dst, char *inline_data, struct btrfs_trans_handle **trans_out) { - struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(dst); struct btrfs_root *root = BTRFS_I(dst)->root; const u64 aligned_end = ALIGN(new_key->offset + datal, fs_info->sectorsize); @@ -337,7 +337,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, const u64 off, const u64 olen, const u64 olen_aligned, const u64 destoff, 
int no_time_update) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_path *path = NULL; struct extent_buffer *leaf; struct btrfs_trans_handle *trans; @@ -663,7 +663,7 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, struct inode *dst, u64 dst_loff) { struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info; - const u64 bs = fs_info->sb->s_blocksize; + const u64 bs = fs_info->sectorsize; int ret; /* @@ -726,11 +726,11 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, { struct inode *inode = file_inode(file); struct inode *src = file_inode(file_src); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); int ret; int wb_ret; u64 len = olen; - u64 bs = fs_info->sb->s_blocksize; + u64 bs = fs_info->sectorsize; /* * VFS's generic_remap_file_range_prep() protects us from cloning the @@ -796,7 +796,7 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); - u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize; + u64 bs = BTRFS_I(inode_out)->root->fs_info->sectorsize; u64 wb_len; int ret; diff --git a/fs/btrfs/reflink.h b/fs/btrfs/reflink.h index ecb309b4dad0fc..1e291f7d85c428 100644 --- a/fs/btrfs/reflink.h +++ b/fs/btrfs/reflink.h @@ -3,7 +3,9 @@ #ifndef BTRFS_REFLINK_H #define BTRFS_REFLINK_H -#include +#include + +struct file; loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index abe594f77f99c0..3a7f47ad4bf4d7 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -523,7 +523,8 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree( if (handle_useless_nodes(rc, node)) node = NULL; out: - btrfs_backref_iter_free(iter); + btrfs_free_path(iter->path); + kfree(iter); btrfs_free_path(path); if (err) { btrfs_backref_error_cleanup(cache, node); @@ -2849,7 +2850,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( * btrfs_do_readpage() call of previously relocated file cluster. * * If the current cluster starts in the above range, btrfs_do_readpage() - * will skip the read, and relocate_one_page() will later writeback + * will skip the read, and relocate_one_folio() will later writeback * the padding zeros as new data, causing data corruption. * * Here we have to manually invalidate the range (i_size, PAGE_END + 1). @@ -2858,7 +2859,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( struct address_space *mapping = inode->vfs_inode.i_mapping; struct btrfs_fs_info *fs_info = inode->root->fs_info; const u32 sectorsize = fs_info->sectorsize; - struct page *page; + struct folio *folio; ASSERT(sectorsize < PAGE_SIZE); ASSERT(IS_ALIGNED(i_size, sectorsize)); @@ -2889,16 +2890,16 @@ static noinline_for_stack int prealloc_file_extent_cluster( clear_extent_bits(&inode->io_tree, i_size, round_up(i_size, PAGE_SIZE) - 1, EXTENT_UPTODATE); - page = find_lock_page(mapping, i_size >> PAGE_SHIFT); + folio = filemap_lock_folio(mapping, i_size >> PAGE_SHIFT); /* * If page is freed we don't need to do anything then, as we * will re-read the whole page anyway. 
*/ - if (page) { - btrfs_subpage_clear_uptodate(fs_info, page_folio(page), i_size, + if (!IS_ERR(folio)) { + btrfs_subpage_clear_uptodate(fs_info, folio, i_size, round_up(i_size, PAGE_SIZE) - i_size); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } } @@ -2983,68 +2984,71 @@ static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster, return cluster->boundary[cluster_nr + 1] - 1; } -static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, +static int relocate_one_folio(struct inode *inode, struct file_ra_state *ra, const struct file_extent_cluster *cluster, - int *cluster_nr, unsigned long page_index) + int *cluster_nr, unsigned long index) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); u64 offset = BTRFS_I(inode)->index_cnt; const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); - struct page *page; - u64 page_start; - u64 page_end; + struct folio *folio; + u64 folio_start; + u64 folio_end; u64 cur; int ret; - ASSERT(page_index <= last_index); - page = find_lock_page(inode->i_mapping, page_index); - if (!page) { + ASSERT(index <= last_index); + folio = filemap_lock_folio(inode->i_mapping, index); + if (IS_ERR(folio)) { page_cache_sync_readahead(inode->i_mapping, ra, NULL, - page_index, last_index + 1 - page_index); - page = find_or_create_page(inode->i_mapping, page_index, mask); - if (!page) - return -ENOMEM; + index, last_index + 1 - index); + folio = __filemap_get_folio(inode->i_mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); + if (IS_ERR(folio)) + return PTR_ERR(folio); } - if (PageReadahead(page)) + WARN_ON(folio_order(folio)); + + if (folio_test_readahead(folio)) page_cache_async_readahead(inode->i_mapping, ra, NULL, - page_folio(page), page_index, - last_index + 1 - page_index); + folio, index, + last_index + 1 - index); - if (!PageUptodate(page)) { - btrfs_read_folio(NULL, page_folio(page)); - lock_page(page); - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { + btrfs_read_folio(NULL, folio); + folio_lock(folio); + if (!folio_test_uptodate(folio)) { ret = -EIO; - goto release_page; + goto release_folio; } } /* - * We could have lost page private when we dropped the lock to read the - * page above, make sure we set_page_extent_mapped here so we have any + * We could have lost folio private when we dropped the lock to read the + * folio above, make sure we set_page_extent_mapped here so we have any * of the subpage blocksize stuff we need in place. */ - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) - goto release_page; + goto release_folio; - page_start = page_offset(page); - page_end = page_start + PAGE_SIZE - 1; + folio_start = folio_pos(folio); + folio_end = folio_start + PAGE_SIZE - 1; /* * Start from the cluster, as for subpage case, the cluster can start - * inside the page. + * inside the folio. 
*/ - cur = max(page_start, cluster->boundary[*cluster_nr] - offset); - while (cur <= page_end) { + cur = max(folio_start, cluster->boundary[*cluster_nr] - offset); + while (cur <= folio_end) { struct extent_state *cached_state = NULL; u64 extent_start = cluster->boundary[*cluster_nr] - offset; u64 extent_end = get_cluster_boundary_end(cluster, *cluster_nr) - offset; - u64 clamped_start = max(page_start, extent_start); - u64 clamped_end = min(page_end, extent_end); + u64 clamped_start = max(folio_start, extent_start); + u64 clamped_end = min(folio_end, extent_end); u32 clamped_len = clamped_end + 1 - clamped_start; /* Reserve metadata for this range */ @@ -3052,7 +3056,7 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, clamped_len, clamped_len, false); if (ret) - goto release_page; + goto release_folio; /* Mark the range delalloc and dirty for later writeback */ lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, @@ -3068,20 +3072,18 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, clamped_len, true); btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len); - goto release_page; + goto release_folio; } - btrfs_folio_set_dirty(fs_info, page_folio(page), - clamped_start, clamped_len); + btrfs_folio_set_dirty(fs_info, folio, clamped_start, clamped_len); /* - * Set the boundary if it's inside the page. + * Set the boundary if it's inside the folio. * Data relocation requires the destination extents to have the * same size as the source. * EXTENT_BOUNDARY bit prevents current extent from being merged * with previous extent. */ - if (in_range(cluster->boundary[*cluster_nr] - offset, - page_start, PAGE_SIZE)) { + if (in_range(cluster->boundary[*cluster_nr] - offset, folio_start, PAGE_SIZE)) { u64 boundary_start = cluster->boundary[*cluster_nr] - offset; u64 boundary_end = boundary_start + @@ -3104,8 +3106,8 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, break; } } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); balance_dirty_pages_ratelimited(inode->i_mapping); btrfs_throttle(fs_info); @@ -3113,9 +3115,9 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, ret = -ECANCELED; return ret; -release_page: - unlock_page(page); - put_page(page); +release_folio: + folio_unlock(folio); + folio_put(folio); return ret; } @@ -3150,7 +3152,7 @@ static int relocate_file_extent_cluster(struct inode *inode, last_index = (cluster->end - offset) >> PAGE_SHIFT; for (index = (cluster->start - offset) >> PAGE_SHIFT; index <= last_index && !ret; index++) - ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index); + ret = relocate_one_folio(inode, ra, cluster, &cluster_nr, index); if (ret == 0) WARN_ON(cluster_nr != cluster->nr); out: diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h index 5fb60f2deb5305..788c86d8633aff 100644 --- a/fs/btrfs/relocation.h +++ b/fs/btrfs/relocation.h @@ -3,6 +3,15 @@ #ifndef BTRFS_RELOCATION_H #define BTRFS_RELOCATION_H +#include + +struct extent_buffer; +struct btrfs_fs_info; +struct btrfs_root; +struct btrfs_trans_handle; +struct btrfs_ordered_extent; +struct btrfs_pending_snapshot; + int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start); int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 603ad1459368c3..4bb538a372ce56 
100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -10,7 +10,6 @@ #include "messages.h" #include "transaction.h" #include "disk-io.h" -#include "print-tree.h" #include "qgroup.h" #include "space-info.h" #include "accessors.h" @@ -82,7 +81,14 @@ int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key, if (ret > 0) goto out; } else { - BUG_ON(ret == 0); /* Logical error */ + /* + * Key with offset -1 found, there would have to exist a root + * with such id, but this is out of the valid range. + */ + if (ret == 0) { + ret = -EUCLEAN; + goto out; + } if (path->slots[0] == 0) goto out; path->slots[0]--; @@ -323,8 +329,11 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, key, path, -1, 1); if (ret < 0) goto out; - - BUG_ON(ret != 0); + if (ret != 0) { + /* The root must exist but we did not find it by the key. */ + ret = -EUCLEAN; + goto out; + } ret = btrfs_del_item(trans, root, path); out: diff --git a/fs/btrfs/root-tree.h b/fs/btrfs/root-tree.h index 8b2c3859e4647a..6f929cf3bd4967 100644 --- a/fs/btrfs/root-tree.h +++ b/fs/btrfs/root-tree.h @@ -3,7 +3,17 @@ #ifndef BTRFS_ROOT_TREE_H #define BTRFS_ROOT_TREE_H +#include + struct fscrypt_str; +struct extent_buffer; +struct btrfs_key; +struct btrfs_root; +struct btrfs_root_item; +struct btrfs_path; +struct btrfs_fs_info; +struct btrfs_block_rsv; +struct btrfs_trans_handle; int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 0123d272892373..c4bd0e60db5925 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1390,8 +1390,15 @@ static int find_first_extent_item(struct btrfs_root *extent_root, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; + if (ret == 0) { + /* + * Key with offset -1 found, there would have to exist an extent + * item with such offset, but this is out of the valid range. + */ + btrfs_release_path(path); + return -EUCLEAN; + } - ASSERT(ret > 0); /* * Here we intentionally pass 0 as @min_objectid, as there could be * an extent item starting before @search_start. 
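Both the root-tree.c and scrub.c hunks above replace a BUG_ON()/ASSERT() with an -EUCLEAN return for the same situation: a lookup keyed with offset (u64)-1 unexpectedly hitting an exact match. They rely on the usual btrfs_search_slot() convention (negative on error, 0 on an exact match, positive when the path points just past where the key would sit). A minimal sketch of that convention, using a hypothetical helper that is not part of the patch and assumes the in-tree btrfs_path/btrfs_key API::

	/* Find the last item sorting before @key, whose offset is (u64)-1. */
	static int find_item_before(struct btrfs_root *root,
				    const struct btrfs_key *key,
				    struct btrfs_path *path)
	{
		int ret;

		ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
		if (ret < 0)
			return ret;
		if (ret == 0)
			/* Exact match on an impossible key: corrupted tree. */
			return -EUCLEAN;
		if (path->slots[0] == 0)
			/* Nothing sorts before @key in this tree. */
			return -ENOENT;
		/* Step back to the last item smaller than @key. */
		path->slots[0]--;
		return 0;
	}

The detection itself is unchanged; instead of crashing the kernel on a corrupted tree, the callers now propagate -EUCLEAN and let the normal error paths unwind.
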
diff --git a/fs/btrfs/scrub.h b/fs/btrfs/scrub.h index 7639103ebf9df3..f0df597b75c7c7 100644 --- a/fs/btrfs/scrub.h +++ b/fs/btrfs/scrub.h @@ -3,6 +3,12 @@ #ifndef BTRFS_SCRUB_H #define BTRFS_SCRUB_H +#include + +struct btrfs_fs_info; +struct btrfs_device; +struct btrfs_scrub_progress; + int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, int readonly, int is_dev_replace); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index e48a063ef0851f..4ee27daf18a7bf 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -25,7 +25,6 @@ #include "btrfs_inode.h" #include "transaction.h" #include "compression.h" -#include "xattr.h" #include "print-tree.h" #include "accessors.h" #include "dir-item.h" @@ -777,7 +776,12 @@ static int begin_cmd(struct send_ctx *sctx, int cmd) if (WARN_ON(!sctx->send_buf)) return -EINVAL; - BUG_ON(sctx->send_size); + if (unlikely(sctx->send_size != 0)) { + btrfs_err(sctx->send_root->fs_info, + "send: command header buffer not empty cmd %d offset %llu", + cmd, sctx->send_off); + return -EINVAL; + } sctx->send_size += sizeof(*hdr); hdr = (struct btrfs_cmd_header *)sctx->send_buf; @@ -1070,7 +1074,15 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, ret = PTR_ERR(start); goto out; } - BUG_ON(start < p->buf); + if (unlikely(start < p->buf)) { + btrfs_err(root->fs_info, + "send: path ref buffer underflow for key (%llu %u %llu)", + found_key->objectid, + found_key->type, + found_key->offset); + ret = -EINVAL; + goto out; + } } p->start = start; } else { @@ -1406,7 +1418,7 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, struct btrfs_lru_cache_entry *raw_entry; struct backref_cache_entry *entry; - if (btrfs_lru_cache_size(&sctx->backref_cache) == 0) + if (sctx->backref_cache.size == 0) return false; /* @@ -1504,7 +1516,7 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, * transaction handle or holding fs_info->commit_root_sem, so no need * to take any lock here. */ - if (btrfs_lru_cache_size(&sctx->backref_cache) == 1) + if (sctx->backref_cache.size == 1) sctx->backref_cache_last_reloc_trans = fs_info->last_reloc_trans; } @@ -2809,8 +2821,7 @@ static int cache_dir_utimes(struct send_ctx *sctx, u64 dir, u64 gen) static int trim_dir_utimes_cache(struct send_ctx *sctx) { - while (btrfs_lru_cache_size(&sctx->dir_utimes_cache) > - SEND_MAX_DIR_UTIMES_CACHE_SIZE) { + while (sctx->dir_utimes_cache.size > SEND_MAX_DIR_UTIMES_CACHE_SIZE) { struct btrfs_lru_cache_entry *lru; int ret; @@ -4182,7 +4193,13 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * This should never happen as the root dir always has the same ref * which is always '..' 
*/ - BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID); + if (unlikely(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID)) { + btrfs_err(fs_info, + "send: unexpected inode %llu in process_recorded_refs()", + sctx->cur_ino); + ret = -EINVAL; + goto out; + } valid_path = fs_path_alloc(); if (!valid_path) { @@ -5257,10 +5274,11 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) { struct btrfs_root *root = sctx->send_root; struct btrfs_fs_info *fs_info = root->fs_info; - struct page *page; + struct folio *folio; pgoff_t index = offset >> PAGE_SHIFT; pgoff_t last_index; unsigned pg_offset = offset_in_page(offset); + struct address_space *mapping = sctx->cur_inode->i_mapping; int ret; ret = put_data_header(sctx, len); @@ -5273,44 +5291,45 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) unsigned cur_len = min_t(unsigned, len, PAGE_SIZE - pg_offset); - page = find_lock_page(sctx->cur_inode->i_mapping, index); - if (!page) { - page_cache_sync_readahead(sctx->cur_inode->i_mapping, + folio = filemap_lock_folio(mapping, index); + if (IS_ERR(folio)) { + page_cache_sync_readahead(mapping, &sctx->ra, NULL, index, last_index + 1 - index); - page = find_or_create_page(sctx->cur_inode->i_mapping, - index, GFP_KERNEL); - if (!page) { - ret = -ENOMEM; + folio = filemap_grab_folio(mapping, index); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); break; } } - if (PageReadahead(page)) - page_cache_async_readahead(sctx->cur_inode->i_mapping, - &sctx->ra, NULL, page_folio(page), + WARN_ON(folio_order(folio)); + + if (folio_test_readahead(folio)) + page_cache_async_readahead(mapping, + &sctx->ra, NULL, folio, index, last_index + 1 - index); - if (!PageUptodate(page)) { - btrfs_read_folio(NULL, page_folio(page)); - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); + if (!folio_test_uptodate(folio)) { + btrfs_read_folio(NULL, folio); + folio_lock(folio); + if (!folio_test_uptodate(folio)) { + folio_unlock(folio); btrfs_err(fs_info, "send: IO error at offset %llu for inode %llu root %llu", - page_offset(page), sctx->cur_ino, + folio_pos(folio), sctx->cur_ino, sctx->send_root->root_key.objectid); - put_page(page); + folio_put(folio); ret = -EIO; break; } } - memcpy_from_page(sctx->send_buf + sctx->send_size, page, + memcpy_from_folio(sctx->send_buf + sctx->send_size, folio, pg_offset, cur_len); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); index++; pg_offset = 0; len -= cur_len; @@ -6140,7 +6159,7 @@ static int send_write_or_clone(struct send_ctx *sctx, int ret = 0; u64 offset = key->offset; u64 end; - u64 bs = sctx->send_root->fs_info->sb->s_blocksize; + u64 bs = sctx->send_root->fs_info->sectorsize; end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size); if (offset >= end) @@ -6458,21 +6477,18 @@ static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, if (sctx->cur_ino != key->objectid || !need_send_hole(sctx)) return 0; - if (sctx->cur_inode_last_extent == (u64)-1) { - ret = get_last_extent(sctx, key->offset - 1); - if (ret) - return ret; - } - - if (path->slots[0] == 0 && - sctx->cur_inode_last_extent < key->offset) { - /* - * We might have skipped entire leafs that contained only - * file extent items for our current inode. These leafs have - * a generation number smaller (older) than the one in the - * current leaf and the leaf our last extent came from, and - * are located between these 2 leafs. 
- */ + /* + * Get last extent's end offset (exclusive) if we haven't determined it + * yet (we're processing the first file extent item that is new), or if + * we're at the first slot of a leaf and the last extent's end is less + * than the current extent's offset, because we might have skipped + * entire leaves that contained only file extent items for our current + * inode. These leaves have a generation number smaller (older) than the + * one in the current leaf and the leaf our last extent came from, and + * are located between these 2 leaves. + */ + if ((sctx->cur_inode_last_extent == (u64)-1) || + (path->slots[0] == 0 && sctx->cur_inode_last_extent < key->offset)) { ret = get_last_extent(sctx, key->offset - 1); if (ret) return ret; @@ -7429,8 +7445,8 @@ static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen u64 reada_done = 0; lockdep_assert_held_read(&parent->fs_info->commit_root_sem); + ASSERT(*level != 0); - BUG_ON(*level == 0); eb = btrfs_read_node_slot(parent, slot); if (IS_ERR(eb)) return PTR_ERR(eb); diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 4f5509cb180358..dd1c9f02b01118 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -8,6 +8,11 @@ #define BTRFS_SEND_H #include +#include +#include + +struct inode; +struct btrfs_ioctl_send_args; #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" /* Conditional support for the upcoming protocol version. */ @@ -25,9 +30,6 @@ #define BTRFS_SEND_BUF_SIZE_V1 SZ_64K #define BTRFS_SEND_BUF_SIZE_V2 ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE) -struct inode; -struct btrfs_ioctl_send_args; - enum btrfs_tlv_type { BTRFS_TLV_U8, BTRFS_TLV_U16, diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 3b54eb5834746b..74bf59a303bccb 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include "misc.h" #include "ctree.h" #include "space-info.h" @@ -9,7 +10,6 @@ #include "ordered-data.h" #include "transaction.h" #include "block-group.h" -#include "zoned.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" @@ -191,6 +191,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) */ #define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75) +#define BTRFS_DYNAMIC_RECLAIM_THRESH_MAX (90) + /* * Calculate chunk size depending on volume type (regular or zoned). */ @@ -233,6 +235,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) if (!space_info) return -ENOMEM; + space_info->fs_info = info; for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) INIT_LIST_HEAD(&space_info->block_groups[i]); init_rwsem(&space_info->groups_sem); @@ -1869,3 +1872,164 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) return free_bytes; } + +static u64 calc_pct_ratio(u64 x, u64 y) +{ + int err; + + if (!y) + return 0; +again: + err = check_mul_overflow(100, x, &x); + if (err) + goto lose_precision; + return div64_u64(x, y); +lose_precision: + x >>= 10; + y >>= 10; + if (!y) + y = 1; + goto again; +} + +/* + * The dynamic threshold formula is: + * (unused / allocated) * (unused / unallocated) or equivalently + * unused^2 / (allocated * unallocated) + * + * The fundamental goal of automatic reclaim is to protect the filesystem's + * unallocated space and thus minimize the probability of the filesystem going + * read only when a metadata allocation failure causes a transaction abort. 
+ * + * However, relocations happen into the space_info's unused space, therefore + * automatic reclaim must also back off as that space runs low. There is no + * value in doing trivial "relocations" of re-writing the same block group + * into a fresh one. + * + * unused / allocated sets a baseline, very conservative threshold which + * properly goes to 0 as unused goes to a small portion of the allocated space. + * + * On its own, this would likely do very little reclaim, so include + * unused / unallocated (which can be greatly in excess of 100%) to bias heavily + * towards reclaim when unallocated goes low or unused goes high. + */ + +static int calc_dynamic_reclaim_threshold(struct btrfs_space_info *space_info) +{ + struct btrfs_fs_info *fs_info = space_info->fs_info; + u64 unalloc = atomic64_read(&fs_info->free_chunk_space); + u64 alloc = space_info->total_bytes; + u64 used = btrfs_space_info_used(space_info, false); + u64 unused = alloc - used; + /* unused <= alloc; clamped to 100 */ + int unused_pct = calc_pct_ratio(unused, alloc); + u64 unused_unalloc_ratio = calc_pct_ratio(unused, unalloc); + int err; + u64 thresh; + + err = check_mul_overflow(unused_pct, unused_unalloc_ratio, &thresh); + if (err) + return BTRFS_DYNAMIC_RECLAIM_THRESH_MAX; + /* Both quantities are percentages; remove the squared factor of 100. */ + thresh = div64_u64(thresh, 100); + return clamp_val(thresh, 0, BTRFS_DYNAMIC_RECLAIM_THRESH_MAX); +} + +int btrfs_calc_reclaim_threshold(struct btrfs_space_info *space_info) +{ + lockdep_assert_held(&space_info->lock); + + if (READ_ONCE(space_info->dynamic_reclaim)) + return calc_dynamic_reclaim_threshold(space_info); + return READ_ONCE(space_info->bg_reclaim_threshold); +} + +/* + * Under "urgent" reclaim, we will reclaim even fresh block groups that have + * recently seen successful allocations, as we are desperate to reclaim + * whatever we can to avoid ENOSPC in a transaction leading to a readonly fs. + */ +static bool is_reclaim_urgent(struct btrfs_space_info *space_info) +{ + struct btrfs_fs_info *fs_info = space_info->fs_info; + u64 unalloc = atomic64_read(&fs_info->free_chunk_space); + u64 chunk_size = min(READ_ONCE(space_info->chunk_size), SZ_1G); + + return unalloc < chunk_size; +} + +static int do_reclaim_sweep(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, int raid) +{ + struct btrfs_block_group *bg; + int thresh_pct; + bool try_again = true; + bool urgent; + + spin_lock(&space_info->lock); + if (space_info->periodic_reclaim_ready) { + space_info->periodic_reclaim_ready = false; + } else { + spin_unlock(&space_info->lock); + return 0; + } + urgent = is_reclaim_urgent(space_info); + thresh_pct = btrfs_calc_reclaim_threshold(space_info); + spin_unlock(&space_info->lock); + + down_read(&space_info->groups_sem); +again: + list_for_each_entry(bg, &space_info->block_groups[raid], list) { + u64 thresh; + bool reclaim = false; + + btrfs_get_block_group(bg); + spin_lock(&bg->lock); + thresh = mult_perc(bg->length, thresh_pct); + if (bg->used < thresh && bg->reclaim_mark) { + try_again = false; + reclaim = true; + } + bg->reclaim_mark++; + spin_unlock(&bg->lock); + if (reclaim) + btrfs_mark_bg_to_reclaim(bg); + btrfs_put_block_group(bg); + } + + /* + * In situations where we are very motivated to reclaim (low unalloc) + * use two passes to make the reclaim mark check best effort. + * + * If we have any staler groups, we don't touch the fresher ones, but if we + * really need a block group, do take a fresh one. 
+ */ + if (try_again && urgent) { + try_again = false; + goto again; + } + + up_read(&space_info->groups_sem); + return 0; +} + +int btrfs_reclaim_sweep(struct btrfs_fs_info *fs_info) +{ + int ret; + int raid; + struct btrfs_space_info *space_info; + + list_for_each_entry(space_info, &fs_info->space_info, list) { + if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM) + continue; + if (!READ_ONCE(space_info->periodic_reclaim)) + continue; + for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) { + ret = do_reclaim_sweep(fs_info, space_info, raid); + if (ret) + return ret; + } + } + + return ret; +} diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 92c595fed1b0a6..739f5953a2a59c 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -4,8 +4,17 @@ #define BTRFS_SPACE_INFO_H #include +#include +#include +#include +#include +#include +#include #include "volumes.h" +struct btrfs_fs_info; +struct btrfs_block_group; + /* * Different levels for to flush space when doing space reservations. * @@ -85,6 +94,7 @@ enum btrfs_flush_state { }; struct btrfs_space_info { + struct btrfs_fs_info *fs_info; spinlock_t lock; u64 total_bytes; /* total bytes in the space, @@ -156,6 +166,30 @@ struct btrfs_space_info { struct kobject kobj; struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES]; + + /* + * Monotonically increasing counter of relocated block groups. + * Exposed in /sys/fs//allocation//reclaim_count + */ + u64 reclaim_count; + + /* + * If true, use the dynamic relocation threshold, instead of the + * fixed bg_reclaim_threshold. + */ + bool dynamic_reclaim; + + /* + * Periodically check all block groups against the reclaim + * threshold in the cleaner thread. + */ + bool periodic_reclaim; + + /* + * Periodic reclaim should be a no-op if a space_info hasn't + * freed any space since the last time we tried. 
+ */ + bool periodic_reclaim_ready; }; struct reserve_ticket { @@ -238,4 +272,7 @@ void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info); void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info); u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); +int btrfs_calc_reclaim_threshold(struct btrfs_space_info *space_info); +int btrfs_reclaim_sweep(struct btrfs_fs_info *fs_info); + #endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 0e49dab8dad248..54736f6238e659 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -111,6 +111,9 @@ void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sector subpage_info->checked_offset = cur; cur += nr_bits; + subpage_info->locked_offset = cur; + cur += nr_bits; + subpage_info->total_nr_bits = cur; } @@ -237,28 +240,58 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, start + len <= folio_pos(folio) + PAGE_SIZE); } +#define subpage_calc_start_bit(fs_info, folio, name, start, len) \ +({ \ + unsigned int start_bit; \ + \ + btrfs_subpage_assert(fs_info, folio, start, len); \ + start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \ + start_bit += fs_info->subpage_info->name##_offset; \ + start_bit; \ +}) + void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage = folio_get_private(folio); + const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); const int nbits = len >> fs_info->sectorsize_bits; + unsigned long flags; + btrfs_subpage_assert(fs_info, folio, start, len); + spin_lock_irqsave(&subpage->lock, flags); + /* + * Even though it's just for reading the page, no one should have + * locked the subpage range. + */ + ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); + bitmap_set(subpage->bitmaps, start_bit, nbits); atomic_add(nbits, &subpage->readers); + spin_unlock_irqrestore(&subpage->lock, flags); } void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage = folio_get_private(folio); + const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); const int nbits = len >> fs_info->sectorsize_bits; + unsigned long flags; bool is_data; bool last; btrfs_subpage_assert(fs_info, folio, start, len); is_data = is_data_inode(folio->mapping->host); + + spin_lock_irqsave(&subpage->lock, flags); + + /* The range should have already been locked. 
*/ + ASSERT(bitmap_test_range_all_set(subpage->bitmaps, start_bit, nbits)); ASSERT(atomic_read(&subpage->readers) >= nbits); + + bitmap_clear(subpage->bitmaps, start_bit, nbits); last = atomic_sub_and_test(nbits, &subpage->readers); /* @@ -270,6 +303,7 @@ void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, */ if (is_data && last) folio_unlock(folio); + spin_unlock_irqrestore(&subpage->lock, flags); } static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) @@ -290,28 +324,38 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) orig_start + orig_len) - *start; } -void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) +static void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage = folio_get_private(folio); + const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); const int nbits = (len >> fs_info->sectorsize_bits); + unsigned long flags; int ret; btrfs_subpage_assert(fs_info, folio, start, len); + spin_lock_irqsave(&subpage->lock, flags); ASSERT(atomic_read(&subpage->readers) == 0); + ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); + bitmap_set(subpage->bitmaps, start_bit, nbits); ret = atomic_add_return(nbits, &subpage->writers); ASSERT(ret == nbits); + spin_unlock_irqrestore(&subpage->lock, flags); } -bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) +static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage = folio_get_private(folio); + const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); const int nbits = (len >> fs_info->sectorsize_bits); + unsigned long flags; + bool last; btrfs_subpage_assert(fs_info, folio, start, len); + spin_lock_irqsave(&subpage->lock, flags); /* * We have call sites passing @lock_page into * extent_clear_unlock_delalloc() for compression path. @@ -319,11 +363,18 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, * This @locked_page is locked by plain lock_page(), thus its * subpage::writers is 0. Handle them in a special way. */ - if (atomic_read(&subpage->writers) == 0) + if (atomic_read(&subpage->writers) == 0) { + spin_unlock_irqrestore(&subpage->lock, flags); return true; + } ASSERT(atomic_read(&subpage->writers) >= nbits); - return atomic_sub_and_test(nbits, &subpage->writers); + /* The target range should have been locked. 
*/ + ASSERT(bitmap_test_range_all_set(subpage->bitmaps, start_bit, nbits)); + bitmap_clear(subpage->bitmaps, start_bit, nbits); + last = atomic_sub_and_test(nbits, &subpage->writers); + spin_unlock_irqrestore(&subpage->lock, flags); + return last; } /* @@ -365,16 +416,6 @@ void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, folio_unlock(folio); } -#define subpage_calc_start_bit(fs_info, folio, name, start, len) \ -({ \ - unsigned int start_bit; \ - \ - btrfs_subpage_assert(fs_info, folio, start, len); \ - start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \ - start_bit += fs_info->subpage_info->name##_offset; \ - start_bit; \ -}) - #define subpage_test_bitmap_all_set(fs_info, subpage, name) \ bitmap_test_range_all_set(subpage->bitmaps, \ fs_info->subpage_info->name##_offset, \ @@ -751,6 +792,7 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, GET_SUBPAGE_BITMAP(subpage, subpage_info, writeback, &writeback_bitmap); GET_SUBPAGE_BITMAP(subpage, subpage_info, ordered, &ordered_bitmap); GET_SUBPAGE_BITMAP(subpage, subpage_info, checked, &checked_bitmap); + GET_SUBPAGE_BITMAP(subpage, subpage_info, locked, &checked_bitmap); spin_unlock_irqrestore(&subpage->lock, flags); dump_page(folio_page(folio, 0), "btrfs subpage dump"); diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 793c2b314a583a..b6dc013b0fdc9d 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -4,6 +4,11 @@ #define BTRFS_SUBPAGE_H #include +#include + +struct address_space; +struct folio; +struct btrfs_fs_info; /* * Extra info for subpapge bitmap. @@ -28,7 +33,7 @@ struct btrfs_subpage_info { unsigned int total_nr_bits; /* - * *_start indicates where the bitmap starts, the length is always + * *_offset indicates where the bitmap starts, the length is always * @bitmap_size, which is calculated from PAGE_SIZE / sectorsize. */ unsigned int uptodate_offset; @@ -36,6 +41,16 @@ struct btrfs_subpage_info { unsigned int writeback_offset; unsigned int ordered_offset; unsigned int checked_offset; + + /* + * For locked bitmaps, normally it's subpage representation for folio + * Locked flag, but metadata is different: + * + * - Metadata doesn't really lock the folio + * It's just to prevent page::private get cleared before the last + * end_page_read(). 
+ */ + unsigned int locked_offset; }; /* @@ -93,10 +108,6 @@ void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); -void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); -bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 101f786963d4d7..7e44ccaf348f26 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -34,13 +34,11 @@ #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "print-tree.h" #include "props.h" #include "xattr.h" #include "bio.h" #include "export.h" #include "compression.h" -#include "rcu-string.h" #include "dev-replace.h" #include "free-space-cache.h" #include "backref.h" @@ -1767,7 +1765,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = 0; buf->f_type = BTRFS_SUPER_MAGIC; - buf->f_bsize = dentry->d_sb->s_blocksize; + buf->f_bsize = fs_info->sectorsize; buf->f_namelen = BTRFS_NAME_LEN; /* We treat it as constant endianness (it doesn't matter _which_) @@ -2203,7 +2201,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, vol = memdup_user((void __user *)arg, sizeof(*vol)); if (IS_ERR(vol)) return PTR_ERR(vol); - vol->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args_path(vol); + if (ret < 0) + goto out; switch (cmd) { case BTRFS_IOC_SCAN_DEV: @@ -2245,6 +2245,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, break; } +out: kfree(vol); return ret; } diff --git a/fs/btrfs/super.h b/fs/btrfs/super.h index f18253ca280d3e..cbcab434b5ecb9 100644 --- a/fs/btrfs/super.h +++ b/fs/btrfs/super.h @@ -3,6 +3,13 @@ #ifndef BTRFS_SUPER_H #define BTRFS_SUPER_H +#include +#include +#include "fs.h" + +struct super_block; +struct btrfs_fs_info; + bool btrfs_check_options(struct btrfs_fs_info *info, unsigned long *mount_opt, unsigned long flags); int btrfs_sync_fs(struct super_block *sb, int wait); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 84c05246ffd8ad..40c6bca5ebcf5b 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -421,7 +421,7 @@ BTRFS_ATTR(static_feature, supported_sectorsizes, static ssize_t acl_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { - return sysfs_emit(buf, "%d\n", !!IS_ENABLED(CONFIG_BTRFS_FS_POSIX_ACL)); + return sysfs_emit(buf, "%d\n", IS_ENABLED(CONFIG_BTRFS_FS_POSIX_ACL)); } BTRFS_ATTR(static_feature, acl, acl_show); @@ -894,6 +894,7 @@ SPACE_INFO_ATTR(bytes_readonly); SPACE_INFO_ATTR(bytes_zone_unusable); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); +SPACE_INFO_ATTR(reclaim_count); BTRFS_ATTR_RW(space_info, chunk_size, btrfs_chunk_size_show, btrfs_chunk_size_store); BTRFS_ATTR(space_info, size_classes, btrfs_size_classes_show); @@ -902,8 +903,12 @@ static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj, char *buf) { struct btrfs_space_info *space_info = to_space_info(kobj); + ssize_t ret; - return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->bg_reclaim_threshold)); + spin_lock(&space_info->lock); + ret = sysfs_emit(buf, "%d\n", btrfs_calc_reclaim_threshold(space_info)); + 
spin_unlock(&space_info->lock); + return ret; } static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj, @@ -914,6 +919,9 @@ static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj, int thresh; int ret; + if (READ_ONCE(space_info->dynamic_reclaim)) + return -EINVAL; + ret = kstrtoint(buf, 10, &thresh); if (ret) return ret; @@ -930,6 +938,72 @@ BTRFS_ATTR_RW(space_info, bg_reclaim_threshold, btrfs_sinfo_bg_reclaim_threshold_show, btrfs_sinfo_bg_reclaim_threshold_store); +static ssize_t btrfs_sinfo_dynamic_reclaim_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + + return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->dynamic_reclaim)); +} + +static ssize_t btrfs_sinfo_dynamic_reclaim_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + int dynamic_reclaim; + int ret; + + ret = kstrtoint(buf, 10, &dynamic_reclaim); + if (ret) + return ret; + + if (dynamic_reclaim < 0) + return -EINVAL; + + WRITE_ONCE(space_info->dynamic_reclaim, dynamic_reclaim != 0); + + return len; +} + +BTRFS_ATTR_RW(space_info, dynamic_reclaim, + btrfs_sinfo_dynamic_reclaim_show, + btrfs_sinfo_dynamic_reclaim_store); + +static ssize_t btrfs_sinfo_periodic_reclaim_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + + return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->periodic_reclaim)); +} + +static ssize_t btrfs_sinfo_periodic_reclaim_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + int periodic_reclaim; + int ret; + + ret = kstrtoint(buf, 10, &periodic_reclaim); + if (ret) + return ret; + + if (periodic_reclaim < 0) + return -EINVAL; + + WRITE_ONCE(space_info->periodic_reclaim, periodic_reclaim != 0); + + return len; +} + +BTRFS_ATTR_RW(space_info, periodic_reclaim, + btrfs_sinfo_periodic_reclaim_show, + btrfs_sinfo_periodic_reclaim_store); + /* * Allocation information about block group types. * @@ -947,8 +1021,11 @@ static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(space_info, disk_used), BTRFS_ATTR_PTR(space_info, disk_total), BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold), + BTRFS_ATTR_PTR(space_info, dynamic_reclaim), BTRFS_ATTR_PTR(space_info, chunk_size), BTRFS_ATTR_PTR(space_info, size_classes), + BTRFS_ATTR_PTR(space_info, reclaim_count), + BTRFS_ATTR_PTR(space_info, periodic_reclaim), #ifdef CONFIG_BTRFS_DEBUG BTRFS_ATTR_PTR(space_info, force_chunk_alloc), #endif @@ -1228,11 +1305,12 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); + const enum btrfs_read_policy policy = READ_ONCE(fs_devices->read_policy); ssize_t ret = 0; int i; for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { - if (fs_devices->read_policy == i) + if (policy == i) ret += sysfs_emit_at(buf, ret, "%s[%s]", (ret == 0 ? 
"" : " "), btrfs_read_policy_name[i]); @@ -1256,8 +1334,8 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { if (sysfs_streq(buf, btrfs_read_policy_name[i])) { - if (i != fs_devices->read_policy) { - fs_devices->read_policy = i; + if (i != READ_ONCE(fs_devices->read_policy)) { + WRITE_ONCE(fs_devices->read_policy, i); btrfs_info(fs_devices->fs_info, "read policy set to '%s'", btrfs_read_policy_name[i]); @@ -1306,6 +1384,47 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj, BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show, btrfs_bg_reclaim_threshold_store); +#ifdef CONFIG_BTRFS_DEBUG +static ssize_t btrfs_offload_csum_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); + + switch (READ_ONCE(fs_devices->offload_csum_mode)) { + case BTRFS_OFFLOAD_CSUM_AUTO: + return sysfs_emit(buf, "auto\n"); + case BTRFS_OFFLOAD_CSUM_FORCE_ON: + return sysfs_emit(buf, "1\n"); + case BTRFS_OFFLOAD_CSUM_FORCE_OFF: + return sysfs_emit(buf, "0\n"); + default: + WARN_ON(1); + return -EINVAL; + } +} + +static ssize_t btrfs_offload_csum_store(struct kobject *kobj, + struct kobj_attribute *a, const char *buf, + size_t len) +{ + struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); + int ret; + bool val; + + ret = kstrtobool(buf, &val); + if (ret == 0) + WRITE_ONCE(fs_devices->offload_csum_mode, + val ? BTRFS_OFFLOAD_CSUM_FORCE_ON : BTRFS_OFFLOAD_CSUM_FORCE_OFF); + else if (ret == -EINVAL && sysfs_streq(buf, "auto")) + WRITE_ONCE(fs_devices->offload_csum_mode, BTRFS_OFFLOAD_CSUM_AUTO); + else + return -EINVAL; + + return len; +} +BTRFS_ATTR_RW(, offload_csum, btrfs_offload_csum_show, btrfs_offload_csum_store); +#endif + /* * Per-filesystem information and stats. 
* @@ -1325,6 +1444,9 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, bg_reclaim_threshold), BTRFS_ATTR_PTR(, commit_stats), BTRFS_ATTR_PTR(, temp_fsid), +#ifdef CONFIG_BTRFS_DEBUG + BTRFS_ATTR_PTR(, offload_csum), +#endif NULL, }; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 86c7eef128731e..e6a284c59809c9 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -3,8 +3,17 @@ #ifndef BTRFS_SYSFS_H #define BTRFS_SYSFS_H +#include +#include #include +struct btrfs_fs_info; +struct btrfs_device; +struct btrfs_fs_devices; +struct btrfs_block_group; +struct btrfs_space_info; +struct btrfs_qgroup; + enum btrfs_feature_set { FEAT_COMPAT, FEAT_COMPAT_RO, diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 25b3349595e005..865d4af4b30356 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -11,6 +11,7 @@ #include "btrfs-tests.h" #include "../ctree.h" #include "../extent_io.h" +#include "../disk-io.h" #include "../btrfs_inode.h" #define PROCESS_UNLOCK (1 << 0) @@ -105,9 +106,11 @@ static void dump_extent_io_tree(const struct extent_io_tree *tree) } } -static int test_find_delalloc(u32 sectorsize) +static int test_find_delalloc(u32 sectorsize, u32 nodesize) { - struct inode *inode; + struct btrfs_fs_info *fs_info; + struct btrfs_root *root = NULL; + struct inode *inode = NULL; struct extent_io_tree *tmp; struct page *page; struct page *locked_page = NULL; @@ -121,12 +124,27 @@ static int test_find_delalloc(u32 sectorsize) test_msg("running find delalloc tests"); + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + return -ENOMEM; + } + + root = btrfs_alloc_dummy_root(fs_info); + if (IS_ERR(root)) { + test_std_err(TEST_ALLOC_ROOT); + ret = PTR_ERR(root); + goto out; + } + inode = btrfs_new_test_inode(); if (!inode) { test_std_err(TEST_ALLOC_INODE); - return -ENOMEM; + ret = -ENOMEM; + goto out; } tmp = &BTRFS_I(inode)->io_tree; + BTRFS_I(inode)->root = root; /* * Passing NULL as we don't have fs_info but tracepoints are not used @@ -316,6 +334,8 @@ static int test_find_delalloc(u32 sectorsize) process_page_range(inode, 0, total_dirty - 1, PROCESS_UNLOCK | PROCESS_RELEASE); iput(inode); + btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); return ret; } @@ -794,7 +814,7 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) test_msg("running extent I/O tests"); - ret = test_find_delalloc(sectorsize); + ret = test_find_delalloc(sectorsize, nodesize); if (ret) goto out; diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 9957de9f7806d1..99da9d34b77aed 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -258,7 +258,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) /* First with no extents */ BTRFS_I(inode)->root = root; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize); if (IS_ERR(em)) { em = NULL; test_err("got an error when we shouldn't have"); @@ -278,7 +278,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) */ setup_file_extents(root, sectorsize); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, (u64)-1); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, (u64)-1); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -316,7 +316,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = 
em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -339,7 +339,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Regular extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -367,7 +367,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* The next 3 are split extents */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -396,7 +396,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -418,7 +418,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -452,7 +452,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Prealloc extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -481,7 +481,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* The next 3 are a half written prealloc extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -511,7 +511,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -544,7 +544,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -579,7 +579,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Now for the compressed extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -613,7 +613,7 @@ static noinline int test_btrfs_get_extent(u32 
sectorsize, u32 nodesize) free_extent_map(em); /* Split compressed extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -648,7 +648,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -675,7 +675,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -710,7 +710,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* A hole between regular extents but no hole extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset + 6, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset + 6, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -737,7 +737,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, SZ_4M); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -770,7 +770,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -850,7 +850,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) insert_inode_item_key(root); insert_extent(root, sectorsize, sectorsize, sectorsize, 0, sectorsize, sectorsize, BTRFS_FILE_EXTENT_REG, 0, 1); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, 2 * sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 2 * sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -872,7 +872,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) } free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize, 2 * sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, sectorsize, 2 * sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index bf8e64c766b63b..46e8426adf4f15 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -23,12 +23,10 @@ #include "qgroup.h" #include "block-group.h" #include "space-info.h" -#include "zoned.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" -#include "defrag.h" #include "dir-item.h" #include "uuid-tree.h" #include "ioctl.h" @@ -1959,19 +1957,6 @@ static void update_super_roots(struct btrfs_fs_info *fs_info) super->uuid_tree_generation = root_item->generation; } -int btrfs_transaction_in_commit(struct btrfs_fs_info *info) -{ - struct btrfs_transaction *trans; - int ret 
= 0; - - spin_lock(&info->trans_lock); - trans = info->running_transaction; - if (trans) - ret = (trans->state >= TRANS_STATE_COMMIT_START); - spin_unlock(&info->trans_lock); - return ret; -} - int btrfs_transaction_blocked(struct btrfs_fs_info *info) { struct btrfs_transaction *trans; @@ -2686,9 +2671,7 @@ void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, int __init btrfs_transaction_init(void) { - btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", - sizeof(struct btrfs_trans_handle), 0, - SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); + btrfs_trans_handle_cachep = KMEM_CACHE(btrfs_trans_handle, SLAB_TEMPORARY); if (!btrfs_trans_handle_cachep) return -ENOMEM; return 0; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 2bf8bbdfd0b38b..4e451ab173b107 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -6,12 +6,27 @@ #ifndef BTRFS_TRANSACTION_H #define BTRFS_TRANSACTION_H +#include #include +#include +#include +#include +#include #include "btrfs_inode.h" #include "delayed-ref.h" -#include "ctree.h" +#include "extent-io-tree.h" +#include "block-rsv.h" +#include "messages.h" #include "misc.h" +struct dentry; +struct inode; +struct btrfs_pending_snapshot; +struct btrfs_fs_info; +struct btrfs_root_item; +struct btrfs_root; +struct btrfs_path; + /* Radix-tree tag for roots that are part of the trasaction. */ #define BTRFS_ROOT_TRANS_TAG 0 @@ -262,7 +277,6 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages, int mark); int btrfs_wait_tree_log_extents(struct btrfs_root *root, int mark); int btrfs_transaction_blocked(struct btrfs_fs_info *info); -int btrfs_transaction_in_commit(struct btrfs_fs_info *info); void btrfs_put_transaction(struct btrfs_transaction *transaction); void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 6eccf8496486c0..c8fbcae4e88ea5 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -21,7 +21,6 @@ #include "messages.h" #include "ctree.h" #include "tree-checker.h" -#include "disk-io.h" #include "compression.h" #include "volumes.h" #include "misc.h" @@ -30,7 +29,6 @@ #include "file-item.h" #include "inode-item.h" #include "dir-item.h" -#include "raid-stripe-tree.h" #include "extent-tree.h" /* @@ -67,6 +65,7 @@ static void generic_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d, %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", @@ -94,6 +93,7 @@ static void file_extent_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d ino=%llu file_offset=%llu, %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", @@ -154,6 +154,7 @@ static void dir_item_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV", btrfs_header_level(eb) == 0 ? 
"leaf" : "node", @@ -649,6 +650,7 @@ static void block_group_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d bg_start=%llu bg_len=%llu, %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", @@ -1005,6 +1007,7 @@ static void dev_item_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(eb->fs_info, "corrupt %s: root=%llu block=%llu slot=%d devid=%llu %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", @@ -1260,6 +1263,7 @@ static void extent_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(eb->fs_info, "corrupt %s: block=%llu slot=%d extent bytenr=%llu len=%llu %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index 14b9fbe82da474..5c809b50b2d09b 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -6,10 +6,12 @@ #ifndef BTRFS_TREE_CHECKER_H #define BTRFS_TREE_CHECKER_H +#include #include struct extent_buffer; struct btrfs_chunk; +struct btrfs_key; /* All the extra info needed to verify the parentness of a tree block. */ struct btrfs_tree_parent_check { diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 331fc7429952fd..472918a5bc73ae 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -13,13 +13,11 @@ #include "tree-log.h" #include "disk-io.h" #include "locking.h" -#include "print-tree.h" #include "backref.h" #include "compression.h" #include "qgroup.h" #include "block-group.h" #include "space-info.h" -#include "zoned.h" #include "inode-item.h" #include "fs.h" #include "accessors.h" @@ -2820,6 +2818,52 @@ static void wait_for_writer(struct btrfs_root *root) finish_wait(&root->log_writer_wait, &wait); } +void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct inode *inode) +{ + ctx->log_ret = 0; + ctx->log_transid = 0; + ctx->log_new_dentries = false; + ctx->logging_new_name = false; + ctx->logging_new_delayed_dentries = false; + ctx->logged_before = false; + ctx->inode = inode; + INIT_LIST_HEAD(&ctx->list); + INIT_LIST_HEAD(&ctx->ordered_extents); + INIT_LIST_HEAD(&ctx->conflict_inodes); + ctx->num_conflict_inodes = 0; + ctx->logging_conflict_inodes = false; + ctx->scratch_eb = NULL; +} + +void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx) +{ + struct btrfs_inode *inode = BTRFS_I(ctx->inode); + + if (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) && + !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags)) + return; + + /* + * Don't care about allocation failure. This is just for optimization, + * if we fail to allocate here, we will try again later if needed. 
+ */ + ctx->scratch_eb = alloc_dummy_extent_buffer(inode->root->fs_info, 0); +} + +void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx) +{ + struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_extent *tmp; + + ASSERT(inode_is_locked(ctx->inode)); + + list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { + list_del_init(&ordered->log_list); + btrfs_put_ordered_extent(ordered); + } +} + + static inline void btrfs_remove_log_ctx(struct btrfs_root *root, struct btrfs_log_ctx *ctx) { @@ -3619,6 +3663,30 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, return ret; } +static int clone_leaf(struct btrfs_path *path, struct btrfs_log_ctx *ctx) +{ + const int slot = path->slots[0]; + + if (ctx->scratch_eb) { + copy_extent_buffer_full(ctx->scratch_eb, path->nodes[0]); + } else { + ctx->scratch_eb = btrfs_clone_extent_buffer(path->nodes[0]); + if (!ctx->scratch_eb) + return -ENOMEM; + } + + btrfs_release_path(path); + path->nodes[0] = ctx->scratch_eb; + path->slots[0] = slot; + /* + * Add extra ref to scratch eb so that it is not freed when callers + * release the path, so we can reuse it later if needed. + */ + atomic_inc(&ctx->scratch_eb->refs); + + return 0; +} + static int process_dir_items_leaf(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, @@ -3633,23 +3701,20 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, bool last_found = false; int batch_start = 0; int batch_size = 0; - int i; + int ret; /* * We need to clone the leaf, release the read lock on it, and use the * clone before modifying the log tree. See the comment at copy_items() * about why we need to do this. */ - src = btrfs_clone_extent_buffer(path->nodes[0]); - if (!src) - return -ENOMEM; + ret = clone_leaf(path, ctx); + if (ret < 0) + return ret; - i = path->slots[0]; - btrfs_release_path(path); - path->nodes[0] = src; - path->slots[0] = i; + src = path->nodes[0]; - for (; i < nritems; i++) { + for (int i = path->slots[0]; i < nritems; i++) { struct btrfs_dir_item *di; struct btrfs_key key; int ret; @@ -4259,17 +4324,16 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_path *dst_path, struct btrfs_path *src_path, int start_slot, int nr, int inode_only, - u64 logged_isize) + u64 logged_isize, struct btrfs_log_ctx *ctx) { struct btrfs_root *log = inode->root->log_root; struct btrfs_file_extent_item *extent; struct extent_buffer *src; - int ret = 0; + int ret; struct btrfs_key *ins_keys; u32 *ins_sizes; struct btrfs_item_batch batch; char *ins_data; - int i; int dst_index; const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM); const u64 i_size = i_size_read(&inode->vfs_inode); @@ -4302,14 +4366,11 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, * while the other is holding the delayed node's mutex and wants to * write lock the same subvolume leaf for flushing delayed items. 
*/ - src = btrfs_clone_extent_buffer(src_path->nodes[0]); - if (!src) - return -ENOMEM; + ret = clone_leaf(src_path, ctx); + if (ret < 0) + return ret; - i = src_path->slots[0]; - btrfs_release_path(src_path); - src_path->nodes[0] = src; - src_path->slots[0] = i; + src = src_path->nodes[0]; ins_data = kmalloc(nr * sizeof(struct btrfs_key) + nr * sizeof(u32), GFP_NOFS); @@ -4324,7 +4385,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, batch.nr = 0; dst_index = 0; - for (i = 0; i < nr; i++) { + for (int i = 0; i < nr; i++) { const int src_slot = start_slot + i; struct btrfs_root *csum_root; struct btrfs_ordered_sum *sums; @@ -4431,7 +4492,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, goto out; dst_index = 0; - for (i = 0; i < nr; i++) { + for (int i = 0; i < nr; i++) { const int src_slot = start_slot + i; const int dst_slot = dst_path->slots[0] + dst_index; struct btrfs_key key; @@ -4704,7 +4765,8 @@ static int log_one_extent(struct btrfs_trans_handle *trans, */ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, - struct btrfs_path *path) + struct btrfs_path *path, + struct btrfs_log_ctx *ctx) { struct btrfs_root *root = inode->root; struct btrfs_key key; @@ -4770,7 +4832,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, if (slot >= btrfs_header_nritems(leaf)) { if (ins_nr > 0) { ret = copy_items(trans, inode, dst_path, path, - start_slot, ins_nr, 1, 0); + start_slot, ins_nr, 1, 0, ctx); if (ret < 0) goto out; ins_nr = 0; @@ -4820,7 +4882,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, } if (ins_nr > 0) ret = copy_items(trans, inode, dst_path, path, - start_slot, ins_nr, 1, 0); + start_slot, ins_nr, 1, 0, ctx); out: btrfs_release_path(path); btrfs_free_path(dst_path); @@ -4899,7 +4961,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, write_unlock(&tree->lock); if (!ret) - ret = btrfs_log_prealloc_extents(trans, inode, path); + ret = btrfs_log_prealloc_extents(trans, inode, path, ctx); if (ret) return ret; @@ -4980,7 +5042,8 @@ static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, - struct btrfs_path *dst_path) + struct btrfs_path *dst_path, + struct btrfs_log_ctx *ctx) { struct btrfs_root *root = inode->root; int ret; @@ -5009,7 +5072,7 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, if (slot >= nritems) { if (ins_nr > 0) { ret = copy_items(trans, inode, dst_path, path, - start_slot, ins_nr, 1, 0); + start_slot, ins_nr, 1, 0, ctx); if (ret < 0) return ret; ins_nr = 0; @@ -5035,7 +5098,7 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, } if (ins_nr > 0) { ret = copy_items(trans, inode, dst_path, path, - start_slot, ins_nr, 1, 0); + start_slot, ins_nr, 1, 0, ctx); if (ret < 0) return ret; } @@ -5847,7 +5910,7 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, } ret = copy_items(trans, inode, dst_path, path, ins_start_slot, ins_nr, - inode_only, logged_isize); + inode_only, logged_isize, ctx); if (ret < 0) return ret; ins_nr = 0; @@ -5866,7 +5929,7 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, goto next_slot; ret = copy_items(trans, inode, dst_path, path, ins_start_slot, - ins_nr, inode_only, logged_isize); + ins_nr, inode_only, logged_isize, ctx); if (ret < 0) return ret; ins_nr = 0; @@ -5883,7 
+5946,7 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, } ret = copy_items(trans, inode, dst_path, path, ins_start_slot, - ins_nr, inode_only, logged_isize); + ins_nr, inode_only, logged_isize, ctx); if (ret < 0) return ret; ins_nr = 1; @@ -5898,7 +5961,7 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, if (ins_nr) { ret = copy_items(trans, inode, dst_path, path, ins_start_slot, ins_nr, inode_only, - logged_isize); + logged_isize, ctx); if (ret < 0) return ret; ins_nr = 0; @@ -5923,7 +5986,7 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, } if (ins_nr) { ret = copy_items(trans, inode, dst_path, path, ins_start_slot, - ins_nr, inode_only, logged_isize); + ins_nr, inode_only, logged_isize, ctx); if (ret) return ret; } @@ -5934,7 +5997,7 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, * lock the same leaf with btrfs_log_prealloc_extents() below. */ btrfs_release_path(path); - ret = btrfs_log_prealloc_extents(trans, inode, dst_path); + ret = btrfs_log_prealloc_extents(trans, inode, dst_path, ctx); } return ret; @@ -6526,7 +6589,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, btrfs_release_path(path); btrfs_release_path(dst_path); - ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); + ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx); if (ret) goto out_unlock; xattrs_logged = true; @@ -6553,7 +6616,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, * BTRFS_INODE_COPY_EVERYTHING set. */ if (!xattrs_logged && inode->logged_trans < trans->transid) { - ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); + ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx); if (ret) goto out_unlock; btrfs_release_path(path); @@ -7502,6 +7565,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, btrfs_init_log_ctx(&ctx, &inode->vfs_inode); ctx.logging_new_name = true; + btrfs_init_log_ctx_scratch_eb(&ctx); /* * We don't care about the return value. If we fail to log the new name * then we know the next attempt to sync the log will fallback to a full @@ -7510,6 +7574,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * inconsistent state after a rename operation. */ btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx); + free_extent_buffer(ctx.scratch_eb); ASSERT(list_empty(&ctx.conflict_inodes)); out: /* diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index a550a8a375cd15..22e9cbc8157742 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -6,10 +6,18 @@ #ifndef BTRFS_TREE_LOG_H #define BTRFS_TREE_LOG_H +#include +#include #include "messages.h" #include "ctree.h" #include "transaction.h" +struct inode; +struct dentry; +struct btrfs_ordered_extent; +struct btrfs_root; +struct btrfs_trans_handle; + /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ #define BTRFS_NO_LOG_SYNC 256 @@ -36,37 +44,20 @@ struct btrfs_log_ctx { struct list_head conflict_inodes; int num_conflict_inodes; bool logging_conflict_inodes; + /* + * Used for fsyncs that need to copy items from the subvolume tree to + * the log tree (full sync flag set or copy everything flag set) to + * avoid allocating a temporary extent buffer while holding a lock on + * an extent buffer of the subvolume tree and under the log transaction. + * Also helps to avoid allocating and freeing a temporary extent buffer + * in case we need to process multiple leaves from the subvolume tree. 
+ */ + struct extent_buffer *scratch_eb; }; -static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, - struct inode *inode) -{ - ctx->log_ret = 0; - ctx->log_transid = 0; - ctx->log_new_dentries = false; - ctx->logging_new_name = false; - ctx->logging_new_delayed_dentries = false; - ctx->logged_before = false; - ctx->inode = inode; - INIT_LIST_HEAD(&ctx->list); - INIT_LIST_HEAD(&ctx->ordered_extents); - INIT_LIST_HEAD(&ctx->conflict_inodes); - ctx->num_conflict_inodes = 0; - ctx->logging_conflict_inodes = false; -} - -static inline void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx) -{ - struct btrfs_ordered_extent *ordered; - struct btrfs_ordered_extent *tmp; - - ASSERT(inode_is_locked(ctx->inode)); - - list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { - list_del_init(&ordered->log_list); - btrfs_put_ordered_extent(ordered); - } -} +void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct inode *inode); +void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx); +void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx); static inline void btrfs_set_log_full_commit(struct btrfs_trans_handle *trans) { diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 3df6153d5d5a80..43b3accbed7ad1 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -44,7 +44,7 @@ struct tree_mod_elem { /* * Pull a new tree mod seq number for our operation. */ -static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) +static u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) { return atomic64_inc_return(&fs_info->tree_mod_seq); } @@ -170,8 +170,7 @@ static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info, * this until all tree mod log insertions are recorded in the rb tree and then * write unlock fs_info::tree_mod_log_lock. */ -static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb) +static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) { if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) return true; @@ -188,7 +187,7 @@ static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, } /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */ -static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info, +static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info, struct extent_buffer *eb) { if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) @@ -367,9 +366,9 @@ int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, return ret; } -static inline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, - struct tree_mod_elem **tm_list, - int nritems) +static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, + struct tree_mod_elem **tm_list, + int nritems) { int i, j; int ret; diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h index 94f10afeee9725..ff00c8e8a393cb 100644 --- a/fs/btrfs/tree-mod-log.h +++ b/fs/btrfs/tree-mod-log.h @@ -3,7 +3,13 @@ #ifndef BTRFS_TREE_MOD_LOG_H #define BTRFS_TREE_MOD_LOG_H -#include "ctree.h" +#include + +struct extent_buffer; +struct btrfs_fs_info; +struct btrfs_path; +struct btrfs_root; +struct btrfs_seq_list; /* Represents a tree mod log user. 
*/ struct btrfs_seq_list { diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index b4ac2b0cd2359a..183863f4bfa417 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -7,7 +7,6 @@ #include #include "messages.h" #include "ulist.h" -#include "ctree.h" /* * ulist is a generic data structure to hold a collection of unique u64 diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index b2cef187ea8efc..8e200fe1a2dd3c 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -7,6 +7,7 @@ #ifndef BTRFS_ULIST_H #define BTRFS_ULIST_H +#include #include #include diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 5be74f9e47ebf3..b0aff297d67d23 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -9,7 +9,6 @@ #include "ctree.h" #include "transaction.h" #include "disk-io.h" -#include "print-tree.h" #include "fs.h" #include "accessors.h" #include "uuid-tree.h" @@ -114,7 +113,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, ret = btrfs_insert_empty_item(trans, uuid_root, path, &key, sizeof(subid_le)); - if (ret >= 0) { + if (ret == 0) { /* Add an item for the type for the first time */ eb = path->nodes[0]; slot = path->slots[0]; diff --git a/fs/btrfs/uuid-tree.h b/fs/btrfs/uuid-tree.h index 5350c87fe2caf3..080ede0227aee0 100644 --- a/fs/btrfs/uuid-tree.h +++ b/fs/btrfs/uuid-tree.h @@ -3,6 +3,11 @@ #ifndef BTRFS_UUID_TREE_H #define BTRFS_UUID_TREE_H +#include + +struct btrfs_trans_handle; +struct btrfs_fs_info; + int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, u64 subid); int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index 66e2270b0dae9f..4042dd6437aefa 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -14,7 +14,6 @@ #include "ctree.h" #include "btrfs_inode.h" #include "transaction.h" -#include "disk-io.h" #include "locking.h" #include "fs.h" #include "accessors.h" diff --git a/fs/btrfs/verity.h b/fs/btrfs/verity.h index 91c10f7d0a4654..d696659e43e43d 100644 --- a/fs/btrfs/verity.h +++ b/fs/btrfs/verity.h @@ -3,8 +3,13 @@ #ifndef BTRFS_VERITY_H #define BTRFS_VERITY_H +struct inode; +struct btrfs_inode; + #ifdef CONFIG_FS_VERITY +#include + extern const struct fsverity_operations btrfs_verityops; int btrfs_drop_verity_items(struct btrfs_inode *inode); @@ -12,6 +17,8 @@ int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size) #else +#include + static inline int btrfs_drop_verity_items(struct btrfs_inode *inode) { return 0; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d67785be2c778c..e49935a54da0a3 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -14,10 +14,8 @@ #include #include "misc.h" #include "ctree.h" -#include "extent_map.h" #include "disk-io.h" #include "transaction.h" -#include "print-tree.h" #include "volumes.h" #include "raid56.h" #include "rcu-string.h" @@ -769,8 +767,9 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (same_fsid_diff_dev) { generate_random_uuid(fs_devices->fsid); fs_devices->temp_fsid = true; - pr_info("BTRFS: device %s using temp-fsid %pU\n", - path, fs_devices->fsid); + pr_info("BTRFS: device %s (%d:%d) using temp-fsid %pU\n", + path, MAJOR(path_devt), MINOR(path_devt), + fs_devices->fsid); } mutex_lock(&fs_devices->device_list_mutex); @@ -799,8 +798,9 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (fs_devices->opened) { btrfs_err(NULL, -"device %s belongs to fsid %pU, and the fs is already mounted, 
scanned by %s (%d)", - path, fs_devices->fsid, current->comm, +"device %s (%d:%d) belongs to fsid %pU, and the fs is already mounted, scanned by %s (%d)", + path, MAJOR(path_devt), MINOR(path_devt), + fs_devices->fsid, current->comm, task_pid_nr(current)); mutex_unlock(&fs_devices->device_list_mutex); return ERR_PTR(-EBUSY); @@ -826,13 +826,15 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (disk_super->label[0]) pr_info( - "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n", +"BTRFS: device label %s devid %llu transid %llu %s (%d:%d) scanned by %s (%d)\n", disk_super->label, devid, found_transid, path, + MAJOR(path_devt), MINOR(path_devt), current->comm, task_pid_nr(current)); else pr_info( - "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n", +"BTRFS: device fsid %pU devid %llu transid %llu %s (%d:%d) scanned by %s (%d)\n", disk_super->fsid, devid, found_transid, path, + MAJOR(path_devt), MINOR(path_devt), current->comm, task_pid_nr(current)); } else if (!device->name || strcmp(device->name->str, path)) { @@ -1368,7 +1370,8 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, else btrfs_free_stale_devices(devt, NULL); - pr_debug("BTRFS: skip registering single non-seed device %s\n", path); + pr_debug("BTRFS: skip registering single non-seed device %s (%d:%d)\n", + path, MAJOR(devt), MINOR(devt)); device = NULL; goto free_disk_super; } @@ -1403,7 +1406,7 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start, if (in_range(physical_start, *start, len) || in_range(*start, physical_start, - physical_end - physical_start)) { + physical_end + 1 - physical_start)) { *start = physical_end + 1; return true; } @@ -2032,11 +2035,10 @@ static void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info, copy_num, ret); } -void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, - struct block_device *bdev, - const char *device_path) +void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_device *device) { int copy_num; + struct block_device *bdev = device->bdev; if (!bdev) return; @@ -2052,7 +2054,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, btrfs_kobject_uevent(bdev, KOBJ_CHANGE); /* Update ctime/mtime for device path for libblkid */ - update_dev_time(device_path); + update_dev_time(device->name->str); } int btrfs_rm_device(struct btrfs_fs_info *fs_info, @@ -2187,8 +2189,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, * device and let the caller do the final bdev_release. */ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { - btrfs_scratch_superblocks(fs_info, device->bdev, - device->name->str); + btrfs_scratch_superblocks(fs_info, device); if (device->bdev) { sync_blockdev(device->bdev); invalidate_bdev(device->bdev); @@ -2301,8 +2302,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) mutex_unlock(&fs_devices->device_list_mutex); - btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev, - tgtdev->name->str); + btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev); btrfs_close_bdev(tgtdev); synchronize_rcu(); @@ -3393,7 +3393,17 @@ static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) mutex_unlock(&fs_info->reclaim_bgs_lock); goto error; } - BUG_ON(ret == 0); /* Corruption */ + if (ret == 0) { + /* + * On the first search we would find chunk tree with + * offset -1, which is not possible. 
On subsequent + * loops this would find an existing item on an invalid + * offset (one less than the previous one, wrong + * alignment and size). + */ + ret = -EUCLEAN; + goto error; + } ret = btrfs_previous_item(chunk_root, path, key.objectid, key.type); @@ -3480,6 +3490,44 @@ static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, return 0; } +static void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, + const struct btrfs_disk_balance_args *disk) +{ + memset(cpu, 0, sizeof(*cpu)); + + cpu->profiles = le64_to_cpu(disk->profiles); + cpu->usage = le64_to_cpu(disk->usage); + cpu->devid = le64_to_cpu(disk->devid); + cpu->pstart = le64_to_cpu(disk->pstart); + cpu->pend = le64_to_cpu(disk->pend); + cpu->vstart = le64_to_cpu(disk->vstart); + cpu->vend = le64_to_cpu(disk->vend); + cpu->target = le64_to_cpu(disk->target); + cpu->flags = le64_to_cpu(disk->flags); + cpu->limit = le64_to_cpu(disk->limit); + cpu->stripes_min = le32_to_cpu(disk->stripes_min); + cpu->stripes_max = le32_to_cpu(disk->stripes_max); +} + +static void btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk, + const struct btrfs_balance_args *cpu) +{ + memset(disk, 0, sizeof(*disk)); + + disk->profiles = cpu_to_le64(cpu->profiles); + disk->usage = cpu_to_le64(cpu->usage); + disk->devid = cpu_to_le64(cpu->devid); + disk->pstart = cpu_to_le64(cpu->pstart); + disk->pend = cpu_to_le64(cpu->pend); + disk->vstart = cpu_to_le64(cpu->vstart); + disk->vend = cpu_to_le64(cpu->vend); + disk->target = cpu_to_le64(cpu->target); + disk->flags = cpu_to_le64(cpu->flags); + disk->limit = cpu_to_le64(cpu->limit); + disk->stripes_min = cpu_to_le32(cpu->stripes_min); + disk->stripes_max = cpu_to_le32(cpu->stripes_max); +} + static int insert_balance_item(struct btrfs_fs_info *fs_info, struct btrfs_balance_control *bctl) { @@ -3624,7 +3672,7 @@ static void reset_balance_state(struct btrfs_fs_info *fs_info) struct btrfs_balance_control *bctl = fs_info->balance_ctl; int ret; - BUG_ON(!fs_info->balance_ctl); + ASSERT(fs_info->balance_ctl); spin_lock(&fs_info->balance_lock); fs_info->balance_ctl = NULL; @@ -5944,6 +5992,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map, int first, int dev_replace_is_ongoing) { + const enum btrfs_read_policy policy = READ_ONCE(fs_info->fs_devices->read_policy); int i; int num_stripes; int preferred_mirror; @@ -5958,13 +6007,12 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, else num_stripes = map->num_stripes; - switch (fs_info->fs_devices->read_policy) { + switch (policy) { default: /* Shouldn't happen, just warn and use pid instead of failing */ - btrfs_warn_rl(fs_info, - "unknown read_policy type %u, reset to pid", - fs_info->fs_devices->read_policy); - fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID; + btrfs_warn_rl(fs_info, "unknown read_policy type %u, reset to pid", + policy); + WRITE_ONCE(fs_info->fs_devices->read_policy, BTRFS_READ_POLICY_PID); fallthrough; case BTRFS_READ_POLICY_PID: preferred_mirror = first + (current->pid % num_stripes); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 53f87f398da779..feba8d53526c41 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -6,13 +6,28 @@ #ifndef BTRFS_VOLUMES_H #define BTRFS_VOLUMES_H +#include +#include +#include #include -#include -#include "async-thread.h" +#include +#include +#include +#include +#include +#include +#include +#include #include "messages.h" -#include "tree-checker.h" #include "rcu-string.h" +struct block_device; +struct 
bdev_handle; +struct btrfs_fs_info; +struct btrfs_block_group; +struct btrfs_trans_handle; +struct btrfs_zoned_device_info; + #define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) extern struct mutex uuid_mutex; @@ -77,7 +92,7 @@ enum btrfs_raid_types { #define BTRFS_DEV_STATE_FLUSH_SENT (4) #define BTRFS_DEV_STATE_NO_READA (5) -struct btrfs_zoned_device_info; +struct btrfs_fs_devices; struct btrfs_device { struct list_head dev_list; /* device_list_mutex */ @@ -276,6 +291,25 @@ enum btrfs_read_policy { BTRFS_NR_READ_POLICY, }; +#ifdef CONFIG_BTRFS_DEBUG +/* + * Checksum mode - offload it to workqueues or do it synchronously in + * btrfs_submit_chunk(). + */ +enum btrfs_offload_csum_mode { + /* + * Choose offloading checksum or do it synchronously automatically. + * Do it synchronously if the checksum is fast, or offload to workqueues + * otherwise. + */ + BTRFS_OFFLOAD_CSUM_AUTO, + /* Always offload checksum to workqueues. */ + BTRFS_OFFLOAD_CSUM_FORCE_ON, + /* Never offload checksum to workqueues. */ + BTRFS_OFFLOAD_CSUM_FORCE_OFF, +}; +#endif + struct btrfs_fs_devices { u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ @@ -380,6 +414,11 @@ struct btrfs_fs_devices { /* Policy used to read the mirrored stripes. */ enum btrfs_read_policy read_policy; + +#ifdef CONFIG_BTRFS_DEBUG + /* Checksum mode - offload it or do it synchronously. */ + enum btrfs_offload_csum_mode offload_csum_mode; +#endif }; #define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \ @@ -557,8 +596,6 @@ static inline void btrfs_free_chunk_map(struct btrfs_chunk_map *map) } } -struct btrfs_balance_args; -struct btrfs_balance_progress; struct btrfs_balance_control { struct btrfs_balance_args data; struct btrfs_balance_args meta; @@ -780,9 +817,7 @@ void btrfs_commit_device_sizes(struct btrfs_transaction *trans); struct list_head * __attribute_const__ btrfs_get_fs_uuids(void); bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, struct btrfs_device *failing_dev); -void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, - struct block_device *bdev, - const char *device_path); +void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_device *device); enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags); int btrfs_bg_type_to_factor(u64 flags); diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 118118ca3e1de7..b9376ea258ff32 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -6,7 +6,11 @@ #ifndef BTRFS_XATTR_H #define BTRFS_XATTR_H -#include +struct dentry; +struct inode; +struct qstr; +struct xattr_handler; +struct btrfs_trans_handle; extern const struct xattr_handler * const btrfs_xattr_handlers[]; diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 8da66ea699e8fe..1a4de77ba46d77 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -91,24 +91,24 @@ struct list_head *zlib_alloc_workspace(unsigned int level) return ERR_PTR(-ENOMEM); } -int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, +int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, + u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret; char *data_in = NULL; - char *cpage_out; - int nr_pages = 0; - struct page *in_page = NULL; - struct page *out_page = NULL; + char *cfolio_out; + int nr_folios = 0; + struct folio *in_folio = NULL; + struct folio *out_folio = 
NULL; unsigned long bytes_left; - unsigned int in_buf_pages; + unsigned int in_buf_folios; unsigned long len = *total_out; - unsigned long nr_dest_pages = *out_pages; - const unsigned long max_out = nr_dest_pages * PAGE_SIZE; + unsigned long nr_dest_folios = *out_folios; + const unsigned long max_out = nr_dest_folios * PAGE_SIZE; - *out_pages = 0; + *out_folios = 0; *total_out = 0; *total_in = 0; @@ -121,18 +121,18 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, workspace->strm.total_in = 0; workspace->strm.total_out = 0; - out_page = btrfs_alloc_compr_page(); - if (out_page == NULL) { + out_folio = btrfs_alloc_compr_folio(); + if (out_folio == NULL) { ret = -ENOMEM; goto out; } - cpage_out = page_address(out_page); - pages[0] = out_page; - nr_pages = 1; + cfolio_out = folio_address(out_folio); + folios[0] = out_folio; + nr_folios = 1; workspace->strm.next_in = workspace->buf; workspace->strm.avail_in = 0; - workspace->strm.next_out = cpage_out; + workspace->strm.next_out = cfolio_out; workspace->strm.avail_out = PAGE_SIZE; while (workspace->strm.total_in < len) { @@ -142,19 +142,22 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, */ if (workspace->strm.avail_in == 0) { bytes_left = len - workspace->strm.total_in; - in_buf_pages = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE), - workspace->buf_size / PAGE_SIZE); - if (in_buf_pages > 1) { + in_buf_folios = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE), + workspace->buf_size / PAGE_SIZE); + if (in_buf_folios > 1) { int i; - for (i = 0; i < in_buf_pages; i++) { + for (i = 0; i < in_buf_folios; i++) { if (data_in) { kunmap_local(data_in); - put_page(in_page); + folio_put(in_folio); + data_in = NULL; } - in_page = find_get_page(mapping, - start >> PAGE_SHIFT); - data_in = kmap_local_page(in_page); + ret = btrfs_compress_filemap_get_folio(mapping, + start, &in_folio); + if (ret < 0) + goto out; + data_in = kmap_local_folio(in_folio, 0); copy_page(workspace->buf + i * PAGE_SIZE, data_in); start += PAGE_SIZE; @@ -163,11 +166,14 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, } else { if (data_in) { kunmap_local(data_in); - put_page(in_page); + folio_put(in_folio); + data_in = NULL; } - in_page = find_get_page(mapping, - start >> PAGE_SHIFT); - data_in = kmap_local_page(in_page); + ret = btrfs_compress_filemap_get_folio(mapping, + start, &in_folio); + if (ret < 0) + goto out; + data_in = kmap_local_folio(in_folio, 0); start += PAGE_SIZE; workspace->strm.next_in = data_in; } @@ -196,20 +202,20 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, * the stream end if required */ if (workspace->strm.avail_out == 0) { - if (nr_pages == nr_dest_pages) { + if (nr_folios == nr_dest_folios) { ret = -E2BIG; goto out; } - out_page = btrfs_alloc_compr_page(); - if (out_page == NULL) { + out_folio = btrfs_alloc_compr_folio(); + if (out_folio == NULL) { ret = -ENOMEM; goto out; } - cpage_out = page_address(out_page); - pages[nr_pages] = out_page; - nr_pages++; + cfolio_out = folio_address(out_folio); + folios[nr_folios] = out_folio; + nr_folios++; workspace->strm.avail_out = PAGE_SIZE; - workspace->strm.next_out = cpage_out; + workspace->strm.next_out = cfolio_out; } /* we're all done */ if (workspace->strm.total_in >= len) @@ -231,21 +237,21 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -EIO; goto out; } else if (workspace->strm.avail_out == 0) { - /* get another page for the stream end */ - if (nr_pages == 
nr_dest_pages) { + /* get another folio for the stream end */ + if (nr_folios == nr_dest_folios) { ret = -E2BIG; goto out; } - out_page = btrfs_alloc_compr_page(); - if (out_page == NULL) { + out_folio = btrfs_alloc_compr_folio(); + if (out_folio == NULL) { ret = -ENOMEM; goto out; } - cpage_out = page_address(out_page); - pages[nr_pages] = out_page; - nr_pages++; + cfolio_out = folio_address(out_folio); + folios[nr_folios] = out_folio; + nr_folios++; workspace->strm.avail_out = PAGE_SIZE; - workspace->strm.next_out = cpage_out; + workspace->strm.next_out = cfolio_out; } } zlib_deflateEnd(&workspace->strm); @@ -259,10 +265,10 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, *total_out = workspace->strm.total_out; *total_in = workspace->strm.total_in; out: - *out_pages = nr_pages; + *out_folios = nr_folios; if (data_in) { kunmap_local(data_in); - put_page(in_page); + folio_put(in_folio); } return ret; @@ -275,13 +281,13 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) int wbits = MAX_WBITS; char *data_in; size_t total_out = 0; - unsigned long page_in_index = 0; + unsigned long folio_in_index = 0; size_t srclen = cb->compressed_len; - unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); + unsigned long total_folios_in = DIV_ROUND_UP(srclen, PAGE_SIZE); unsigned long buf_start; - struct page **pages_in = cb->compressed_pages; + struct folio **folios_in = cb->compressed_folios; - data_in = kmap_local_page(pages_in[page_in_index]); + data_in = kmap_local_folio(folios_in[folio_in_index], 0); workspace->strm.next_in = data_in; workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE); workspace->strm.total_in = 0; @@ -331,12 +337,12 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (workspace->strm.avail_in == 0) { unsigned long tmp; kunmap_local(data_in); - page_in_index++; - if (page_in_index >= total_pages_in) { + folio_in_index++; + if (folio_in_index >= total_folios_in) { data_in = NULL; break; } - data_in = kmap_local_page(pages_in[page_in_index]); + data_in = kmap_local_folio(folios_in[folio_in_index], 0); workspace->strm.next_in = data_in; tmp = srclen - workspace->strm.total_in; workspace->strm.avail_in = min(tmp, PAGE_SIZE); @@ -398,7 +404,7 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in, out: if (unlikely(to_copy != destlen)) { - pr_warn_ratelimited("BTRFS: infalte failed, decompressed=%lu expected=%zu\n", + pr_warn_ratelimited("BTRFS: inflate failed, decompressed=%lu expected=%zu\n", to_copy, destlen); ret = -EIO; } else { diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 5f750fa53a2b2a..23a4ceccd70ed5 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -12,10 +12,8 @@ #include "rcu-string.h" #include "disk-io.h" #include "block-group.h" -#include "transaction.h" #include "dev-replace.h" #include "space-info.h" -#include "super.h" #include "fs.h" #include "accessors.h" #include "bio.h" @@ -1648,6 +1646,15 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) return -EINVAL; } + /* Reject non SINGLE data profiles without RST. 
*/ + if ((map->type & BTRFS_BLOCK_GROUP_DATA) && + (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) && + !fs_info->stripe_root) { + btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", + btrfs_bg_type_to_raid_name(map->type)); + return -EINVAL; + } + if (cache->alloc_offset > cache->zone_capacity) { btrfs_err(fs_info, "zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu", diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index f573bda496fbd1..77c4321e331f37 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -4,12 +4,27 @@ #define BTRFS_ZONED_H #include +#include #include +#include +#include +#include +#include #include "messages.h" #include "volumes.h" #include "disk-io.h" #include "block-group.h" #include "btrfs_inode.h" +#include "fs.h" + +struct block_device; +struct extent_buffer; +struct btrfs_bio; +struct btrfs_ordered_extent; +struct btrfs_fs_info; +struct btrfs_space_info; +struct btrfs_eb_write_context; +struct btrfs_fs_devices; #define BTRFS_DEFAULT_RECLAIM_THRESH (75) diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 0d66db8bc1d477..c018f0fab349ad 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -18,8 +18,9 @@ #include #include #include "misc.h" +#include "fs.h" #include "compression.h" -#include "ctree.h" +#include "super.h" #define ZSTD_BTRFS_MAX_WINDOWLOG 17 #define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG) @@ -373,25 +374,25 @@ struct list_head *zstd_alloc_workspace(unsigned int level) return ERR_PTR(-ENOMEM); } -int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, +int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, + u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); zstd_cstream *stream; int ret = 0; - int nr_pages = 0; - struct page *in_page = NULL; /* The current page to read */ - struct page *out_page = NULL; /* The current page to write to */ + int nr_folios = 0; + struct folio *in_folio = NULL; /* The current page to read */ + struct folio *out_folio = NULL; /* The current page to write to */ unsigned long tot_in = 0; unsigned long tot_out = 0; unsigned long len = *total_out; - const unsigned long nr_dest_pages = *out_pages; - unsigned long max_out = nr_dest_pages * PAGE_SIZE; + const unsigned long nr_dest_folios = *out_folios; + unsigned long max_out = nr_dest_folios * PAGE_SIZE; zstd_parameters params = zstd_get_btrfs_parameters(workspace->req_level, len); - *out_pages = 0; + *out_folios = 0; *total_out = 0; *total_in = 0; @@ -405,19 +406,21 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, } /* map in the first page of input data */ - in_page = find_get_page(mapping, start >> PAGE_SHIFT); - workspace->in_buf.src = kmap_local_page(in_page); + ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); + if (ret < 0) + goto out; + workspace->in_buf.src = kmap_local_folio(in_folio, 0); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); /* Allocate and map in the output buffer */ - out_page = btrfs_alloc_compr_page(); - if (out_page == NULL) { + out_folio = btrfs_alloc_compr_folio(); + if (out_folio == NULL) { ret = -ENOMEM; goto out; } - pages[nr_pages++] = out_page; - workspace->out_buf.dst = page_address(out_page); + folios[nr_folios++] = out_folio; + workspace->out_buf.dst = 
folio_address(out_folio); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); @@ -452,17 +455,17 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, if (workspace->out_buf.pos == workspace->out_buf.size) { tot_out += PAGE_SIZE; max_out -= PAGE_SIZE; - if (nr_pages == nr_dest_pages) { + if (nr_folios == nr_dest_folios) { ret = -E2BIG; goto out; } - out_page = btrfs_alloc_compr_page(); - if (out_page == NULL) { + out_folio = btrfs_alloc_compr_folio(); + if (out_folio == NULL) { ret = -ENOMEM; goto out; } - pages[nr_pages++] = out_page; - workspace->out_buf.dst = page_address(out_page); + folios[nr_folios++] = out_folio; + workspace->out_buf.dst = folio_address(out_folio); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); @@ -478,11 +481,15 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, if (workspace->in_buf.pos == workspace->in_buf.size) { tot_in += PAGE_SIZE; kunmap_local(workspace->in_buf.src); - put_page(in_page); + workspace->in_buf.src = NULL; + folio_put(in_folio); start += PAGE_SIZE; len -= PAGE_SIZE; - in_page = find_get_page(mapping, start >> PAGE_SHIFT); - workspace->in_buf.src = kmap_local_page(in_page); + ret = btrfs_compress_filemap_get_folio(mapping, start, + &in_folio); + if (ret < 0) + goto out; + workspace->in_buf.src = kmap_local_folio(in_folio, 0); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); } @@ -509,17 +516,17 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, tot_out += PAGE_SIZE; max_out -= PAGE_SIZE; - if (nr_pages == nr_dest_pages) { + if (nr_folios == nr_dest_folios) { ret = -E2BIG; goto out; } - out_page = btrfs_alloc_compr_page(); - if (out_page == NULL) { + out_folio = btrfs_alloc_compr_folio(); + if (out_folio == NULL) { ret = -ENOMEM; goto out; } - pages[nr_pages++] = out_page; - workspace->out_buf.dst = page_address(out_page); + folios[nr_folios++] = out_folio; + workspace->out_buf.dst = folio_address(out_folio); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); } @@ -533,10 +540,10 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, *total_in = tot_in; *total_out = tot_out; out: - *out_pages = nr_pages; + *out_folios = nr_folios; if (workspace->in_buf.src) { kunmap_local(workspace->in_buf.src); - put_page(in_page); + folio_put(in_folio); } return ret; } @@ -544,12 +551,12 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { struct workspace *workspace = list_entry(ws, struct workspace, list); - struct page **pages_in = cb->compressed_pages; + struct folio **folios_in = cb->compressed_folios; size_t srclen = cb->compressed_len; zstd_dstream *stream; int ret = 0; - unsigned long page_in_index = 0; - unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); + unsigned long folio_in_index = 0; + unsigned long total_folios_in = DIV_ROUND_UP(srclen, PAGE_SIZE); unsigned long buf_start; unsigned long total_out = 0; @@ -561,7 +568,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) goto done; } - workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]); + workspace->in_buf.src = kmap_local_folio(folios_in[folio_in_index], 0); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); @@ -598,14 +605,15 @@ int 
zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (workspace->in_buf.pos == workspace->in_buf.size) { kunmap_local(workspace->in_buf.src); - page_in_index++; - if (page_in_index >= total_pages_in) { + folio_in_index++; + if (folio_in_index >= total_folios_in) { workspace->in_buf.src = NULL; ret = -EIO; goto done; } srclen -= PAGE_SIZE; - workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]); + workspace->in_buf.src = + kmap_local_folio(folios_in[folio_in_index], 0); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); } @@ -618,80 +626,48 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) } int zstd_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); + struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb); + const u32 sectorsize = fs_info->sectorsize; zstd_dstream *stream; int ret = 0; - size_t ret2; - unsigned long total_out = 0; - unsigned long pg_offset = 0; + unsigned long to_copy = 0; stream = zstd_init_dstream( ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); if (!stream) { pr_warn("BTRFS: zstd_init_dstream failed\n"); - ret = -EIO; goto finish; } - destlen = min_t(size_t, destlen, PAGE_SIZE); - workspace->in_buf.src = data_in; workspace->in_buf.pos = 0; workspace->in_buf.size = srclen; workspace->out_buf.dst = workspace->buf; workspace->out_buf.pos = 0; - workspace->out_buf.size = PAGE_SIZE; - - ret2 = 1; - while (pg_offset < destlen - && workspace->in_buf.pos < workspace->in_buf.size) { - unsigned long buf_start; - unsigned long buf_offset; - unsigned long bytes; - - /* Check if the frame is over and we still need more input */ - if (ret2 == 0) { - pr_debug("BTRFS: zstd_decompress_stream ended early\n"); - ret = -EIO; - goto finish; - } - ret2 = zstd_decompress_stream(stream, &workspace->out_buf, - &workspace->in_buf); - if (zstd_is_error(ret2)) { - pr_debug("BTRFS: zstd_decompress_stream returned %d\n", - zstd_get_error_code(ret2)); - ret = -EIO; - goto finish; - } - - buf_start = total_out; - total_out += workspace->out_buf.pos; - workspace->out_buf.pos = 0; - - if (total_out <= start_byte) - continue; - - if (total_out > start_byte && buf_start < start_byte) - buf_offset = start_byte - buf_start; - else - buf_offset = 0; - - bytes = min_t(unsigned long, destlen - pg_offset, - workspace->out_buf.size - buf_offset); - - memcpy_to_page(dest_page, pg_offset, - workspace->out_buf.dst + buf_offset, bytes); - - pg_offset += bytes; + workspace->out_buf.size = sectorsize; + + /* + * Since both input and output buffers should not exceed one sector, + * one call should end the decompression. + */ + ret = zstd_decompress_stream(stream, &workspace->out_buf, &workspace->in_buf); + if (zstd_is_error(ret)) { + pr_warn_ratelimited("BTRFS: zstd_decompress_stream return %d\n", + zstd_get_error_code(ret)); + goto finish; } - ret = 0; + to_copy = workspace->out_buf.pos; + memcpy_to_page(dest_page, dest_pgoff, workspace->out_buf.dst, to_copy); finish: - if (pg_offset < destlen) { - memzero_page(dest_page, pg_offset, destlen - pg_offset); + /* Error or early end. 
*/ + if (unlikely(to_copy < destlen)) { + ret = -EIO; + memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy); } return ret; } diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7669d154c05e0c..a96d23db6a3872 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2215,6 +2215,7 @@ static int ext4_fill_es_cache_info(struct inode *inode, (__u64)es.es_lblk << blksize_bits, (__u64)es.es_pblk << blksize_bits, (__u64)es.es_len << blksize_bits, + (__u64)es.es_len << blksize_bits, flags); if (next == 0) break; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 26e317696b3389..775ae73b6641e1 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1906,7 +1906,8 @@ static int f2fs_xattr_fiemap(struct inode *inode, if (!xnid) flags |= FIEMAP_EXTENT_LAST; - err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags); + err = fiemap_fill_next_extent( + fieinfo, 0, phys, len, len, flags); trace_f2fs_fiemap(inode, 0, phys, len, flags, err); if (err) return err; @@ -1932,7 +1933,8 @@ static int f2fs_xattr_fiemap(struct inode *inode, } if (phys) { - err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags); + err = fiemap_fill_next_extent( + fieinfo, 0, phys, len, len, flags); trace_f2fs_fiemap(inode, 0, phys, len, flags, err); } @@ -2051,7 +2053,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, flags |= FIEMAP_EXTENT_DATA_ENCRYPTED; ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); + phys, size, size, flags); trace_f2fs_fiemap(inode, logical, phys, size, flags, ret); if (ret) goto out; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index ac00423f117b5c..825b51978c56b9 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -806,7 +806,8 @@ int f2fs_inline_data_fiemap(struct inode *inode, byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits; byteaddr += (char *)inline_data_addr(inode, ipage) - (char *)F2FS_INODE(ipage); - err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags); + err = fiemap_fill_next_extent( + fieinfo, start, byteaddr, ilen, ilen, flags); trace_f2fs_fiemap(inode, start, byteaddr, ilen, flags, err); out: f2fs_put_page(ipage, 1); diff --git a/fs/ioctl.c b/fs/ioctl.c index 76cf22ac97d762..4f5e75feec2de5 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -99,7 +99,8 @@ static int ioctl_fibmap(struct file *filp, int __user *p) * @fieinfo: Fiemap context passed into ->fiemap * @logical: Extent logical start offset, in bytes * @phys: Extent physical start offset, in bytes - * @len: Extent length, in bytes + * @log_len: Extent logical length, in bytes + * @phys_len: Extent physical length, in bytes * @flags: FIEMAP_EXTENT flags that describe this extent * * Called from file system ->fiemap callback. Will populate extent @@ -110,7 +111,7 @@ static int ioctl_fibmap(struct file *filp, int __user *p) * extent that will fit in user array. 
*/ int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical, - u64 phys, u64 len, u32 flags) + u64 phys, u64 log_len, u64 phys_len, u32 flags) { struct fiemap_extent extent; struct fiemap_extent __user *dest = fieinfo->fi_extents_start; @@ -138,7 +139,8 @@ int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical, memset(&extent, 0, sizeof(extent)); extent.fe_logical = logical; extent.fe_physical = phys; - extent.fe_length = len; + extent.fe_length = log_len; + extent.fe_physical_length = phys_len; extent.fe_flags = flags; dest += fieinfo->fi_extents_mapped; diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c index 610ca6f1ec9b6d..30c5f14f908fbc 100644 --- a/fs/iomap/fiemap.c +++ b/fs/iomap/fiemap.c @@ -36,7 +36,7 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, return fiemap_fill_next_extent(fi, iomap->offset, iomap->addr != IOMAP_NULL_ADDR ? iomap->addr : 0, - iomap->length, flags); + iomap->length, iomap->length, flags); } static loff_t iomap_fiemap_iter(const struct iomap_iter *iter, diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 9c334c722fc1c1..fc8acfbcfbdf0a 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -1196,7 +1196,8 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (size) { /* End of the current extent */ ret = fiemap_fill_next_extent( - fieinfo, logical, phys, size, flags); + fieinfo, logical, phys, size, size, + flags); if (ret) break; } @@ -1246,7 +1247,8 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, flags |= FIEMAP_EXTENT_LAST; ret = fiemap_fill_next_extent( - fieinfo, logical, phys, size, flags); + fieinfo, logical, phys, size, size, + flags); if (ret) break; size = 0; @@ -1262,7 +1264,7 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, /* Terminate the current extent */ ret = fiemap_fill_next_extent( fieinfo, logical, phys, size, - flags); + size, flags); if (ret || blkoff > end_blkoff) break; diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index 7f27382e0ce25b..ba443e0b8d2b0d 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -1948,6 +1948,7 @@ int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo, err = fiemap_fill_next_extent( fieinfo, 0, 0, attr ? le32_to_cpu(attr->res.data_size) : 0, + attr ? 
le32_to_cpu(attr->res.data_size) : 0, FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_LAST | FIEMAP_EXTENT_MERGED); goto out; @@ -2042,7 +2043,7 @@ int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo, flags |= FIEMAP_EXTENT_LAST; err = fiemap_fill_next_extent(fieinfo, vbo, lbo, dlen, - flags); + dlen, flags); if (err < 0) break; if (err == 1) { @@ -2062,7 +2063,8 @@ int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo, if (vbo + bytes >= end) flags |= FIEMAP_EXTENT_LAST; - err = fiemap_fill_next_extent(fieinfo, vbo, lbo, bytes, flags); + err = fiemap_fill_next_extent(fieinfo, vbo, lbo, bytes, bytes, + flags); if (err < 0) break; if (err == 1) { diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 70a768b623cf40..553b1695671d6f 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -723,7 +723,7 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, id2.i_data.id_data); ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count, - flags); + id_count, flags); if (ret < 0) return ret; } @@ -794,7 +794,7 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits; ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes, - len_bytes, fe_flags); + len_bytes, len_bytes, fe_flags); if (ret) break; diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 4695433fcf397f..0d47ea5c440048 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -3880,6 +3880,7 @@ static int smb3_fiemap(struct cifs_tcon *tcon, le64_to_cpu(out_data[i].file_offset), le64_to_cpu(out_data[i].file_offset), le64_to_cpu(out_data[i].length), + le64_to_cpu(out_data[i].length), flags); if (rc < 0) goto out; diff --git a/include/linux/fiemap.h b/include/linux/fiemap.h index c50882f19235a2..17a6c32cdf3f6b 100644 --- a/include/linux/fiemap.h +++ b/include/linux/fiemap.h @@ -16,6 +16,6 @@ struct fiemap_extent_info { int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 *len, u32 supported_flags); int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical, - u64 phys, u64 len, u32 flags); + u64 phys, u64 log_len, u64 phys_len, u32 flags); #endif /* _LINUX_FIEMAP_H 1 */ diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index f8bc34a6bcfa2f..cdf6ad872149cd 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -92,6 +92,7 @@ struct btrfs_qgroup_limit { * struct btrfs_qgroup_inherit.flags */ #define BTRFS_QGROUP_INHERIT_SET_LIMITS (1ULL << 0) +#define BTRFS_QGROUP_INHERIT_FLAGS_SUPP (BTRFS_QGROUP_INHERIT_SET_LIMITS) struct btrfs_qgroup_inherit { __u64 flags; diff --git a/include/uapi/linux/fiemap.h b/include/uapi/linux/fiemap.h index 24ca0c00cae362..fd3c7d380666a1 100644 --- a/include/uapi/linux/fiemap.h +++ b/include/uapi/linux/fiemap.h @@ -15,13 +15,23 @@ #include struct fiemap_extent { - __u64 fe_logical; /* logical offset in bytes for the start of - * the extent from the beginning of the file */ - __u64 fe_physical; /* physical offset in bytes for the start - * of the extent from the beginning of the disk */ - __u64 fe_length; /* length in bytes for this extent */ - __u64 fe_reserved64[2]; - __u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */ + /* + * logical offset in bytes for the start of + * the extent from the beginning of the file + */ + __u64 fe_logical; + /* + * physical offset in bytes for the start + * of the extent from the beginning 
of the disk + */ + __u64 fe_physical; + /* length in bytes for this extent */ + __u64 fe_length; + /* physical length in bytes for this extent */ + __u64 fe_physical_length; + __u64 fe_reserved64[1]; + /* FIEMAP_EXTENT_* flags for this extent */ + __u32 fe_flags; __u32 fe_reserved[3]; };
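With the extended helper, in-kernel ->fiemap implementations pass the logical and physical lengths separately, which matters for filesystems that pack data on disk (for example compressed extents, where the on-disk size is smaller than the file range it covers). The following is a minimal, hypothetical sketch — not part of this series — of how such an extent might be reported; the helper name example_report_compressed_extent and the byte sizes are made up, and only the fiemap_fill_next_extent() signature follows the patch above.

#include <linux/fiemap.h>
#include <linux/sizes.h>
#include <linux/types.h>

/*
 * Hypothetical helper: report one compressed extent where 1 MiB of file
 * data (log_len) is stored in 128 KiB on disk (phys_len).  The extent is
 * flagged as encoded so userspace knows the two lengths differ on purpose.
 */
static int example_report_compressed_extent(struct fiemap_extent_info *fieinfo,
					    u64 file_offset, u64 disk_offset)
{
	const u64 log_len = SZ_1M;	/* bytes of file data covered */
	const u64 phys_len = SZ_128K;	/* bytes actually used on disk */

	return fiemap_fill_next_extent(fieinfo, file_offset, disk_offset,
				       log_len, phys_len,
				       FIEMAP_EXTENT_ENCODED);
}

Callers that do not track a separate physical size simply pass the same value for both lengths, as the ext4, f2fs, nilfs2, ntfs3, ocfs2, smb and iomap conversions in this series do.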
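To illustrate the new uapi field from the consumer side, here is a small userspace sketch (not part of the series) that issues FS_IOC_FIEMAP and prints fe_physical_length next to fe_length for each returned extent. It assumes the updated <linux/fiemap.h> from this series is installed; with an older header the fe_physical_length member does not exist and the program will not compile.

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
	struct fiemap *fm;
	unsigned int i;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Room for up to 32 extents in a single call. */
	fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
	if (!fm) {
		perror("calloc");
		return 1;
	}
	fm->fm_start = 0;
	fm->fm_length = FIEMAP_MAX_OFFSET;
	fm->fm_flags = FIEMAP_FLAG_SYNC;
	fm->fm_extent_count = 32;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
		perror("FS_IOC_FIEMAP");
		return 1;
	}

	for (i = 0; i < fm->fm_mapped_extents; i++) {
		struct fiemap_extent *fe = &fm->fm_extents[i];

		/* fe_physical_length is the field added by this series. */
		printf("extent %u: logical=%llu physical=%llu len=%llu phys_len=%llu flags=0x%x\n",
		       i,
		       (unsigned long long)fe->fe_logical,
		       (unsigned long long)fe->fe_physical,
		       (unsigned long long)fe->fe_length,
		       (unsigned long long)fe->fe_physical_length,
		       fe->fe_flags);
	}

	free(fm);
	close(fd);
	return 0;
}

For filesystems that have not been converted to report a distinct physical length, the two printed lengths are expected to be identical, since the in-kernel callers updated above pass the same value twice.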