diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 4fb925e8c981d8..ead317f1eeb859 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -78,6 +78,15 @@ config BTRFS_ASSERT
 
 	  If unsure, say N.
 
+config BTRFS_EXPERIMENTAL
+	bool "Btrfs experimental features"
+	depends on BTRFS_FS
+	help
+	  Enable experimental features. These features may not be stable enough
+	  for end users. This is meant for btrfs developers only.
+
+	  If unsure, say N.
+
 config BTRFS_FS_REF_VERIFY
 	bool "Btrfs with the ref verify tool compiled in"
 	depends on BTRFS_FS
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index f8e1d5b2c5128a..d0492b1c9486cb 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1127,6 +1127,7 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx,
 		if (ret)
 			return ret;
 		ptr += btrfs_extent_inline_ref_size(type);
+		cond_resched();
 	}
 
 	return 0;
@@ -1230,7 +1231,7 @@ static int add_keyed_refs(struct btrfs_backref_walk_ctx *ctx,
 		}
 		if (ret)
 			return ret;
-
+		cond_resched();
 	}
 
 	return ret;
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 7e0f9600b80c43..7583a9b74e22b1 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -450,6 +450,14 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
 		(unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
 		dev->devid, bio->bi_iter.bi_size);
 
+	/*
+	 * Track reads if tracking is enabled; ignore I/O operations submitted
+	 * before fs_info is fully initialized.
+	 */
+	if (dev->fs_devices->fs_stats && bio_op(bio) == REQ_OP_READ && dev->fs_info)
+		percpu_counter_add(&dev->fs_info->stats_read_blocks,
+				   bio->bi_iter.bi_size >> dev->fs_info->sectorsize_bits);
+
 	if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT)
 		blkcg_punt_bio_submit(bio);
 	else
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b11bfe68dd65fb..277490cc5ae24d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1259,6 +1259,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
 {
 	struct percpu_counter *em_counter = &fs_info->evictable_extent_maps;
 
+	percpu_counter_destroy(&fs_info->stats_read_blocks);
 	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
 	percpu_counter_destroy(&fs_info->delalloc_bytes);
 	percpu_counter_destroy(&fs_info->ordered_bytes);
@@ -2858,6 +2859,10 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
 	if (ret)
 		return ret;
 
+	ret = percpu_counter_init(&fs_info->stats_read_blocks, 0, GFP_KERNEL);
+	if (ret)
+		return ret;
+
 	fs_info->dirty_metadata_batch = PAGE_SIZE * (1 + ilog2(nr_cpu_ids));
@@ -3324,6 +3329,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	fs_info->sectors_per_page = (PAGE_SIZE >> fs_info->sectorsize_bits);
 	fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
 	fs_info->stripesize = stripesize;
+	fs_info->fs_devices->fs_info = fs_info;
 
 	/*
 	 * Handle the space caching options appropriately now that we have the
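[Annotation] The read accounting added to btrfs_submit_dev_bio() above counts sectors, not bytes: the bio size is shifted right by sectorsize_bits. A minimal userspace model of that conversion; SECTORSIZE_BITS and the sample bi_size are invented for the example and are not part of the patch:

    /* Illustrative model of the stats_read_blocks accounting. */
    #include <assert.h>

    #define SECTORSIZE_BITS 12	/* 4 KiB sectors, i.e. ilog2(4096) */

    static unsigned long long read_blocks(unsigned int bi_size)
    {
    	/* Same shift as bio->bi_iter.bi_size >> sectorsize_bits. */
    	return bi_size >> SECTORSIZE_BITS;
    }

    int main(void)
    {
    	/* A 128 KiB read contributes 32 blocks to stats_read_blocks. */
    	assert(read_blocks(128 * 1024) == 32);
    	return 0;
    }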
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 79f64e383eddf8..8960e141886b3e 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -625,6 +625,9 @@ struct btrfs_fs_info {
 	struct kobject *qgroups_kobj;
 	struct kobject *discard_kobj;
 
+	/* Track the number of blocks (sectors) read by the filesystem. */
+	struct percpu_counter stats_read_blocks;
+
 	/* Used to keep from writing metadata until there is a nice batch */
 	struct percpu_counter dirty_metadata_bytes;
 	struct percpu_counter delalloc_bytes;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index c64d0713412231..ea5ff01881d706 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2468,6 +2468,9 @@ static __cold void btrfs_interface_exit(void)
 static int __init btrfs_print_mod_info(void)
 {
 	static const char options[] = ""
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+			", experimental=on"
+#endif
 #ifdef CONFIG_BTRFS_DEBUG
 			", debug=on"
 #endif
@@ -2488,7 +2491,17 @@ static int __init btrfs_print_mod_info(void)
 			", fsverity=no"
 #endif
 			;
+
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+	if (btrfs_get_mod_read_policy() == NULL)
+		pr_info("Btrfs loaded%s\n", options);
+	else
+		pr_info("Btrfs loaded%s, read_policy=%s\n",
+			options, btrfs_get_mod_read_policy());
+#else
 	pr_info("Btrfs loaded%s\n", options);
+#endif
+
 	return 0;
 }
 
@@ -2546,6 +2559,11 @@ static const struct init_sequence mod_init_seq[] = {
 	}, {
 		.init_func = extent_map_init,
 		.exit_func = extent_map_exit,
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+	}, {
+		.init_func = btrfs_read_policy_init,
+		.exit_func = NULL,
+#endif
 	}, {
 		.init_func = ordered_data_init,
 		.exit_func = ordered_data_exit,
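[Annotation] The module parameter documented above takes a "policy[:value]" string, split at the first colon. A userspace sketch of that split, mirroring what btrfs_read_policy_to_enum() does in the next file; the sample input is invented:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    int main(void)
    {
    	char param[32] = "round-robin:262144";
    	char *value_str = strchr(param, ':');
    	long long value = -1;

    	if (value_str) {
    		*value_str++ = '\0';	/* terminate the policy name */
    		value = strtoll(value_str, NULL, 10);
    	}
    	/* Prints: policy "round-robin", value 262144 */
    	printf("policy \"%s\", value %lld\n", param, value);
    	return 0;
    }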
"" : " "), - btrfs_read_policy_name[i]); - else - ret += sysfs_emit_at(buf, ret, "%s%s", - (ret == 0 ? "" : " "), - btrfs_read_policy_name[i]); + if (ret != 0) + ret += sysfs_emit_at(buf, ret, " "); + + if (i == policy) + ret += sysfs_emit_at(buf, ret, "["); + + ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]); + +#ifdef CONFIG_BTRFS_EXPERIMENTAL + if (i == BTRFS_READ_POLICY_LATENCY_RR) + ret += sysfs_emit_at(buf, ret, ":%d", + READ_ONCE(fs_devices->rr_min_contig_read)); + + if (i == BTRFS_READ_POLICY_RR) + ret += sysfs_emit_at(buf, ret, ":%d", + READ_ONCE(fs_devices->rr_min_contig_read)); + + if (i == BTRFS_READ_POLICY_DEVID) + ret += sysfs_emit_at(buf, ret, ":%llu", + READ_ONCE(fs_devices->read_devid)); +#endif + if (i == policy) + ret += sysfs_emit_at(buf, ret, "]"); } ret += sysfs_emit_at(buf, ret, "\n"); @@ -1336,21 +1414,87 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, const char *buf, size_t len) { struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); - int i; + int index; + s64 value = -1; - for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { - if (sysfs_streq(buf, btrfs_read_policy_name[i])) { - if (i != READ_ONCE(fs_devices->read_policy)) { - WRITE_ONCE(fs_devices->read_policy, i); - btrfs_info(fs_devices->fs_info, - "read policy set to '%s'", - btrfs_read_policy_name[i]); + index = btrfs_read_policy_to_enum(buf, &value); + if (index < 0) + return -EINVAL; + +#ifdef CONFIG_BTRFS_EXPERIMENTAL + /* If moving out of RR then disable fs_stats */ + if (fs_devices->read_policy == BTRFS_READ_POLICY_RR && + index != BTRFS_READ_POLICY_RR) + fs_devices->fs_stats = false; + + if (fs_devices->read_policy == BTRFS_READ_POLICY_LATENCY_RR && + index != BTRFS_READ_POLICY_LATENCY_RR) + fs_devices->fs_stats = false; + + if ((index == BTRFS_READ_POLICY_RR) || (index == BTRFS_READ_POLICY_LATENCY_RR)) { + if (value != -1) { + u32 sectorsize = fs_devices->fs_info->sectorsize; + + if (!IS_ALIGNED(value, sectorsize)) { + u64 temp_value = round_up(value, sectorsize); + + btrfs_warn(fs_devices->fs_info, +"read_policy: min contiguous read %lld should be multiples of the sectorsize %u, rounded to %llu", + value, sectorsize, temp_value); + value = temp_value; } - return len; + } else { + value = BTRFS_DEFAULT_RR_MIN_CONTIG_READ; + } + + if (index != READ_ONCE(fs_devices->read_policy) || + value != READ_ONCE(fs_devices->rr_min_contig_read)) { + WRITE_ONCE(fs_devices->read_policy, index); + WRITE_ONCE(fs_devices->rr_min_contig_read, value); + + btrfs_info(fs_devices->fs_info, "read policy set to '%s:%lld'", + btrfs_read_policy_name[index], value); } + + fs_devices->fs_stats = true; + + return len; } - return -EINVAL; + if (index == BTRFS_READ_POLICY_DEVID) { + + if (value != -1) { + BTRFS_DEV_LOOKUP_ARGS(args); + + /* Validate input devid */ + args.devid = value; + if (btrfs_find_device(fs_devices, &args) == NULL) + return -EINVAL; + } else { + /* Set default devid to the devid of the latest device */ + value = fs_devices->latest_dev->devid; + } + + if (index != READ_ONCE(fs_devices->read_policy) || + (value != READ_ONCE(fs_devices->read_devid))) { + WRITE_ONCE(fs_devices->read_policy, index); + WRITE_ONCE(fs_devices->read_devid, value); + + btrfs_info(fs_devices->fs_info, "read policy set to '%s:%llu'", + btrfs_read_policy_name[index], value); + + } + + return len; + } +#endif + if (index != READ_ONCE(fs_devices->read_policy)) { + WRITE_ONCE(fs_devices->read_policy, index); + btrfs_info(fs_devices->fs_info, "read policy set to '%s'", + btrfs_read_policy_name[index]); + } + + return 
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index e6a284c59809c9..e83efc44e30071 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -47,5 +47,10 @@ void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info);
 int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info);
 void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,
 				struct btrfs_qgroup *qgroup);
+int btrfs_read_policy_to_enum(const char *str, s64 *value);
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+int __init btrfs_read_policy_init(void);
+char *btrfs_get_mod_read_policy(void);
+#endif
 
 #endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index eb51b609190fb5..5b1e5e12384c2b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -12,6 +12,9 @@
 #include <linux/uuid.h>
 #include <linux/list_sort.h>
 #include <linux/namei.h>
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+#include <linux/part_stat.h>
+#endif
 #include "misc.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -184,6 +187,21 @@ enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags
 	return BTRFS_BG_FLAG_TO_INDEX(profile);
 }
 
+#define BTRFS_DEV_ALLOCATION_MASK ((1ULL << \
+				    BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT) - 1)
+#define BTRFS_DEV_ALLOCATION_MASK_COUNT (1ULL << \
+					 BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT)
+
+static const char alloc_hint_map[BTRFS_DEV_ALLOCATION_MASK_COUNT] = {
+	[BTRFS_DEV_ALLOCATION_NONE_ONLY] = -99,
+	[BTRFS_DEV_ALLOCATION_DATA_ONLY] = -1,
+	[BTRFS_DEV_ALLOCATION_PREFERRED_DATA] = 0,
+	[BTRFS_DEV_ALLOCATION_PREFERRED_METADATA] = 1,
+	[BTRFS_DEV_ALLOCATION_METADATA_ONLY] = 2,
+	[BTRFS_DEV_ALLOCATION_PREFERRED_NONE] = 99,
+	/* The other values are set to 0. */
+};
+
 const char *btrfs_bg_type_to_raid_name(u64 flags)
 {
 	const int index = btrfs_bg_flags_to_raid_index(flags);
@@ -1190,6 +1208,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
 	struct btrfs_device *device;
 	struct btrfs_device *latest_dev = NULL;
 	struct btrfs_device *tmp_device;
+	s64 __maybe_unused value = 0;
 	int ret = 0;
 
 	list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
@@ -1219,7 +1238,22 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
 	fs_devices->latest_dev = latest_dev;
 	fs_devices->total_rw_bytes = 0;
 	fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+	fs_devices->rr_min_contig_read = BTRFS_DEFAULT_RR_MIN_CONTIG_READ;
+	fs_devices->read_devid = latest_dev->devid;
+	fs_devices->read_policy = btrfs_read_policy_to_enum(btrfs_get_mod_read_policy(),
+							    &value);
+	if (fs_devices->read_policy == BTRFS_READ_POLICY_RR ||
+	    fs_devices->read_policy == BTRFS_READ_POLICY_LATENCY_RR)
+		fs_devices->fs_stats = true;
+	if (value) {
+		if (fs_devices->read_policy == BTRFS_READ_POLICY_RR ||
+		    fs_devices->read_policy == BTRFS_READ_POLICY_LATENCY_RR)
+			fs_devices->rr_min_contig_read = value;
+		if (fs_devices->read_policy == BTRFS_READ_POLICY_DEVID)
+			fs_devices->read_devid = value;
+	}
+#else
 	fs_devices->read_policy = BTRFS_READ_POLICY_PID;
+#endif
 
 	return 0;
 }
@@ -2882,7 +2916,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 	return ret;
 }
 
-static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
+noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
 				 struct btrfs_device *device)
 {
 	int ret;
@@ -5022,13 +5056,18 @@ static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
 }
 
 /*
- * sort the devices in descending order by max_avail, total_avail
+ * sort the devices in descending order by alloc_hint, max_avail and
+ * total_avail
  */
 static int btrfs_cmp_device_info(const void *a, const void *b)
 {
 	const struct btrfs_device_info *di_a = a;
 	const struct btrfs_device_info *di_b = b;
 
+	if (di_a->alloc_hint > di_b->alloc_hint)
+		return -1;
+	if (di_a->alloc_hint < di_b->alloc_hint)
+		return 1;
 	if (di_a->max_avail > di_b->max_avail)
 		return -1;
 	if (di_a->max_avail < di_b->max_avail)
@@ -5181,6 +5220,8 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices,
 	int ndevs = 0;
 	u64 max_avail;
 	u64 dev_offset;
+	int hint;
+	int i;
 
 	/*
 	 * in the first pass through the devices list, we gather information
@@ -5233,16 +5274,104 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices,
 		devices_info[ndevs].max_avail = max_avail;
 		devices_info[ndevs].total_avail = total_avail;
 		devices_info[ndevs].dev = device;
+
+		if ((ctl->type & BTRFS_BLOCK_GROUP_DATA) &&
+		    (ctl->type & BTRFS_BLOCK_GROUP_METADATA)) {
+			/*
+			 * For mixed block groups, set all the alloc_hint
+			 * fields to the same value, so the sorting is not
+			 * affected.
+			 */
+			devices_info[ndevs].alloc_hint = 0;
+		} else if (ctl->type & BTRFS_BLOCK_GROUP_DATA) {
+			hint = device->type & BTRFS_DEV_ALLOCATION_MASK;
+
+			/* Skip BTRFS_DEV_ALLOCATION_METADATA_ONLY disks. */
+			if (hint == BTRFS_DEV_ALLOCATION_METADATA_ONLY)
+				continue;
+			/* Skip BTRFS_DEV_ALLOCATION_NONE_ONLY disks. */
+			if (hint == BTRFS_DEV_ALLOCATION_NONE_ONLY)
+				continue;
+			/*
+			 * If a data chunk must be allocated, sort also by hint
+			 * (data disks get higher priority).
+			 */
+			devices_info[ndevs].alloc_hint = -alloc_hint_map[hint];
+		} else { /* BTRFS_BLOCK_GROUP_METADATA */
+			hint = device->type & BTRFS_DEV_ALLOCATION_MASK;
+
+			/* Skip BTRFS_DEV_ALLOCATION_DATA_ONLY disks. */
+			if (hint == BTRFS_DEV_ALLOCATION_DATA_ONLY)
+				continue;
+			/* Skip BTRFS_DEV_ALLOCATION_NONE_ONLY disks. */
+			if (hint == BTRFS_DEV_ALLOCATION_NONE_ONLY)
+				continue;
+			/*
+			 * If a metadata chunk must be allocated, sort also by
+			 * hint (metadata disks get higher priority).
+			 */
+			if (hint == BTRFS_DEV_ALLOCATION_PREFERRED_NONE)
+				devices_info[ndevs].alloc_hint = -alloc_hint_map[hint];
+			else
+				devices_info[ndevs].alloc_hint = alloc_hint_map[hint];
+		}
+
 		++ndevs;
 	}
 	ctl->ndevs = ndevs;
 
+	/* No devices available. */
+	if (!ndevs)
+		return 0;
+
 	/*
 	 * now sort the devices by hole size / available space
 	 */
 	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
 	     btrfs_cmp_device_info, NULL);
 
+	/*
+	 * Select the minimum set of disks, grouped by hint, that can host the
+	 * chunk.
+	 */
+	ndevs = 0;
+	while (ndevs < ctl->ndevs) {
+		hint = devices_info[ndevs++].alloc_hint;
+		while (ndevs < ctl->ndevs &&
+		       devices_info[ndevs].alloc_hint == hint)
+			ndevs++;
+		if (ndevs >= ctl->devs_min)
+			break;
+	}
+
+	BUG_ON(ndevs > ctl->ndevs);
+	ctl->ndevs = ndevs;
+
+	/*
+	 * The next layers require the devices_info ordered by max_avail.
+	 * If we are returning two (or more) different groups of alloc_hint,
+	 * this is not always true, so sort them again.
+	 */
+	for (i = 0; i < ndevs; i++)
+		devices_info[i].alloc_hint = 0;
+
+	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
+	     btrfs_cmp_device_info, NULL);
+
 	return 0;
 }
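[Annotation] The comparator above sorts on alloc_hint first and only then on free space, so a device hinted for the chunk type wins even with less room. A userspace model of that two-key ordering; the sample hint/avail values are invented:

    #include <stdio.h>
    #include <stdlib.h>

    struct dev_info {
    	int alloc_hint;
    	unsigned long long max_avail;
    };

    static int cmp(const void *a, const void *b)
    {
    	const struct dev_info *da = a, *db = b;

    	/* alloc_hint descending, then max_avail descending. */
    	if (da->alloc_hint != db->alloc_hint)
    		return da->alloc_hint > db->alloc_hint ? -1 : 1;
    	if (da->max_avail != db->max_avail)
    		return da->max_avail > db->max_avail ? -1 : 1;
    	return 0;
    }

    int main(void)
    {
    	/* Metadata chunk: hint 2 = METADATA_ONLY, 1 = PREFERRED_METADATA, 0 = PREFERRED_DATA. */
    	struct dev_info devs[] = { { 0, 300 }, { 2, 100 }, { 1, 200 } };

    	qsort(devs, 3, sizeof(devs[0]), cmp);
    	/* Result: hint 2 first, then 1, then 0, regardless of max_avail. */
    	for (int i = 0; i < 3; i++)
    		printf("hint %d avail %llu\n", devs[i].alloc_hint, devs[i].max_avail);
    	return 0;
    }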
@@ -5860,6 +5989,241 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 	return ret;
 }
 
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+static int btrfs_read_preferred(struct btrfs_chunk_map *map, int first,
+				int num_stripes)
+{
+	int last = first + num_stripes;
+	int stripe_index;
+
+	for (stripe_index = first; stripe_index < last; stripe_index++) {
+		struct btrfs_device *device = map->stripes[stripe_index].dev;
+
+		if (device->devid == READ_ONCE(device->fs_devices->read_devid))
+			return stripe_index;
+	}
+
+	/* If there is no read-preferred device, use the first stripe. */
+	return first;
+}
+
+#define BTRFS_DEVICE_LATENCY_CHECKPOINT_AGE	30000
+
+/*
+ * Compute the average read latency of a device by dividing the total read
+ * latency by the number of read IOs, using the deltas since the last
+ * checkpoint.  A device whose checkpoint is too old reports a latency of
+ * zero.
+ */
+static u64 btrfs_device_read_latency(struct btrfs_device *device)
+{
+	u64 avg_wait = 0;
+
+	if (likely(device->bdev)) {
+		s64 last_io_age = atomic64_read(&device->last_io_age);
+
+		if (likely(last_io_age >= 0 &&
+			   last_io_age < BTRFS_DEVICE_LATENCY_CHECKPOINT_AGE)) {
+			u64 read_wait = part_stat_read(device->bdev, nsecs[READ]);
+			u64 last_nsecs_read = (u64)atomic64_read(&device->last_nsecs_read);
+			unsigned long read_ios = part_stat_read(device->bdev, ios[READ]);
+			unsigned long last_ios_read = (unsigned long)atomic64_read(&device->last_ios_read);
+			s64 delta_read_wait = read_wait - last_nsecs_read;
+			s64 delta_read_ios = read_ios - last_ios_read;
+
+			if (delta_read_wait > 0 && delta_read_ios > 0 &&
+			    delta_read_wait >= delta_read_ios)
+				avg_wait = div_u64(delta_read_wait, delta_read_ios);
+		}
+	}
+
+	return avg_wait;
+}
+
+/*
+ * Select a stripe for reading using the average latency: store the minimum
+ * latency and the selected stripe in best_wait/best_stripe.
+ *
+ * Always finds at least one stripe.
+ */
+static void btrfs_best_stripe(struct btrfs_fs_info *fs_info,
+			      struct btrfs_chunk_map *map, int first,
+			      int num_stripes, u64 *best_wait, int *best_stripe)
+{
+	*best_wait = U64_MAX;
+	*best_stripe = first;
+
+	for (int index = first; index < first + num_stripes; index++) {
+		u64 avg_wait = btrfs_device_read_latency(map->stripes[index].dev);
+
+		if (*best_wait > avg_wait) {
+			*best_wait = avg_wait;
+			*best_stripe = index;
+		}
+	}
+}
+
+static unsigned int part_in_flight(struct block_device *part)
+{
+	unsigned int inflight = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		inflight += part_stat_local_read_cpu(part, in_flight[0], cpu) +
+			    part_stat_local_read_cpu(part, in_flight[1], cpu);
+	}
+	if ((int)inflight < 0)
+		inflight = 0;
+
+	return inflight;
+}
+
+/*
+ * Select the stripe on the device with the fewest in-flight read requests.
+ */
+static int btrfs_read_earliest(struct btrfs_fs_info *fs_info,
+			       struct btrfs_chunk_map *map, int first,
+			       int num_stripes)
+{
+	u64 best_in_flight = U64_MAX;
+	int best_stripe = first;
+
+	for (int index = first; index < first + num_stripes; index++) {
+		struct block_device *part = map->stripes[index].dev->bdev;
+		u64 in_flight = part ? part_in_flight(part) : 0;
+
+		if (best_in_flight > in_flight) {
+			best_in_flight = in_flight;
+			best_stripe = index;
+		}
+	}
+
+	return best_stripe;
+}
+
+static int btrfs_read_fastest(struct btrfs_fs_info *fs_info,
+			      struct btrfs_chunk_map *map, int first,
+			      int num_stripes)
+{
+	u64 best_wait;
+	int best_stripe;
+
+	btrfs_best_stripe(fs_info, map, first, num_stripes, &best_wait,
+			  &best_stripe);
+
+	return best_stripe;
+}
+
+struct stripe_mirror {
+	u64 devid;
+	int num;
+};
+
+static int btrfs_cmp_devid(const void *a, const void *b)
+{
+	const struct stripe_mirror *s1 = (const struct stripe_mirror *)a;
+	const struct stripe_mirror *s2 = (const struct stripe_mirror *)b;
+
+	if (s1->devid < s2->devid)
+		return -1;
+	if (s1->devid > s2->devid)
+		return 1;
+	return 0;
+}
+
+/*
+ * Select a stripe for reading using a round-robin algorithm:
+ *
+ * 1. Compute the read cycle as the total sectors read divided by the minimum
+ *    sectors per device.
+ * 2. Determine the stripe number for the current read by taking the modulus
+ *    of the read cycle with the total number of stripes:
+ *
+ *        stripe index = (total sectors / min sectors per dev) % num stripes
+ *
+ * The calculated stripe index is then used to select the corresponding device
+ * from the list of devices, which is ordered by devid.
+ */
+static int btrfs_read_rr(struct btrfs_chunk_map *map, int first, int num_stripes,
+			 u64 min_latency)
+{
+	struct stripe_mirror stripes[BTRFS_RAID1_MAX_MIRRORS] = { 0 };
+	struct btrfs_device *device = map->stripes[first].dev;
+	struct btrfs_fs_info *fs_info = device->fs_devices->fs_info;
+	unsigned int read_cycle;
+	unsigned int total_reads;
+	unsigned int min_reads_per_dev;
+	int count_stripes = 0;
+
+	total_reads = percpu_counter_sum(&fs_info->stats_read_blocks);
+	min_reads_per_dev = READ_ONCE(fs_info->fs_devices->rr_min_contig_read) >>
+			    fs_info->sectorsize_bits;
+
+	for (int i = first; i < first + num_stripes; i++) {
+		if (min_latency > 0) {
+			u64 avg_wait = btrfs_device_read_latency(map->stripes[i].dev);
+
+			if (min_latency < avg_wait)
+				continue;
+		}
+
+		stripes[count_stripes].devid = map->stripes[i].dev->devid;
+		stripes[count_stripes].num = i;
+		count_stripes++;
+	}
+
+	/*
+	 * If the caller passed a minimum latency and the filter left no
+	 * stripes, return -1 to indicate that no stripe qualified.
+	 */
+	if (unlikely(min_latency && !count_stripes))
+		return -1;
+
+	sort(stripes, count_stripes, sizeof(struct stripe_mirror),
+	     btrfs_cmp_devid, NULL);
+
+	read_cycle = total_reads / min_reads_per_dev;
+	return stripes[read_cycle % count_stripes].num;
+}
+
+/*
+ * Select a stripe for reading using a hybrid algorithm:
+ *
+ * 1. Determine the fastest stripe using btrfs_best_stripe().
+ * 2. Add 25% headroom to the measured latency.
+ * 3. Select a stripe using btrfs_read_rr(), filtered by latency.
+ */
+static int btrfs_read_fastest_rr(struct btrfs_fs_info *fs_info,
+				 struct btrfs_chunk_map *map, int first,
+				 int num_stripes)
+{
+	u64 min_latency;
+	int ret_stripe = -1;
+
+	/* Find the lowest latency of all stripes first. */
+	btrfs_best_stripe(fs_info, map, first, num_stripes, &min_latency,
+			  &ret_stripe);
+
+	/*
+	 * min_latency will be 0 if no latency has been recorded yet.
+	 * Otherwise add 25% headroom and round-robin among the fast stripes
+	 * only.
+	 */
+	if (likely(min_latency)) {
+		min_latency += (min_latency >> 2);
+		ret_stripe = btrfs_read_rr(map, first, num_stripes, min_latency);
+	}
+
+	/* Retry with the default round-robin if no stripe has been found. */
+	if (unlikely(ret_stripe < 0))
+		ret_stripe = btrfs_read_rr(map, first, num_stripes, 0);
+
+	return ret_stripe;
+}
+#endif
+
+#define BTRFS_DEVICE_LATENCY_CHECKPOINT_BURST_IO	30
 static int find_live_mirror(struct btrfs_fs_info *fs_info,
 			    struct btrfs_chunk_map *map, int first,
 			    int dev_replace_is_ongoing)
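[Annotation] The round-robin arithmetic in btrfs_read_rr() above can be checked numerically. A worked example with the default 256 KiB rr_min_contig_read and 4 KiB sectors; the concrete numbers are assumed for illustration:

    #include <assert.h>

    int main(void)
    {
    	unsigned int sectorsize_bits = 12;		/* 4 KiB sectors */
    	unsigned int min_contig_read = 256 * 1024;	/* default BTRFS_DEFAULT_RR_MIN_CONTIG_READ */
    	unsigned int min_reads_per_dev = min_contig_read >> sectorsize_bits;
    	unsigned int num_stripes = 2;

    	assert(min_reads_per_dev == 64);
    	/* After 63 sectors read in total, still on stripe 0 ... */
    	assert((63 / min_reads_per_dev) % num_stripes == 0);
    	/* ... after 64 sectors, the cycle advances to stripe 1. */
    	assert((64 / min_reads_per_dev) % num_stripes == 1);
    	return 0;
    }

So each device serves roughly min_contig_read worth of reads before the next mirror, in devid order, takes over.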
@@ -5879,6 +6243,14 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
 	else
 		num_stripes = map->num_stripes;
 
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+	/* Age each candidate stripe by one IO. */
+	for (int i = first; i < first + num_stripes; i++) {
+		atomic64_inc(&map->stripes[i].dev->last_io_age);
+		atomic64_inc(&map->stripes[i].dev->stripe_ignored);
+	}
+#endif
+
 	switch (policy) {
 	default:
 		/* Shouldn't happen, just warn and use pid instead of failing */
@@ -5889,6 +6261,26 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
 	case BTRFS_READ_POLICY_PID:
 		preferred_mirror = first + (current->pid % num_stripes);
 		break;
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+	case BTRFS_READ_POLICY_RR:
+		preferred_mirror = btrfs_read_rr(map, first, num_stripes, 0);
+		break;
+	case BTRFS_READ_POLICY_DEVID:
+		preferred_mirror = btrfs_read_preferred(map, first, num_stripes);
+		break;
+	case BTRFS_READ_POLICY_LATENCY:
+		preferred_mirror = btrfs_read_fastest(fs_info, map, first,
+						      num_stripes);
+		break;
+	case BTRFS_READ_POLICY_LATENCY_RR:
+		preferred_mirror = btrfs_read_fastest_rr(fs_info, map, first,
+							 num_stripes);
+		break;
+	case BTRFS_READ_POLICY_QUEUE:
+		preferred_mirror = btrfs_read_earliest(fs_info, map, first,
+						       num_stripes);
+		break;
+#endif
 	}
 
 	if (dev_replace_is_ongoing &&
@@ -5906,14 +6298,40 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
 	for (tolerance = 0; tolerance < 2; tolerance++) {
 		if (map->stripes[preferred_mirror].dev->bdev &&
 		    (tolerance || map->stripes[preferred_mirror].dev != srcdev))
-			return preferred_mirror;
+			goto out;
 		for (i = first; i < first + num_stripes; i++) {
 			if (map->stripes[i].dev->bdev &&
-			    (tolerance || map->stripes[i].dev != srcdev))
-				return i;
+			    (tolerance || map->stripes[i].dev != srcdev)) {
+				preferred_mirror = i;
+				goto out;
+			}
 		}
 	}
 
+out:
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+	do {
+		/* Reset the age of the selected stripe. */
+		s64 current_age;
+		struct btrfs_device *pref_dev = map->stripes[preferred_mirror].dev;
+
+		spin_lock(&pref_dev->latency_lock);
+
+		current_age = atomic64_read(&pref_dev->last_io_age);
+		if (current_age >= BTRFS_DEVICE_LATENCY_CHECKPOINT_AGE && pref_dev->bdev) {
+			atomic64_inc(&pref_dev->checkpoints);
+			atomic64_set(&pref_dev->last_io_age,
+				     -BTRFS_DEVICE_LATENCY_CHECKPOINT_BURST_IO);
+			atomic64_set(&pref_dev->last_nsecs_read,
+				     part_stat_read(pref_dev->bdev, nsecs[READ]));
+			atomic64_set(&pref_dev->last_ios_read,
+				     part_stat_read(pref_dev->bdev, ios[READ]));
+		} else if (current_age > 0) {
+			atomic64_set(&pref_dev->last_io_age, 0);
+		}
+		atomic64_dec(&pref_dev->stripe_ignored);
+
+		spin_unlock(&pref_dev->latency_lock);
+	} while (0);
+#endif
+
 	/* we couldn't find one that doesn't fail.  Just return something
 	 * and the io error handling code will clean up eventually
 	 */
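[Annotation] The checkpoint logic above works on deltas: each device remembers the part_stat values at its last checkpoint, and btrfs_device_read_latency() averages the difference. A worked example with invented cumulative counters (userspace model, not kernel code):

    #include <assert.h>

    int main(void)
    {
    	/* Cumulative device stats now vs. at the last checkpoint. */
    	unsigned long long read_wait = 5000000, last_nsecs = 2000000;	/* ns */
    	unsigned long read_ios = 1500, last_ios = 1000;

    	long long delta_wait = read_wait - last_nsecs;	/* 3000000 ns */
    	long long delta_ios = read_ios - last_ios;	/* 500 IOs */
    	unsigned long long avg_wait = 0;

    	if (delta_wait > 0 && delta_ios > 0 && delta_wait >= delta_ios)
    		avg_wait = delta_wait / delta_ios;
    	assert(avg_wait == 6000);	/* 6 us average per read */
    	return 0;
    }

A stripe that goes unselected for BTRFS_DEVICE_LATENCY_CHECKPOINT_AGE (30000) selections is considered stale and reports zero latency until it is picked again and re-checkpointed.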
@@ -7467,8 +7885,6 @@ int btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
 	struct btrfs_device *device;
 	int ret = 0;
 
-	fs_devices->fs_info = fs_info;
-
 	mutex_lock(&fs_devices->device_list_mutex);
 	list_for_each_entry(device, &fs_devices->devices, dev_list)
 		device->fs_info = fs_info;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 4481575dd70f35..2c43365cff5132 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -185,7 +185,7 @@ struct btrfs_device {
 	 * enum btrfs_dev_stat_values in ioctl.h
 	 */
 	int dev_stats_valid;
 
-	/* Counter to record the change of device stats */
+	/* Counter to record the number of changes to the device stats */
 	atomic_t dev_stats_ccnt;
 	atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
 
@@ -197,6 +197,20 @@ struct btrfs_device {
 
 	/* Bandwidth limit for scrub, in bytes */
 	u64 scrub_speed_max;
+
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+	/* Age of the last read access. */
+	atomic64_t last_io_age;
+	atomic64_t checkpoints;
+	atomic64_t stripe_ignored;
+
+	/* Lock taken while updating the latency checkpoint values. */
+	spinlock_t latency_lock;
+
+	/* Last latency values, for the short-term latency calculation. */
+	atomic64_t last_nsecs_read;
+	atomic64_t last_ios_read;
+#endif
 };
 
 /*
@@ -296,6 +310,8 @@ enum btrfs_chunk_allocation_policy {
 	BTRFS_CHUNK_ALLOC_ZONED,
 };
 
+#define BTRFS_DEFAULT_RR_MIN_CONTIG_READ	(SZ_256K)
+#define BTRFS_RAID1_MAX_MIRRORS			(4)
+
 /*
  * Read policies for mirrored block group profiles, read picks the stripe based
  * on these policies.
@@ -303,6 +319,18 @@ enum btrfs_chunk_allocation_policy {
 enum btrfs_read_policy {
 	/* Use process PID to choose the stripe */
 	BTRFS_READ_POLICY_PID,
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+	/* Balance RAID1 reads across all mirror devices (round-robin). */
+	BTRFS_READ_POLICY_RR,
+	/* Use the lowest-latency device dynamically. */
+	BTRFS_READ_POLICY_LATENCY,
+	/* Use a hybrid of lowest-latency and round-robin. */
+	BTRFS_READ_POLICY_LATENCY_RR,
+	/* Read from the device with the fewest in-flight requests. */
+	BTRFS_READ_POLICY_QUEUE,
+	/* Read from a specific device. */
+	BTRFS_READ_POLICY_DEVID,
+#endif
 	BTRFS_NR_READ_POLICY,
 };
 
@@ -417,6 +445,8 @@ struct btrfs_fs_devices {
 	bool seeding;
 	/* The mount needs to use a randomly generated fsid. */
 	bool temp_fsid;
+	/* Enable/disable the filesystem stats tracking. */
+	bool fs_stats;
 
 	struct btrfs_fs_info *fs_info;
 	/* sysfs kobjects */
@@ -430,6 +460,14 @@ struct btrfs_fs_devices {
 
 	/* Policy used to read the mirrored stripes. */
 	enum btrfs_read_policy read_policy;
+
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+	/* Minimum contiguous reads before switching to the next device. */
+	int rr_min_contig_read;
+
+	/* Device to be used for reading in case of RAID1. */
+	u64 read_devid;
+#endif
+
 #ifdef CONFIG_BTRFS_DEBUG
 	/* Checksum mode - offload it or do it synchronously. */
 	enum btrfs_offload_csum_mode offload_csum_mode;
@@ -565,6 +603,7 @@ struct btrfs_device_info {
 	u64 dev_offset;
 	u64 max_avail;
 	u64 total_avail;
+	int alloc_hint;
 };
 
 struct btrfs_raid_attr {
@@ -836,6 +875,8 @@ int btrfs_bg_type_to_factor(u64 flags);
 const char *btrfs_bg_type_to_raid_name(u64 flags);
 int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info);
 bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical);
+int btrfs_update_device(struct btrfs_trans_handle *trans,
+			struct btrfs_device *device);
 bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
 const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb);
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index fc29d273845d84..3db20734aacfc6 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -578,6 +578,24 @@ struct btrfs_node {
 	struct btrfs_key_ptr ptrs[];
 } __attribute__ ((__packed__));
 
+/* dev_item.type */
+
+/* btrfs chunk allocation hints */
+#define BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT	3
+/* Preferred for data chunks, but metadata chunks allowed. */
+#define BTRFS_DEV_ALLOCATION_PREFERRED_DATA	(0ULL)
+/* Preferred for metadata chunks, but data chunks allowed. */
+#define BTRFS_DEV_ALLOCATION_PREFERRED_METADATA	(1ULL)
+/* Only metadata chunks are allowed. */
+#define BTRFS_DEV_ALLOCATION_METADATA_ONLY	(2ULL)
+/* Only data chunks are allowed. */
+#define BTRFS_DEV_ALLOCATION_DATA_ONLY		(3ULL)
+/* Preferably no chunks, but any chunk type allowed. */
+#define BTRFS_DEV_ALLOCATION_PREFERRED_NONE	(4ULL)
+/* No chunks allowed. */
+#define BTRFS_DEV_ALLOCATION_NONE_ONLY		(5ULL)
+/* Values 6..7 are unused. */
+
 struct btrfs_dev_item {
 	/* the internal btrfs device id */
 	__le64 devid;
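[Annotation] The new UAPI constants above occupy only the low three bits of dev_item.type, which is also what the devinfo type sysfs attribute reads and writes. A small userspace decoder; the helper name and the string table are invented for illustration:

    #include <assert.h>
    #include <string.h>

    #define BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT 3

    static const char *hint_name(unsigned long long type)
    {
    	static const char * const names[] = {
    		"preferred_data", "preferred_metadata", "metadata_only",
    		"data_only", "preferred_none", "none_only", "unused", "unused",
    	};

    	/* Mask off everything but the low three allocation-hint bits. */
    	return names[type & ((1ULL << BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT) - 1)];
    }

    int main(void)
    {
    	/* e.g. a device whose type field reads back as 0x1 from sysfs. */
    	assert(strcmp(hint_name(0x1), "preferred_metadata") == 0);
    	return 0;
    }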