From 3d59f4a13c2ce240b2994ba28e876244dd94cb45 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 18 Jul 2022 00:52:18 +0200 Subject: [PATCH 01/37] lib-string.c-Optimize-memchr Signed-off-by: Peter Jung --- lib/string.c | 62 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 17 deletions(-) diff --git a/lib/string.c b/lib/string.c index 3371d26a0e390c..db397ee8b0ad0d 100644 --- a/lib/string.c +++ b/lib/string.c @@ -874,24 +874,61 @@ char *strnstr(const char *s1, const char *s2, size_t len) EXPORT_SYMBOL(strnstr); #endif +#if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64 + +#define MEMCHR_MASK_GEN(mask) (mask *= 0x0101010101010101ULL) + +#elif defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) + +#define MEMCHR_MASK_GEN(mask) \ + do { \ + mask *= 0x01010101; \ + mask |= mask << 32; \ + } while (0) + +#else + +#define MEMCHR_MASK_GEN(mask) \ + do { \ + mask |= mask << 8; \ + mask |= mask << 16; \ + mask |= mask << 32; \ + } while (0) + +#endif + #ifndef __HAVE_ARCH_MEMCHR /** * memchr - Find a character in an area of memory. - * @s: The memory area + * @p: The memory area * @c: The byte to search for - * @n: The size of the area. + * @length: The size of the area. * * returns the address of the first occurrence of @c, or %NULL * if @c is not found */ -void *memchr(const void *s, int c, size_t n) +void *memchr(const void *p, int c, unsigned long length) { - const unsigned char *p = s; - while (n-- != 0) { - if ((unsigned char)c == *p++) { - return (void *)(p - 1); + u64 mask, val; + const void *end = p + length; + + c &= 0xff; + if (p <= end - 8) { + mask = c; + MEMCHR_MASK_GEN(mask); + + for (; p <= end - 8; p += 8) { + val = *(u64 *)p ^ mask; + if ((val + 0xfefefefefefefeffu) & + (~val & 0x8080808080808080u)) + break; } } + + for (; p < end; p++) + if (*(unsigned char *)p == c) + return (void *)p; + return NULL; } EXPORT_SYMBOL(memchr); @@ -927,16 +964,7 @@ void *memchr_inv(const void *start, int c, size_t bytes) return check_bytes8(start, value, bytes); value64 = value; -#if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64 - value64 *= 0x0101010101010101ULL; -#elif defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) - value64 *= 0x01010101; - value64 |= value64 << 32; -#else - value64 |= value64 << 8; - value64 |= value64 << 16; - value64 |= value64 << 32; -#endif + MEMCHR_MASK_GEN(value64); prefix = (unsigned long)start % 8; if (prefix) { From 7e6eb7d2a8659da831d8edc85a21ff4e5a1aa368 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 18 Jul 2022 01:34:46 +0200 Subject: [PATCH 02/37] ZEN-Add-sysctl-and-CONFIG-to-disallow-unprivileged-C Signed-off-by: Peter Jung --- include/linux/user_namespace.h | 4 ++++ init/Kconfig | 16 ++++++++++++++++ kernel/fork.c | 14 ++++++++++++++ kernel/sysctl.c | 12 ++++++++++++ kernel/user_namespace.c | 7 +++++++ 5 files changed, 53 insertions(+) diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 45f09bec02c485..87b20e2ee27445 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -148,6 +148,8 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns, #ifdef CONFIG_USER_NS +extern int unprivileged_userns_clone; + static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { if (ns) @@ -181,6 +183,8 @@ extern bool current_in_userns(const struct user_namespace *target_ns); struct ns_common *ns_get_owner(struct ns_common *ns); #else +#define unprivileged_userns_clone 0 + static inline struct user_namespace *get_user_ns(struct 
user_namespace *ns) { return &init_user_ns; diff --git a/init/Kconfig b/init/Kconfig index 94125d3b6893c7..9f7139b536f638 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1247,6 +1247,22 @@ config USER_NS If unsure, say N. +config USER_NS_UNPRIVILEGED + bool "Allow unprivileged users to create namespaces" + default y + depends on USER_NS + help + When disabled, unprivileged users will not be able to create + new namespaces. Allowing users to create their own namespaces + has been part of several recent local privilege escalation + exploits, so if you need user namespaces but are + paranoid^Wsecurity-conscious you want to disable this. + + This setting can be overridden at runtime via the + kernel.unprivileged_userns_clone sysctl. + + If unsure, say Y. + config PID_NS bool "PID Namespaces" default y diff --git a/kernel/fork.c b/kernel/fork.c index 08969f5aa38d59..ff601cb7a1fae0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -98,6 +98,10 @@ #include #include +#ifdef CONFIG_USER_NS +#include <linux/user_namespace.h> +#endif + #include #include #include @@ -2008,6 +2012,10 @@ static __latent_entropy struct task_struct *copy_process( if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) return ERR_PTR(-EINVAL); + if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + /* * Thread groups must share signals as well, and detached threads * can only be started up within the thread group. @@ -3166,6 +3174,12 @@ int ksys_unshare(unsigned long unshare_flags) if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; + if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) { + err = -EPERM; + if (!capable(CAP_SYS_ADMIN)) + goto bad_unshare_out; + } + err = check_unshare_flags(unshare_flags); if (err) goto bad_unshare_out; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c6d9dec11b749d..1c7c7c95387609 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -96,6 +96,9 @@ EXPORT_SYMBOL_GPL(sysctl_long_vals); #ifdef CONFIG_PERF_EVENTS static const int six_hundred_forty_kb = 640 * 1024; #endif +#ifdef CONFIG_USER_NS +#include <linux/user_namespace.h> +#endif static const int ngroups_max = NGROUPS_MAX; @@ -1659,6 +1662,15 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_USER_NS + { + .procname = "unprivileged_userns_clone", + .data = &unprivileged_userns_clone, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif #ifdef CONFIG_PROC_SYSCTL { .procname = "tainted", diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 54211dbd516c57..16ca0c1516298d 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -22,6 +22,13 @@ #include #include +/* sysctl */ +#ifdef CONFIG_USER_NS_UNPRIVILEGED +int unprivileged_userns_clone = 1; +#else +int unprivileged_userns_clone; +#endif + static struct kmem_cache *user_ns_cachep __read_mostly; static DEFINE_MUTEX(userns_state_mutex); From f1f7abe4994a63c6b82072a5eff419c17ce7a41a Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Tue, 9 Aug 2022 12:18:55 +0200 Subject: [PATCH 03/37] Revert "kbuild: drop support for CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3" This reverts commit be9f7d7b1d23f003eba3e77010002076b68d888f.
Signed-off-by: Peter Jung --- Makefile | 3 +++ arch/arc/configs/axs101_defconfig | 1 + arch/arc/configs/axs103_defconfig | 1 + arch/arc/configs/axs103_smp_defconfig | 1 + arch/arc/configs/haps_hs_defconfig | 1 + arch/arc/configs/haps_hs_smp_defconfig | 1 + arch/arc/configs/hsdk_defconfig | 1 + arch/arc/configs/nsim_700_defconfig | 1 + arch/arc/configs/nsimosci_defconfig | 1 + arch/arc/configs/nsimosci_hs_defconfig | 1 + arch/arc/configs/nsimosci_hs_smp_defconfig | 1 + arch/arc/configs/tb10x_defconfig | 1 + arch/arc/configs/vdk_hs38_defconfig | 1 + arch/arc/configs/vdk_hs38_smp_defconfig | 1 + init/Kconfig | 7 +++++++ 15 files changed, 23 insertions(+) diff --git a/Makefile b/Makefile index 997b6772229207..98c9a974b8c2ae 100644 --- a/Makefile +++ b/Makefile @@ -823,6 +823,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE KBUILD_CFLAGS += -O2 KBUILD_RUSTFLAGS += -Copt-level=2 +else ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 +KBUILD_CFLAGS += -O3 +KBUILD_RUSTFLAGS += -Copt-level=3 else ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE KBUILD_CFLAGS += -Os KBUILD_RUSTFLAGS += -Copt-level=s diff --git a/arch/arc/configs/axs101_defconfig b/arch/arc/configs/axs101_defconfig index 81764160451f7f..2c15d3bf747a99 100644 --- a/arch/arc/configs/axs101_defconfig +++ b/arch/arc/configs/axs101_defconfig @@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/axs103_defconfig b/arch/arc/configs/axs103_defconfig index d5181275490edf..7d868e148d9a45 100644 --- a/arch/arc/configs/axs103_defconfig +++ b/arch/arc/configs/axs103_defconfig @@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/axs103_smp_defconfig b/arch/arc/configs/axs103_smp_defconfig index 2f336d99a8cf35..777a9f21eb6b6a 100644 --- a/arch/arc/configs/axs103_smp_defconfig +++ b/arch/arc/configs/axs103_smp_defconfig @@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/haps_hs_defconfig b/arch/arc/configs/haps_hs_defconfig index 899b2fd5c71d12..bda15a87684902 100644 --- a/arch/arc/configs/haps_hs_defconfig +++ b/arch/arc/configs/haps_hs_defconfig @@ -11,6 +11,7 @@ CONFIG_NAMESPACES=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_EXPERT=y CONFIG_PERF_EVENTS=y # CONFIG_COMPAT_BRK is not set diff --git a/arch/arc/configs/haps_hs_smp_defconfig b/arch/arc/configs/haps_hs_smp_defconfig index 0d32aac8069f26..dbd74fea69aa51 100644 --- a/arch/arc/configs/haps_hs_smp_defconfig +++ b/arch/arc/configs/haps_hs_smp_defconfig @@ -11,6 +11,7 @@ CONFIG_NAMESPACES=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/hsdk_defconfig b/arch/arc/configs/hsdk_defconfig index d18378d2c2a63e..2396ca4171826c 100644 --- a/arch/arc/configs/hsdk_defconfig +++ 
b/arch/arc/configs/hsdk_defconfig @@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y CONFIG_BLK_DEV_RAM=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/nsim_700_defconfig b/arch/arc/configs/nsim_700_defconfig index 3e982977599253..5044609540cc38 100644 --- a/arch/arc/configs/nsim_700_defconfig +++ b/arch/arc/configs/nsim_700_defconfig @@ -11,6 +11,7 @@ CONFIG_NAMESPACES=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_KALLSYMS_ALL=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y diff --git a/arch/arc/configs/nsimosci_defconfig b/arch/arc/configs/nsimosci_defconfig index 502c87f351c870..748c809d1c4c6f 100644 --- a/arch/arc/configs/nsimosci_defconfig +++ b/arch/arc/configs/nsimosci_defconfig @@ -10,6 +10,7 @@ CONFIG_NAMESPACES=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_KALLSYMS_ALL=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y diff --git a/arch/arc/configs/nsimosci_hs_defconfig b/arch/arc/configs/nsimosci_hs_defconfig index f721cc3997d02c..205c32b0074ca5 100644 --- a/arch/arc/configs/nsimosci_hs_defconfig +++ b/arch/arc/configs/nsimosci_hs_defconfig @@ -10,6 +10,7 @@ CONFIG_NAMESPACES=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_KALLSYMS_ALL=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y diff --git a/arch/arc/configs/nsimosci_hs_smp_defconfig b/arch/arc/configs/nsimosci_hs_smp_defconfig index 1419fc946a083c..2477b7c8097712 100644 --- a/arch/arc/configs/nsimosci_hs_smp_defconfig +++ b/arch/arc/configs/nsimosci_hs_smp_defconfig @@ -8,6 +8,7 @@ CONFIG_IKCONFIG_PROC=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_PERF_EVENTS=y # CONFIG_COMPAT_BRK is not set CONFIG_KPROBES=y diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig index 6f0d2be9d926ce..cf02ad0fc210ea 100644 --- a/arch/arc/configs/tb10x_defconfig +++ b/arch/arc/configs/tb10x_defconfig @@ -14,6 +14,7 @@ CONFIG_INITRAMFS_SOURCE="../tb10x-rootfs.cpio" CONFIG_INITRAMFS_ROOT_UID=2100 CONFIG_INITRAMFS_ROOT_GID=501 # CONFIG_RD_GZIP is not set +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_KALLSYMS_ALL=y # CONFIG_AIO is not set CONFIG_EMBEDDED=y diff --git a/arch/arc/configs/vdk_hs38_defconfig b/arch/arc/configs/vdk_hs38_defconfig index d3ef189c75f8b8..922b1b24f5184e 100644 --- a/arch/arc/configs/vdk_hs38_defconfig +++ b/arch/arc/configs/vdk_hs38_defconfig @@ -4,6 +4,7 @@ CONFIG_HIGH_RES_TIMERS=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/vdk_hs38_smp_defconfig b/arch/arc/configs/vdk_hs38_smp_defconfig index 944b347025fd1a..ed64319f7eb298 100644 --- a/arch/arc/configs/vdk_hs38_smp_defconfig +++ b/arch/arc/configs/vdk_hs38_smp_defconfig @@ -4,6 +4,7 @@ CONFIG_HIGH_RES_TIMERS=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/init/Kconfig b/init/Kconfig index 9f7139b536f638..988d4f16a27ac0 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1430,6 +1430,13 @@ config 
CC_OPTIMIZE_FOR_PERFORMANCE with the "-O2" compiler flag for best performance and most helpful compile-time warnings. +config CC_OPTIMIZE_FOR_PERFORMANCE_O3 + bool "Optimize more for performance (-O3)" + depends on ARC + help + Choosing this option will pass "-O3" to your compiler to optimize + the kernel yet more for performance. + config CC_OPTIMIZE_FOR_SIZE bool "Optimize for size (-Os)" help From 43ac6f515cb4252c8fe402607abbb6166fb49ea8 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 17 Oct 2022 01:58:55 +0200 Subject: [PATCH 04/37] some cfs/cachy tweaks Signed-off-by: Peter Jung --- block/elevator.c | 7 ++++++- drivers/md/dm-crypt.c | 5 +++++ include/linux/pagemap.h | 2 +- init/Kconfig | 4 ++++ kernel/Kconfig.hz | 24 ++++++++++++++++++++++++ kernel/sched/fair.c | 20 +++++++++++++++++++- mm/compaction.c | 4 ++++ mm/page-writeback.c | 8 ++++++++ mm/vmpressure.c | 4 ++++ mm/vmscan.c | 4 ++++ 10 files changed, 79 insertions(+), 3 deletions(-) diff --git a/block/elevator.c b/block/elevator.c index bd71f0fc4e4b67..389cb51389afc5 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -640,8 +640,13 @@ static struct elevator_type *elevator_get_default(struct request_queue *q) if (q->nr_hw_queues != 1 && !blk_mq_is_shared_tags(q->tag_set->flags)) +#if defined(CONFIG_CACHY) && defined(CONFIG_MQ_IOSCHED_KYBER) + return elevator_get(q, "kyber", false); +#elif defined(CONFIG_CACHY) + return elevator_get(q, "mq-deadline", false); +#else return NULL; - +#endif return elevator_get(q, "mq-deadline", false); } diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 2653516bcdef50..cdf9d8c7b556af 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -3137,6 +3137,11 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar } } +#ifdef CONFIG_CACHY + set_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags); + set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags); +#endif + return 0; } diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index bbccb404422247..0a11325c35fa1b 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1181,7 +1181,7 @@ struct readahead_control { ._index = i, \ } -#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) +#define VM_READAHEAD_PAGES (SZ_8M / PAGE_SIZE) void page_cache_ra_unbounded(struct readahead_control *, unsigned long nr_to_read, unsigned long lookahead_count); diff --git a/init/Kconfig b/init/Kconfig index 988d4f16a27ac0..c23c83d912c629 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -123,6 +123,10 @@ config THREAD_INFO_IN_TASK menu "General setup" +config CACHY + bool "Some kernel tweaks by CachyOS" + default y + config BROKEN bool diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 38ef6d06888ef1..0f78364efd4f23 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -40,6 +40,27 @@ choice on SMP and NUMA systems and exactly dividing by both PAL and NTSC frame rates for video and multimedia work. + config HZ_500 + bool "500 HZ" + help + 500 Hz is a balanced timer frequency. Provides fast interactivity + on desktops with good smoothness without increasing CPU power + consumption and sacrificing the battery life on laptops. + + config HZ_600 + bool "600 HZ" + help + 600 Hz is a balanced timer frequency. Provides fast interactivity + on desktops with good smoothness without increasing CPU power + consumption and sacrificing the battery life on laptops. + + config HZ_750 + bool "750 HZ" + help + 750 Hz is a balanced timer frequency. 
Provides fast interactivity + on desktops with good smoothness without increasing CPU power + consumption and sacrificing the battery life on laptops. + config HZ_1000 bool "1000 HZ" help @@ -53,6 +74,9 @@ config HZ default 100 if HZ_100 default 250 if HZ_250 default 300 if HZ_300 + default 500 if HZ_500 + default 600 if HZ_600 + default 750 if HZ_750 default 1000 if HZ_1000 config SCHED_HRTICK diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e4a0b8bd941c78..621e445329b084 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -69,9 +69,13 @@ * * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) */ +#ifdef CONFIG_CACHY +unsigned int sysctl_sched_latency = 3000000ULL; +static unsigned int normalized_sysctl_sched_latency = 3000000ULL; +#else unsigned int sysctl_sched_latency = 6000000ULL; static unsigned int normalized_sysctl_sched_latency = 6000000ULL; - +#endif /* * The initial- and re-scaling of tunables is configurable * @@ -90,8 +94,13 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; * * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) */ +#ifdef CONFIG_CACHY +unsigned int sysctl_sched_min_granularity = 400000ULL; +static unsigned int normalized_sysctl_sched_min_granularity = 400000ULL; +#else unsigned int sysctl_sched_min_granularity = 750000ULL; static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; +#endif /* * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks. @@ -121,8 +130,13 @@ unsigned int sysctl_sched_child_runs_first __read_mostly; * * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) */ +#ifdef CONFIG_CACHY +unsigned int sysctl_sched_wakeup_granularity = 500000UL; +static unsigned int normalized_sysctl_sched_wakeup_granularity = 500000UL; +#else unsigned int sysctl_sched_wakeup_granularity = 1000000UL; static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; +#endif const_debug unsigned int sysctl_sched_migration_cost = 500000UL; @@ -175,8 +189,12 @@ int __weak arch_asym_cpu_priority(int cpu) * * (default: 5 msec, units: microseconds) */ +#ifdef CONFIG_CACHY +static unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; +#else static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif +#endif #ifdef CONFIG_SYSCTL static struct ctl_table sched_fair_sysctls[] = { diff --git a/mm/compaction.c b/mm/compaction.c index 1f6da31dd9a501..07ce9525d7ac3e 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2724,7 +2724,11 @@ static void compact_nodes(void) * aggressively the kernel should compact memory in the * background. It takes values in the range [0, 100]. 
*/ +#ifdef CONFIG_CACHY +unsigned int __read_mostly sysctl_compaction_proactiveness; +#else unsigned int __read_mostly sysctl_compaction_proactiveness = 20; +#endif int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7e9d8d857eccae..5cd27d0871b6f6 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -70,7 +70,11 @@ static long ratelimit_pages = 32; /* * Start background writeback (via writeback threads) at this percentage */ +#ifdef CONFIG_CACHY +static int dirty_background_ratio = 5; +#else static int dirty_background_ratio = 10; +#endif /* * dirty_background_bytes starts at 0 (disabled) so that it is a function of @@ -98,7 +102,11 @@ static unsigned long vm_dirty_bytes; /* * The interval between `kupdate'-style writebacks */ +#ifdef CONFIG_CACHY +unsigned int dirty_writeback_interval = 10 * 100; /* centiseconds */ +#else unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */ +#endif EXPORT_SYMBOL_GPL(dirty_writeback_interval); diff --git a/mm/vmpressure.c b/mm/vmpressure.c index b52644771cc438..11a4b0e3b583ce 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -43,7 +43,11 @@ static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16; * essence, they are percents: the higher the value, the more number * unsuccessful reclaims there were. */ +#ifdef CONFIG_CACHY +static const unsigned int vmpressure_level_med = 65; +#else static const unsigned int vmpressure_level_med = 60; +#endif static const unsigned int vmpressure_level_critical = 95; /* diff --git a/mm/vmscan.c b/mm/vmscan.c index 8fcc5fa768c07c..c058a45dd43887 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -189,7 +189,11 @@ struct scan_control { /* * From 0 .. 200. Higher means more swappy. 
*/ +#ifdef CONFIG_CACHY +int vm_swappiness = 20; +#else int vm_swappiness = 60; +#endif static void set_task_reclaim_state(struct task_struct *task, struct reclaim_state *rs) From 705ed6d3cf4648c30fc55ab5206b084b88e3f6e7 Mon Sep 17 00:00:00 2001 From: graysky Date: Fri, 4 Nov 2022 15:34:36 -0400 Subject: [PATCH 05/37] more uarches for kernel 5.17+ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit FEATURES This patch adds additional CPU options to the Linux kernel accessible under: Processor type and features ---> Processor family ---> With the release of gcc 11.1 and clang 12.0, several generic 64-bit levels are offered which are good for supported Intel or AMD CPUs: • x86-64-v2 • x86-64-v3 • x86-64-v4 Users of glibc 2.33 and above can see which level is supported by current hardware by running: /lib/ld-linux-x86-64.so.2 --help | grep supported Alternatively, compare the flags from /proc/cpuinfo to this list.[1] CPU-specific microarchitectures include: • AMD Improved K8-family • AMD K10-family • AMD Family 10h (Barcelona) • AMD Family 14h (Bobcat) • AMD Family 16h (Jaguar) • AMD Family 15h (Bulldozer) • AMD Family 15h (Piledriver) • AMD Family 15h (Steamroller) • AMD Family 15h (Excavator) • AMD Family 17h (Zen) • AMD Family 17h (Zen 2) • AMD Family 19h (Zen 3)† • AMD Family 19h (Zen 4)§ • Intel Silvermont low-power processors • Intel Goldmont low-power processors (Apollo Lake and Denverton) • Intel Goldmont Plus low-power processors (Gemini Lake) • Intel 1st Gen Core i3/i5/i7 (Nehalem) • Intel 1.5 Gen Core i3/i5/i7 (Westmere) • Intel 2nd Gen Core i3/i5/i7 (Sandybridge) • Intel 3rd Gen Core i3/i5/i7 (Ivybridge) • Intel 4th Gen Core i3/i5/i7 (Haswell) • Intel 5th Gen Core i3/i5/i7 (Broadwell) • Intel 6th Gen Core i3/i5/i7 (Skylake) • Intel 6th Gen Core i7/i9 (Skylake X) • Intel 8th Gen Core i3/i5/i7 (Cannon Lake) • Intel 10th Gen Core i7/i9 (Ice Lake) • Intel Xeon (Cascade Lake) • Intel Xeon (Cooper Lake)* • Intel 3rd Gen 10nm++ i3/i5/i7/i9-family (Tiger Lake)* • Intel 3rd Gen 10nm++ Xeon (Sapphire Rapids)‡ • Intel 11th Gen i3/i5/i7/i9-family (Rocket Lake)‡ • Intel 12th Gen i3/i5/i7/i9-family (Alder Lake)‡ • Intel 13th Gen i3/i5/i7/i9-family (Raptor Lake)§ • Intel 14th Gen i3/i5/i7/i9-family (Meteor Lake)§ Notes: If not otherwise noted, gcc >=9.1 is required for support. *Requires gcc >=10.1 or clang >=10.0 †Requires gcc >=10.3 or clang >=12.0 ‡Requires gcc >=11.1 or clang >=12.0 §Requires gcc >=13.0 or clang >=15.0.5 It also offers to compile passing the 'native' option which, "selects the CPU to generate code for at compilation time by determining the processor type of the compiling machine. Using -march=native enables all instruction subsets supported by the local machine and will produce code optimized for the local machine under the constraints of the selected instruction set."[2] Users of Intel CPUs should select the 'Intel-Native' option and users of AMD CPUs should select the 'AMD-Native' option. MINOR NOTES RELATING TO INTEL ATOM PROCESSORS This patch also changes -march=atom to -march=bonnell in accordance with the gcc v4.9 changes. Upstream is using the deprecated -march=atom flags when I believe it should use the newer -march=bonnell flag for atom processors.[3] It is not recommended to compile on Atom-CPUs with the 'native' option.[4] The recommendation is to use the 'atom' option instead.
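The level can also be probed programmatically. Below is a minimal userspace sketch, not part of the patch itself: it assumes a compiler that provides __builtin_cpu_supports(), and it approximates each level by a few representative feature bits (the full psABI definitions list more flags), so the ld.so method above remains the authoritative check.

#include <stdio.h>

int main(void)
{
	int level = 1;	/* baseline x86-64 */

	__builtin_cpu_init();
	/* x86-64-v2 is approximated here by SSE4.2 + POPCNT. */
	if (__builtin_cpu_supports("sse4.2") && __builtin_cpu_supports("popcnt"))
		level = 2;
	/* x86-64-v3 additionally needs AVX2, BMI2 and FMA, among others. */
	if (level == 2 && __builtin_cpu_supports("avx2") &&
	    __builtin_cpu_supports("bmi2") && __builtin_cpu_supports("fma"))
		level = 3;
	/* x86-64-v4 additionally needs the AVX-512 F/BW/VL group. */
	if (level == 3 && __builtin_cpu_supports("avx512f") &&
	    __builtin_cpu_supports("avx512bw") && __builtin_cpu_supports("avx512vl"))
		level = 4;

	printf("approximate x86-64 ISA level: v%d\n", level);
	return 0;
}

A CPU that only reaches level 1 should stay with the plain Generic-x86-64 option.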
BENEFITS Small but real speed increases are measurable using a make endpoint comparing a generic kernel to one built with one of the respective microarchs. See the following experimental evidence supporting this statement: https://github.com/graysky2/kernel_gcc_patch REQUIREMENTS linux version 5.17+ gcc version >=9.0 or clang version >=9.0 ACKNOWLEDGMENTS This patch builds on the seminal work by Jeroen.[5] REFERENCES 1. https://gitlab.com/x86-psABIs/x86-64-ABI/-/commit/77566eb03bc6a326811cb7e9 2. https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html#index-x86-Options 3. https://bugzilla.kernel.org/show_bug.cgi?id=77461 4. https://github.com/graysky2/kernel_gcc_patch/issues/15 5. http://www.linuxforge.net/docs/linux/linux-gcc.php --- arch/x86/Kconfig.cpu | 416 ++++++++++++++++++++++++++++++-- arch/x86/Makefile | 43 +++- arch/x86/include/asm/vermagic.h | 72 ++++++ 3 files changed, 514 insertions(+), 17 deletions(-) diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 542377cd419d78..08d887d1220dcc 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -157,7 +157,7 @@ config MPENTIUM4 config MK6 - bool "K6/K6-II/K6-III" + bool "AMD K6/K6-II/K6-III" depends on X86_32 help Select this for an AMD K6-family processor. Enables use of @@ -165,7 +165,7 @@ config MK6 flags to GCC. config MK7 - bool "Athlon/Duron/K7" + bool "AMD Athlon/Duron/K7" depends on X86_32 help Select this for an AMD Athlon K7-family processor. Enables use of @@ -173,12 +173,106 @@ config MK7 flags to GCC. config MK8 - bool "Opteron/Athlon64/Hammer/K8" + bool "AMD Opteron/Athlon64/Hammer/K8" help Select this for an AMD Opteron or Athlon64 Hammer-family processor. Enables use of some extended instructions, and passes appropriate optimization flags to GCC. +config MK8SSE3 + bool "AMD Opteron/Athlon64/Hammer/K8 with SSE3" + help + Select this for improved AMD Opteron or Athlon64 Hammer-family processors. + Enables use of some extended instructions, and passes appropriate + optimization flags to GCC. + +config MK10 + bool "AMD 61xx/7x50/PhenomX3/X4/II/K10" + help + Select this for an AMD 61xx Eight-Core Magny-Cours, Athlon X2 7x50, + Phenom X3/X4/II, Athlon II X2/X3/X4, or Turion II-family processor. + Enables use of some extended instructions, and passes appropriate + optimization flags to GCC. + +config MBARCELONA + bool "AMD Barcelona" + help + Select this for AMD Family 10h Barcelona processors. + + Enables -march=barcelona + +config MBOBCAT + bool "AMD Bobcat" + help + Select this for AMD Family 14h Bobcat processors. + + Enables -march=btver1 + +config MJAGUAR + bool "AMD Jaguar" + help + Select this for AMD Family 16h Jaguar processors. + + Enables -march=btver2 + +config MBULLDOZER + bool "AMD Bulldozer" + help + Select this for AMD Family 15h Bulldozer processors. + + Enables -march=bdver1 + +config MPILEDRIVER + bool "AMD Piledriver" + help + Select this for AMD Family 15h Piledriver processors. + + Enables -march=bdver2 + +config MSTEAMROLLER + bool "AMD Steamroller" + help + Select this for AMD Family 15h Steamroller processors. + + Enables -march=bdver3 + +config MEXCAVATOR + bool "AMD Excavator" + help + Select this for AMD Family 15h Excavator processors. + + Enables -march=bdver4 + +config MZEN + bool "AMD Zen" + help + Select this for AMD Family 17h Zen processors. + + Enables -march=znver1 + +config MZEN2 + bool "AMD Zen 2" + help + Select this for AMD Family 17h Zen 2 processors. 
+ + Enables -march=znver2 + +config MZEN3 + bool "AMD Zen 3" + depends on (CC_IS_GCC && GCC_VERSION >= 100300) || (CC_IS_CLANG && CLANG_VERSION >= 120000) + help + Select this for AMD Family 19h Zen 3 processors. + + Enables -march=znver3 + +config MZEN4 + bool "AMD Zen 4" + depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500) + help + Select this for AMD Family 19h Zen 4 processors. + + Enables -march=znver4 + config MCRUSOE bool "Crusoe" depends on X86_32 @@ -270,7 +364,7 @@ config MPSC in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one. config MCORE2 - bool "Core 2/newer Xeon" + bool "Intel Core 2" help Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and @@ -278,6 +372,8 @@ config MCORE2 family in /proc/cpuinfo. Newer ones have 6 and older ones 15 (not a typo) + Enables -march=core2 + config MATOM bool "Intel Atom" help @@ -287,6 +383,202 @@ config MATOM accordingly optimized code. Use a recent GCC with specific Atom support in order to fully benefit from selecting this option. +config MNEHALEM + bool "Intel Nehalem" + select X86_P6_NOP + help + + Select this for 1st Gen Core processors in the Nehalem family. + + Enables -march=nehalem + +config MWESTMERE + bool "Intel Westmere" + select X86_P6_NOP + help + + Select this for the Intel Westmere formerly Nehalem-C family. + + Enables -march=westmere + +config MSILVERMONT + bool "Intel Silvermont" + select X86_P6_NOP + help + + Select this for the Intel Silvermont platform. + + Enables -march=silvermont + +config MGOLDMONT + bool "Intel Goldmont" + select X86_P6_NOP + help + + Select this for the Intel Goldmont platform including Apollo Lake and Denverton. + + Enables -march=goldmont + +config MGOLDMONTPLUS + bool "Intel Goldmont Plus" + select X86_P6_NOP + help + + Select this for the Intel Goldmont Plus platform including Gemini Lake. + + Enables -march=goldmont-plus + +config MSANDYBRIDGE + bool "Intel Sandy Bridge" + select X86_P6_NOP + help + + Select this for 2nd Gen Core processors in the Sandy Bridge family. + + Enables -march=sandybridge + +config MIVYBRIDGE + bool "Intel Ivy Bridge" + select X86_P6_NOP + help + + Select this for 3rd Gen Core processors in the Ivy Bridge family. + + Enables -march=ivybridge + +config MHASWELL + bool "Intel Haswell" + select X86_P6_NOP + help + + Select this for 4th Gen Core processors in the Haswell family. + + Enables -march=haswell + +config MBROADWELL + bool "Intel Broadwell" + select X86_P6_NOP + help + + Select this for 5th Gen Core processors in the Broadwell family. + + Enables -march=broadwell + +config MSKYLAKE + bool "Intel Skylake" + select X86_P6_NOP + help + + Select this for 6th Gen Core processors in the Skylake family. + + Enables -march=skylake + +config MSKYLAKEX + bool "Intel Skylake X" + select X86_P6_NOP + help + + Select this for 6th Gen Core processors in the Skylake X family. + + Enables -march=skylake-avx512 + +config MCANNONLAKE + bool "Intel Cannon Lake" + select X86_P6_NOP + help + + Select this for 8th Gen Core processors + + Enables -march=cannonlake + +config MICELAKE + bool "Intel Ice Lake" + select X86_P6_NOP + help + + Select this for 10th Gen Core processors in the Ice Lake family. + + Enables -march=icelake-client + +config MCASCADELAKE + bool "Intel Cascade Lake" + select X86_P6_NOP + help + + Select this for Xeon processors in the Cascade Lake family. 
+ + Enables -march=cascadelake + +config MCOOPERLAKE + bool "Intel Cooper Lake" + depends on (CC_IS_GCC && GCC_VERSION > 100100) || (CC_IS_CLANG && CLANG_VERSION >= 100000) + select X86_P6_NOP + help + + Select this for Xeon processors in the Cooper Lake family. + + Enables -march=cooperlake + +config MTIGERLAKE + bool "Intel Tiger Lake" + depends on (CC_IS_GCC && GCC_VERSION > 100100) || (CC_IS_CLANG && CLANG_VERSION >= 100000) + select X86_P6_NOP + help + + Select this for third-generation 10 nm process processors in the Tiger Lake family. + + Enables -march=tigerlake + +config MSAPPHIRERAPIDS + bool "Intel Sapphire Rapids" + depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) + select X86_P6_NOP + help + + Select this for third-generation 10 nm process processors in the Sapphire Rapids family. + + Enables -march=sapphirerapids + +config MROCKETLAKE + bool "Intel Rocket Lake" + depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) + select X86_P6_NOP + help + + Select this for eleventh-generation processors in the Rocket Lake family. + + Enables -march=rocketlake + +config MALDERLAKE + bool "Intel Alder Lake" + depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) + select X86_P6_NOP + help + + Select this for twelfth-generation processors in the Alder Lake family. + + Enables -march=alderlake + +config MRAPTORLAKE + bool "Intel Raptor Lake" + depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500) + select X86_P6_NOP + help + + Select this for thirteenth-generation processors in the Raptor Lake family. + + Enables -march=raptorlake + +config MMETEORLAKE + bool "Intel Meteor Lake" + depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500) + select X86_P6_NOP + help + + Select this for fourteenth-generation processors in the Meteor Lake family. + + Enables -march=meteorlake + config GENERIC_CPU bool "Generic-x86-64" depends on X86_64 @@ -294,6 +586,50 @@ config GENERIC_CPU Generic x86-64 CPU. Run equally well on all x86-64 CPUs. +config GENERIC_CPU2 + bool "Generic-x86-64-v2" + depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) + depends on X86_64 + help + Generic x86-64 CPU. + Run equally well on all x86-64 CPUs with min support of x86-64-v2. + +config GENERIC_CPU3 + bool "Generic-x86-64-v3" + depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) + depends on X86_64 + help + Generic x86-64-v3 CPU with v3 instructions. + Run equally well on all x86-64 CPUs with min support of x86-64-v3. + +config GENERIC_CPU4 + bool "Generic-x86-64-v4" + depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) + depends on X86_64 + help + Generic x86-64 CPU with v4 instructions. + Run equally well on all x86-64 CPUs with min support of x86-64-v4. + +config MNATIVE_INTEL + bool "Intel-Native optimizations autodetected by the compiler" + help + + Clang 3.8, GCC 4.2 and above support -march=native, which automatically detects + the optimum settings to use based on your processor. Do NOT use this + for AMD CPUs. Intel Only! + + Enables -march=native + +config MNATIVE_AMD + bool "AMD-Native optimizations autodetected by the compiler" + help + + Clang 3.8, GCC 4.2 and above support -march=native, which automatically detects + the optimum settings to use based on your processor. Do NOT use this + for Intel CPUs. AMD Only! 
+ + Enables -march=native + endchoice config X86_GENERIC @@ -318,9 +654,17 @@ config X86_INTERNODE_CACHE_SHIFT config X86_L1_CACHE_SHIFT int default "7" if MPENTIUM4 || MPSC - default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU + default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || MK8SSE3 || MK10 \ + || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER \ + || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT \ + || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL \ + || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \ + || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE \ + || MNATIVE_INTEL || MNATIVE_AMD || X86_GENERIC || GENERIC_CPU || GENERIC_CPU2 || GENERIC_CPU3 \ + || GENERIC_CPU4 default "4" if MELAN || M486SX || M486 || MGEODEGX1 - default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX + default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII \ + || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX config X86_F00F_BUG def_bool y @@ -332,15 +676,27 @@ config X86_INVD_BUG config X86_ALIGNMENT_16 def_bool y - depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC || M586 || M486SX || M486 || MVIAC3_2 || MGEODEGX1 + depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC \ + || M586 || M486SX || M486 || MVIAC3_2 || MGEODEGX1 config X86_INTEL_USERCOPY def_bool y - depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 + depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC \ + || MK8 || MK7 || MEFFICEON || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT \ + || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX \ + || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS \ + || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL config X86_USE_PPRO_CHECKSUM def_bool y - depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM + depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM \ + || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX \ + || MCORE2 || MATOM || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER \ + || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM \ + || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE \ + || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE \ + || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE \ + || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL || MNATIVE_AMD # # P6_NOPs are a relatively minor optimization that require a family >= @@ -356,32 +712,62 @@ config X86_USE_PPRO_CHECKSUM config X86_P6_NOP def_bool y depends on X86_64 - depends on (MCORE2 || MPENTIUM4 || MPSC) + depends on (MCORE2 || 
MPENTIUM4 || MPSC || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT \ + || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE \ + || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE \ + || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL) config X86_TSC def_bool y - depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) || X86_64 + depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM \ + || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 \ + || MGEODE_LX || MCORE2 || MATOM || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER \ + || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM \ + || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL \ + || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \ + || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL \ + || MNATIVE_AMD) || X86_64 config X86_CMPXCHG64 def_bool y - depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586TSC || M586MMX || MATOM || MGEODE_LX || MGEODEGX1 || MK6 || MK7 || MK8 + depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \ + || M586TSC || M586MMX || MATOM || MGEODE_LX || MGEODEGX1 || MK6 || MK7 || MK8 || MK8SSE3 || MK10 \ + || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN \ + || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS \ + || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE \ + || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE \ + || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL || MNATIVE_AMD # this should be set for all -march=.. options where the compiler # generates cmov. 
config X86_CMOV def_bool y - depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX) + depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \ + || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX || MK8SSE3 || MK10 \ + || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR \ + || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT \ + || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX \ + || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS \ + || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MNATIVE_INTEL || MNATIVE_AMD) config X86_MINIMUM_CPU_FAMILY int default "64" if X86_64 - default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCRUSOE || MCORE2 || MK7 || MK8) + default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \ + || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCRUSOE || MCORE2 || MK7 || MK8 || MK8SSE3 \ + || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER \ + || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT \ + || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL \ + || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \ + || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE \ + || MNATIVE_INTEL || MNATIVE_AMD) default "5" if X86_32 && X86_CMPXCHG64 default "4" config X86_DEBUGCTLMSR def_bool y - depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486SX || M486) && !UML + depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 \ + || M486SX || M486) && !UML config IA32_FEAT_CTL def_bool y diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 415a5d138de47c..3f5fe79f03abff 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -151,8 +151,47 @@ else # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) cflags-$(CONFIG_MK8) += -march=k8 cflags-$(CONFIG_MPSC) += -march=nocona - cflags-$(CONFIG_MCORE2) += -march=core2 - cflags-$(CONFIG_MATOM) += -march=atom + cflags-$(CONFIG_MK8SSE3) += -march=k8-sse3 + cflags-$(CONFIG_MK10) += -march=amdfam10 + cflags-$(CONFIG_MBARCELONA) += -march=barcelona + cflags-$(CONFIG_MBOBCAT) += -march=btver1 + cflags-$(CONFIG_MJAGUAR) += -march=btver2 + cflags-$(CONFIG_MBULLDOZER) += -march=bdver1 + cflags-$(CONFIG_MPILEDRIVER) += -march=bdver2 -mno-tbm + cflags-$(CONFIG_MSTEAMROLLER) += -march=bdver3 -mno-tbm + cflags-$(CONFIG_MEXCAVATOR) += -march=bdver4 -mno-tbm + cflags-$(CONFIG_MZEN) += -march=znver1 + cflags-$(CONFIG_MZEN2) += -march=znver2 + cflags-$(CONFIG_MZEN3) += -march=znver3 + cflags-$(CONFIG_MZEN4) += -march=znver4 + cflags-$(CONFIG_MNATIVE_INTEL) += -march=native + cflags-$(CONFIG_MNATIVE_AMD) += -march=native + cflags-$(CONFIG_MATOM) += -march=bonnell + cflags-$(CONFIG_MCORE2) += -march=core2 + cflags-$(CONFIG_MNEHALEM) += -march=nehalem + cflags-$(CONFIG_MWESTMERE) += -march=westmere + cflags-$(CONFIG_MSILVERMONT) += -march=silvermont + cflags-$(CONFIG_MGOLDMONT) += -march=goldmont + cflags-$(CONFIG_MGOLDMONTPLUS)
+= -march=goldmont-plus + cflags-$(CONFIG_MSANDYBRIDGE) += -march=sandybridge + cflags-$(CONFIG_MIVYBRIDGE) += -march=ivybridge + cflags-$(CONFIG_MHASWELL) += -march=haswell + cflags-$(CONFIG_MBROADWELL) += -march=broadwell + cflags-$(CONFIG_MSKYLAKE) += -march=skylake + cflags-$(CONFIG_MSKYLAKEX) += -march=skylake-avx512 + cflags-$(CONFIG_MCANNONLAKE) += -march=cannonlake + cflags-$(CONFIG_MICELAKE) += -march=icelake-client + cflags-$(CONFIG_MCASCADELAKE) += -march=cascadelake + cflags-$(CONFIG_MCOOPERLAKE) += -march=cooperlake + cflags-$(CONFIG_MTIGERLAKE) += -march=tigerlake + cflags-$(CONFIG_MSAPPHIRERAPIDS) += -march=sapphirerapids + cflags-$(CONFIG_MROCKETLAKE) += -march=rocketlake + cflags-$(CONFIG_MALDERLAKE) += -march=alderlake + cflags-$(CONFIG_MRAPTORLAKE) += -march=raptorlake + cflags-$(CONFIG_MMETEORLAKE) += -march=meteorlake + cflags-$(CONFIG_GENERIC_CPU2) += -march=x86-64-v2 + cflags-$(CONFIG_GENERIC_CPU3) += -march=x86-64-v3 + cflags-$(CONFIG_GENERIC_CPU4) += -march=x86-64-v4 cflags-$(CONFIG_GENERIC_CPU) += -mtune=generic KBUILD_CFLAGS += $(cflags-y) diff --git a/arch/x86/include/asm/vermagic.h b/arch/x86/include/asm/vermagic.h index 75884d2cdec375..18021e8c0c2830 100644 --- a/arch/x86/include/asm/vermagic.h +++ b/arch/x86/include/asm/vermagic.h @@ -17,6 +17,52 @@ #define MODULE_PROC_FAMILY "586MMX " #elif defined CONFIG_MCORE2 #define MODULE_PROC_FAMILY "CORE2 " +#elif defined CONFIG_MNATIVE_INTEL +#define MODULE_PROC_FAMILY "NATIVE_INTEL " +#elif defined CONFIG_MNATIVE_AMD +#define MODULE_PROC_FAMILY "NATIVE_AMD " +#elif defined CONFIG_MNEHALEM +#define MODULE_PROC_FAMILY "NEHALEM " +#elif defined CONFIG_MWESTMERE +#define MODULE_PROC_FAMILY "WESTMERE " +#elif defined CONFIG_MSILVERMONT +#define MODULE_PROC_FAMILY "SILVERMONT " +#elif defined CONFIG_MGOLDMONT +#define MODULE_PROC_FAMILY "GOLDMONT " +#elif defined CONFIG_MGOLDMONTPLUS +#define MODULE_PROC_FAMILY "GOLDMONTPLUS " +#elif defined CONFIG_MSANDYBRIDGE +#define MODULE_PROC_FAMILY "SANDYBRIDGE " +#elif defined CONFIG_MIVYBRIDGE +#define MODULE_PROC_FAMILY "IVYBRIDGE " +#elif defined CONFIG_MHASWELL +#define MODULE_PROC_FAMILY "HASWELL " +#elif defined CONFIG_MBROADWELL +#define MODULE_PROC_FAMILY "BROADWELL " +#elif defined CONFIG_MSKYLAKE +#define MODULE_PROC_FAMILY "SKYLAKE " +#elif defined CONFIG_MSKYLAKEX +#define MODULE_PROC_FAMILY "SKYLAKEX " +#elif defined CONFIG_MCANNONLAKE +#define MODULE_PROC_FAMILY "CANNONLAKE " +#elif defined CONFIG_MICELAKE +#define MODULE_PROC_FAMILY "ICELAKE " +#elif defined CONFIG_MCASCADELAKE +#define MODULE_PROC_FAMILY "CASCADELAKE " +#elif defined CONFIG_MCOOPERLAKE +#define MODULE_PROC_FAMILY "COOPERLAKE " +#elif defined CONFIG_MTIGERLAKE +#define MODULE_PROC_FAMILY "TIGERLAKE " +#elif defined CONFIG_MSAPPHIRERAPIDS +#define MODULE_PROC_FAMILY "SAPPHIRERAPIDS " +#elif defined CONFIG_MROCKETLAKE +#define MODULE_PROC_FAMILY "ROCKETLAKE " +#elif defined CONFIG_MALDERLAKE +#define MODULE_PROC_FAMILY "ALDERLAKE " +#elif defined CONFIG_MRAPTORLAKE +#define MODULE_PROC_FAMILY "RAPTORLAKE " +#elif defined CONFIG_MMETEORLAKE +#define MODULE_PROC_FAMILY "METEORLAKE " #elif defined CONFIG_MATOM #define MODULE_PROC_FAMILY "ATOM " #elif defined CONFIG_M686 @@ -35,6 +81,32 @@ #define MODULE_PROC_FAMILY "K7 " #elif defined CONFIG_MK8 #define MODULE_PROC_FAMILY "K8 " +#elif defined CONFIG_MK8SSE3 +#define MODULE_PROC_FAMILY "K8SSE3 " +#elif defined CONFIG_MK10 +#define MODULE_PROC_FAMILY "K10 " +#elif defined CONFIG_MBARCELONA +#define MODULE_PROC_FAMILY "BARCELONA " +#elif defined
CONFIG_MBOBCAT +#define MODULE_PROC_FAMILY "BOBCAT " +#elif defined CONFIG_MBULLDOZER +#define MODULE_PROC_FAMILY "BULLDOZER " +#elif defined CONFIG_MPILEDRIVER +#define MODULE_PROC_FAMILY "PILEDRIVER " +#elif defined CONFIG_MSTEAMROLLER +#define MODULE_PROC_FAMILY "STEAMROLLER " +#elif defined CONFIG_MJAGUAR +#define MODULE_PROC_FAMILY "JAGUAR " +#elif defined CONFIG_MEXCAVATOR +#define MODULE_PROC_FAMILY "EXCAVATOR " +#elif defined CONFIG_MZEN +#define MODULE_PROC_FAMILY "ZEN " +#elif defined CONFIG_MZEN2 +#define MODULE_PROC_FAMILY "ZEN2 " +#elif defined CONFIG_MZEN3 +#define MODULE_PROC_FAMILY "ZEN3 " +#elif defined CONFIG_MZEN4 +#define MODULE_PROC_FAMILY "ZEN4 " #elif defined CONFIG_MELAN #define MODULE_PROC_FAMILY "ELAN " #elif defined CONFIG_MCRUSOE From 440587a5413cbbd51cad5fedebe6ce477488a0c5 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 12 Dec 2022 11:06:37 +0100 Subject: [PATCH 06/37] Allow O3 Signed-off-by: Peter Jung --- init/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/init/Kconfig b/init/Kconfig index c23c83d912c629..89a3e52edcb9f9 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1436,7 +1436,6 @@ config CC_OPTIMIZE_FOR_PERFORMANCE config CC_OPTIMIZE_FOR_PERFORMANCE_O3 bool "Optimize more for performance (-O3)" - depends on ARC help Choosing this option will pass "-O3" to your compiler to optimize the kernel yet more for performance. From eef4465102fd7b86e21eb3101d7355a214f588d0 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Thu, 3 Nov 2022 11:48:27 +0100 Subject: [PATCH 07/37] THP Shrinker From: Alexander Zhu https://lore.kernel.org/linux-mm/cover.1667454613.git.alexlzhu@fb.com/T/#t Changelog: v5 to v6 -removed PageSwapCache check from add_underutilized_thp as split_huge_page takes care of this already. -added check for PageHuge in add_underutilized_thp to account for hugetlbfs pages. -added Yu Zhao as author for the second patch v4 to v5 -split out split_huge_page changes into three different patches. One for zapping zero pages, one for not remapping zero pages, and one for self tests. -fixed bug with lru_to_folio, was corrupting the folio -fixed bug with memchr_inv in mm/thp_utilization. zero page should mean !memchr_inv(kaddr, 0, PAGE_SIZE) v3 to v4 -changed thp_utilization_bucket() function to take folios, saves conversion between page and folio -added newlines where they were previously missing in v2-v3 -moved the thp utilization code out into its own file under mm/thp_utilization.c -removed is_anonymous_transparent_hugepage function. Use folio_test_anon and folio_test_trans_huge instead. -changed thp_number_utilized_pages to use memchr_inv -added some comments regarding trylock -change the relock to be unconditional in low_util_free_page -only expose can_shrink_thp, abstract the thp_utilization and bucket logic to be private to mm/thp_utilization.c v2 to v3 -put_page() after trylock_page in low_util_free_page. put() to be called after get() call -removed spin_unlock_irq in low_util_free_page above LRU_SKIP. There was a double unlock. -moved spin_unlock_irq() to below list_lru_isolate() in low_util_free_page. This is to shorten the critical section. -moved lock_page in add_underutilized_thp such that we only lock when allocating and adding to the list_lru -removed list_lru_alloc in list_lru_add_page and list_lru_delete_page as these are no longer needed.
v1 to v2 -reversed ordering of is_transparent_hugepage and PageAnon in is_anon_transparent_hugepage, page->mapping is only meaningful for user pages -only trigger the unmap_clean/zap in split_huge_page on anonymous THPs. We cannot zap zero pages for file THPs. -modified split_huge_page self test based off more recent changes. -Changed lru_lock to be irq safe. Added irq_save and restore around list_lru adds/deletes. -Changed low_util_free_page() to trylock the page, and if it fails, unlock lru_lock and return LRU_SKIP. This is to avoid deadlock between reclaim, which calls split_huge_page() and the THP Shrinker -Changed low_util_free_page() to unlock lru_lock, split_huge_page, then lock lru_lock. This way split_huge_page is not called with the lru_lock held. That leads to deadlock as split_huge_page calls on_each_cpu_mask -Changed list_lru_shrink_walk to list_lru_shrink_walk_irq. RFC to v1 -refactored out the code to obtain the thp_utilization_bucket, as that now has to be used in multiple places. -added support to map to the read only zero page when splitting a THP registered with userfaultfd. -added a self test to verify that userfaultfd change is working. -Remove all THPs that are not in the top utilization bucket. This is what we have found to perform the best in production testing, we have found that there are an almost trivial number of THPs in the middle range of buckets that account for most of the memory waste. -Added check for THP utilization prior to split_huge_page for the THP Shrinker. This is to account for THPs that move to the top bucket, but were underutilized at the time they were added to the list_lru. -Multiply the shrink_count and scan_count by HPAGE_PMD_NR. This is because a THP is 512 pages, and should count as 512 objects in reclaim. This way reclaim is triggered at a more appropriate frequency than in the RFC. Transparent Hugepages use a larger page size of 2MB in comparison to normal sized pages that are 4kb. A larger page size allows for fewer TLB cache misses and thus more efficient use of the CPU. Using a larger page size also results in more memory waste, which can hurt performance in some use cases. THPs are currently enabled in the Linux Kernel by applications in limited virtual address ranges via the madvise system call. The THP shrinker tries to find a balance between increased use of THPs, and increased use of memory. It shrinks the size of memory by removing the underutilized THPs that are identified by the thp_utilization scanner. In our experiments we have noticed that the least utilized THPs are almost entirely unutilized. Sample Output: Utilized[0-50]: 1331 680884 Utilized[51-101]: 9 3983 Utilized[102-152]: 3 1187 Utilized[153-203]: 0 0 Utilized[204-255]: 2 539 Utilized[256-306]: 5 1135 Utilized[307-357]: 1 192 Utilized[358-408]: 0 0 Utilized[409-459]: 1 57 Utilized[460-512]: 400 13 Last Scan Time: 223.98s Last Scan Duration: 70.65s Above is a sample obtained from one of our test machines when THP is always enabled. Of the 1331 THPs in this thp_utilization sample that have from 0-50 utilized subpages, we see that there are 680884 free pages. This comes out to 680884 / (512 * 1331) = 99.91% zero pages in the least utilized bucket. This represents 680884 * 4KB = 2.7GB memory waste. Also note that the vast majority of pages are either in the least utilized [0-50] or most utilized [460-512] buckets. The least utilized THPs are responsible for almost all of the memory waste when THP is always enabled. 
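As a concrete illustration of the counting described above, here is a minimal sketch; the helper name is made up for illustration, while the real logic lives in thp_number_utilized_pages()/thp_utilization_bucket() in mm/thp_utilization.c. A 4kb subpage counts as utilized iff memchr_inv() finds a nonzero byte in it, the same test mentioned in the v4 to v5 changelog above.

static int thp_utilized_subpages_sketch(struct page *head)
{
	int i, utilized = 0;

	/* Walk all HPAGE_PMD_NR (512) 4kb subpages of the THP. */
	for (i = 0; i < HPAGE_PMD_NR; i++) {
		void *kaddr = kmap_local_page(head + i);

		/* memchr_inv() returns NULL iff the subpage is all zeroes. */
		if (memchr_inv(kaddr, 0, PAGE_SIZE))
			utilized++;
		kunmap_local(kaddr);
	}

	/* The scanner buckets THPs by this count, e.g. [0-50] ... [460-512]. */
	return utilized;
}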
Thus by clearing out THPs in the lowest utilization bucket we extract most of the improvement in CPU efficiency. We have seen similar results on our production hosts. This patchset introduces the THP shrinker we have developed to identify and split the least utilized THPs. It includes the thp_utilization changes that group anonymous THPs into buckets, the split_huge_page() changes that identify and zap zero 4KB pages within THPs and the shrinker changes. It should be noted that the split_huge_page() changes are based off previous work done by Yu Zhao. In the future, we intend to allow additional tuning to the shrinker based on workload depending on CPU/IO/Memory pressure and the amount of anonymous memory. The long term goal is to eventually always enable THP for all applications and deprecate madvise entirely. In production we thus far have observed 2-3% reduction in overall cpu usage on stateless web servers when THP is always enabled. Alexander Zhu (4): mm: add thp_utilization metrics to debugfs mm: do not remap clean subpages when splitting isolated thp mm: add selftests to split_huge_page() to verify unmap/zap of zero pages mm: THP low utilization shrinker Yu Zhao (1): mm: changes to split_huge_page() to free zero filled tail pages Signed-off-by: Peter Jung --- Documentation/admin-guide/mm/transhuge.rst | 9 + include/linux/huge_mm.h | 9 + include/linux/list_lru.h | 24 ++ include/linux/mm_types.h | 5 + include/linux/rmap.h | 2 +- include/linux/vm_event_item.h | 3 + mm/Makefile | 2 +- mm/huge_memory.c | 156 +++++++++++- mm/list_lru.c | 49 ++++ mm/migrate.c | 73 +++++- mm/migrate_device.c | 4 +- mm/page_alloc.c | 6 + mm/thp_utilization.c | 222 ++++++++++++++++++ mm/vmstat.c | 3 + .../selftests/vm/split_huge_page_test.c | 115 ++++++++- tools/testing/selftests/vm/vm_util.c | 23 ++ tools/testing/selftests/vm/vm_util.h | 3 + 17 files changed, 690 insertions(+), 18 deletions(-) create mode 100644 mm/thp_utilization.c diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 8ee78ec232ebcf..21d86303c97ef7 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -304,6 +304,15 @@ To identify what applications are mapping file transparent huge pages, it is necessary to read ``/proc/PID/smaps`` and count the FileHugeMapped fields for each mapping. +The utilization of transparent hugepages can be viewed by reading +``/sys/kernel/debug/thp_utilization``. The utilization of a THP is defined +as the ratio of non zero filled 4kb pages to the total number of pages in a +THP. The buckets are labelled by the range of total utilized 4kb pages with +one line per utilization bucket. Each line contains the total number of +THPs in that bucket and the total number of zero filled 4kb pages summed +over all THPs in that bucket. The last two lines show the timestamp and +duration respectively of the most recent scan over all of physical memory. + Note that reading the smaps file is expensive and reading it frequently will incur overhead.
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index a1341fdcf666d0..1745c94eb1039b 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -178,6 +178,8 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
 unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
 		unsigned long len, unsigned long pgoff, unsigned long flags);
 
+bool can_shrink_thp(struct folio *folio);
+
 void prep_transhuge_page(struct page *page);
 void free_transhuge_page(struct page *page);
 
@@ -189,6 +191,8 @@ static inline int split_huge_page(struct page *page)
 }
 void deferred_split_huge_page(struct page *page);
 
+void add_underutilized_thp(struct page *page);
+
 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long address, bool freeze, struct folio *folio);
 
@@ -302,6 +306,11 @@ static inline struct list_head *page_deferred_list(struct page *page)
 	return &page[2].deferred_list;
 }
 
+static inline struct list_head *page_underutilized_thp_list(struct page *page)
+{
+	return &page[3].underutilized_thp_list;
+}
+
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
 #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
 #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index b35968ee9fb508..c2cf146ea88002 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -89,6 +89,18 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren
  */
 bool list_lru_add(struct list_lru *lru, struct list_head *item);
 
+/**
+ * list_lru_add_page: add an element to the lru list's tail
+ * @list_lru: the lru pointer
+ * @page: the page containing the item
+ * @item: the item to be added.
+ *
+ * This function works the same as list_lru_add in terms of list
+ * manipulation. Used for non-slab objects contained in the page.
+ *
+ * Return value: true if the list was updated, false otherwise
+ */
+bool list_lru_add_page(struct list_lru *lru, struct page *page, struct list_head *item);
 /**
  * list_lru_del: delete an element to the lru list
  * @list_lru: the lru pointer
@@ -102,6 +114,18 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item);
  */
 bool list_lru_del(struct list_lru *lru, struct list_head *item);
 
+/**
+ * list_lru_del_page: delete an element from the lru list
+ * @list_lru: the lru pointer
+ * @page: the page containing the item
+ * @item: the item to be deleted.
+ *
+ * This function works the same as list_lru_del in terms of list
+ * manipulation. Used for non-slab objects contained in the page.
+ *
+ * Return value: true if the list was updated, false otherwise
+ */
+bool list_lru_del_page(struct list_lru *lru, struct page *page, struct list_head *item);
 /**
  * list_lru_count_one: return the number of objects currently held by @lru
  * @lru: the lru pointer.
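A short sketch of how the two page-aware helpers declared above are meant to be paired (the real callers are add_underutilized_thp() and the split/free paths in mm/huge_memory.c below; the wrapper name here is illustrative). Unlike list_lru_add()/list_lru_del(), which locate the node via virt_to_page(item), these take the THP explicitly because the list_head lives in its third tail page:

#include <linux/huge_mm.h>
#include <linux/list_lru.h>

/* Sketch only: intended add/del pairing for the shrinker's list_lru. */
static void example_track_thp(struct list_lru *lru, struct page *thp)
{
	struct list_head *item = page_underutilized_thp_list(thp);

	/* scanner path: queue an underutilized THP for the shrinker */
	list_lru_add_page(lru, thp, item);

	/* split/free path: a non-empty item means it is still queued */
	if (!list_empty(item))
		list_lru_del_page(lru, thp, item);
}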
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 500e536796ca4a..da1d1cf4215871 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -152,6 +152,11 @@ struct page { /* For both global and memcg */ struct list_head deferred_list; }; + struct { /* Third tail page of compound page */ + unsigned long _compound_pad_3; /* compound_head */ + unsigned long _compound_pad_4; + struct list_head underutilized_thp_list; + }; struct { /* Page table pages */ unsigned long _pt_pad_1; /* compound_head */ pgtable_t pmd_huge_pte; /* protected by page->ptl */ diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bd3504d11b1559..3f83bbcf133367 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -428,7 +428,7 @@ int folio_mkclean(struct folio *); int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, struct vm_area_struct *vma); -void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked); +void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked, bool unmap_clean); int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 3518dba1e02f4b..3618b10ddec9ce 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -111,6 +111,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD THP_SPLIT_PUD, #endif + THP_SPLIT_FREE, + THP_SPLIT_UNMAP, + THP_SPLIT_REMAP_READONLY_ZERO_PAGE, THP_ZERO_PAGE_ALLOC, THP_ZERO_PAGE_ALLOC_FAILED, THP_SWPOUT, diff --git a/mm/Makefile b/mm/Makefile index 8e105e5b3e2938..5f76dc6ce044cf 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -95,7 +95,7 @@ obj-$(CONFIG_MEMTEST) += memtest.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_NUMA) += memory-tiers.o obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o -obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o +obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o thp_utilization.o obj-$(CONFIG_PAGE_COUNTER) += page_counter.o obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o ifdef CONFIG_SWAP diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 811d19b5c4f606..9f2f15d8e02ef0 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -71,6 +71,8 @@ static atomic_t huge_zero_refcount; struct page *huge_zero_page __read_mostly; unsigned long huge_zero_pfn __read_mostly = ~0UL; +static struct list_lru huge_low_util_page_lru; + bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, bool smaps, bool in_pf, bool enforce_sysfs) { @@ -234,6 +236,53 @@ static struct shrinker huge_zero_page_shrinker = { .seeks = DEFAULT_SEEKS, }; +static enum lru_status low_util_free_page(struct list_head *item, + struct list_lru_one *lru, + spinlock_t *lru_lock, + void *cb_arg) +{ + struct folio *folio = page_folio(list_entry(item, struct page, underutilized_thp_list)); + struct page *head = &folio->page; + + if (get_page_unless_zero(head)) { + /* Inverse lock order from add_underutilized_thp() */ + if (!trylock_page(head)) { + put_page(head); + return LRU_SKIP; + } + list_lru_isolate(lru, item); + spin_unlock_irq(lru_lock); + if (can_shrink_thp(folio)) + split_huge_page(head); + spin_lock_irq(lru_lock); + unlock_page(head); + put_page(head); + } + + return LRU_REMOVED_RETRY; +} + +static unsigned long shrink_huge_low_util_page_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + return HPAGE_PMD_NR * 
list_lru_shrink_count(&huge_low_util_page_lru, sc); +} + +static unsigned long shrink_huge_low_util_page_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + return HPAGE_PMD_NR * list_lru_shrink_walk_irq(&huge_low_util_page_lru, + sc, low_util_free_page, NULL); +} + +static struct shrinker huge_low_util_page_shrinker = { + .count_objects = shrink_huge_low_util_page_count, + .scan_objects = shrink_huge_low_util_page_scan, + .seeks = DEFAULT_SEEKS, + .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE | + SHRINKER_NONSLAB, +}; + #ifdef CONFIG_SYSFS static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -485,6 +534,9 @@ static int __init hugepage_init(void) if (err) goto err_slab; + err = register_shrinker(&huge_low_util_page_shrinker, "thp-low-util"); + if (err) + goto err_low_util_shrinker; err = register_shrinker(&huge_zero_page_shrinker, "thp-zero"); if (err) goto err_hzp_shrinker; @@ -492,6 +544,9 @@ static int __init hugepage_init(void) if (err) goto err_split_shrinker; + err = list_lru_init_memcg(&huge_low_util_page_lru, &huge_low_util_page_shrinker); + if (err) + goto err_low_util_list_lru; /* * By default disable transparent hugepages on smaller systems, * where the extra memory used could hurt more than TLB overhead @@ -508,10 +563,14 @@ static int __init hugepage_init(void) return 0; err_khugepaged: + list_lru_destroy(&huge_low_util_page_lru); +err_low_util_list_lru: unregister_shrinker(&deferred_split_shrinker); err_split_shrinker: unregister_shrinker(&huge_zero_page_shrinker); err_hzp_shrinker: + unregister_shrinker(&huge_low_util_page_shrinker); +err_low_util_shrinker: khugepaged_destroy(); err_slab: hugepage_exit_sysfs(hugepage_kobj); @@ -586,6 +645,7 @@ void prep_transhuge_page(struct page *page) */ INIT_LIST_HEAD(page_deferred_list(page)); + INIT_LIST_HEAD(page_underutilized_thp_list(page)); set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); } @@ -2376,7 +2436,7 @@ static void unmap_folio(struct folio *folio) try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK); } -static void remap_page(struct folio *folio, unsigned long nr) +static void remap_page(struct folio *folio, unsigned long nr, bool unmap_clean) { int i = 0; @@ -2384,7 +2444,7 @@ static void remap_page(struct folio *folio, unsigned long nr) if (!folio_test_anon(folio)) return; for (;;) { - remove_migration_ptes(folio, folio, true); + remove_migration_ptes(folio, folio, true, unmap_clean); i += folio_nr_pages(folio); if (i >= nr) break; @@ -2454,8 +2514,7 @@ static void __split_huge_page_tail(struct page *head, int tail, LRU_GEN_MASK | LRU_REFS_MASK)); /* ->mapping in first tail page is compound_mapcount */ - VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, - page_tail); + VM_BUG_ON_PAGE(tail > 3 && page_tail->mapping != TAIL_MAPPING, page_tail); page_tail->mapping = head->mapping; page_tail->index = head->index + tail; @@ -2508,6 +2567,8 @@ static void __split_huge_page(struct page *page, struct list_head *list, struct address_space *swap_cache = NULL; unsigned long offset = 0; unsigned int nr = thp_nr_pages(head); + LIST_HEAD(pages_to_free); + int nr_pages_to_free = 0; int i; /* complete memcg works before add pages to LRU */ @@ -2570,7 +2631,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, } local_irq_enable(); - remap_page(folio, nr); + remap_page(folio, nr, PageAnon(head)); if (PageSwapCache(head)) { swp_entry_t entry = { .val = page_private(head) }; @@ -2584,6 +2645,34 @@ static void __split_huge_page(struct page *page, 
struct list_head *list, continue; unlock_page(subpage); + /* + * If a tail page has only two references left, one inherited + * from the isolation of its head and the other from + * lru_add_page_tail() which we are about to drop, it means this + * tail page was concurrently zapped. Then we can safely free it + * and save page reclaim or migration the trouble of trying it. + */ + if (list && page_ref_freeze(subpage, 2)) { + VM_BUG_ON_PAGE(PageLRU(subpage), subpage); + VM_BUG_ON_PAGE(PageCompound(subpage), subpage); + VM_BUG_ON_PAGE(page_mapped(subpage), subpage); + + ClearPageActive(subpage); + ClearPageUnevictable(subpage); + list_move(&subpage->lru, &pages_to_free); + nr_pages_to_free++; + continue; + } + + /* + * If a tail page has only one reference left, it will be freed + * by the call to free_page_and_swap_cache below. Since zero + * subpages are no longer remapped, there will only be one + * reference left in cases outside of reclaim or migration. + */ + if (page_ref_count(subpage) == 1) + nr_pages_to_free++; + /* * Subpages may be freed if there wasn't any mapping * like if add_to_swap() is running on a lru page that @@ -2593,6 +2682,13 @@ static void __split_huge_page(struct page *page, struct list_head *list, */ free_page_and_swap_cache(subpage); } + + if (!nr_pages_to_free) + return; + + mem_cgroup_uncharge_list(&pages_to_free); + free_unref_page_list(&pages_to_free); + count_vm_events(THP_SPLIT_FREE, nr_pages_to_free); } /* Racy check whether the huge page can be split */ @@ -2635,6 +2731,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) struct folio *folio = page_folio(page); struct deferred_split *ds_queue = get_deferred_split_queue(&folio->page); XA_STATE(xas, &folio->mapping->i_pages, folio->index); + struct list_head *underutilized_thp_list = page_underutilized_thp_list(&folio->page); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; int extra_pins, ret; @@ -2742,6 +2839,10 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) list_del(page_deferred_list(&folio->page)); } spin_unlock(&ds_queue->split_queue_lock); + /* Frozen refs lock out additions, test can be lockless */ + if (!list_empty(underutilized_thp_list)) + list_lru_del_page(&huge_low_util_page_lru, &folio->page, + underutilized_thp_list); if (mapping) { int nr = folio_nr_pages(folio); @@ -2764,7 +2865,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) if (mapping) xas_unlock(&xas); local_irq_enable(); - remap_page(folio, folio_nr_pages(folio)); + remap_page(folio, folio_nr_pages(folio), false); ret = -EBUSY; } @@ -2784,6 +2885,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) void free_transhuge_page(struct page *page) { struct deferred_split *ds_queue = get_deferred_split_queue(page); + struct list_head *underutilized_thp_list = page_underutilized_thp_list(page); unsigned long flags; spin_lock_irqsave(&ds_queue->split_queue_lock, flags); @@ -2792,6 +2894,13 @@ void free_transhuge_page(struct page *page) list_del(page_deferred_list(page)); } spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); + /* A dead page cannot be re-added to the THP shrinker, test can be lockless */ + if (!list_empty(underutilized_thp_list)) + list_lru_del_page(&huge_low_util_page_lru, page, underutilized_thp_list); + + if (PageLRU(page)) + __folio_clear_lru_flags(page_folio(page)); + free_compound_page(page); } @@ -2832,6 +2941,41 @@ void deferred_split_huge_page(struct page *page) 
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); } +void add_underutilized_thp(struct page *page) +{ + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + VM_BUG_ON_PAGE(!PageAnon(page), page); + + /* hugetlbfs pages do not have an associated memcgroup */ + if (PageHuge(page)) + return; + + /* + * Need to take a reference on the page to prevent the page from getting free'd from + * under us while we are adding the THP to the shrinker. + */ + if (!get_page_unless_zero(page)) + return; + + if (is_huge_zero_page(page)) + goto out_put; + + /* Stabilize page->memcg to allocate and add to the same list */ + lock_page(page); + +#ifdef CONFIG_MEMCG_KMEM + if (memcg_list_lru_alloc(page_memcg(page), &huge_low_util_page_lru, GFP_KERNEL)) + goto out_unlock; +#endif + + list_lru_add_page(&huge_low_util_page_lru, page, page_underutilized_thp_list(page)); + +out_unlock: + unlock_page(page); +out_put: + put_page(page); +} + static unsigned long deferred_split_count(struct shrinker *shrink, struct shrink_control *sc) { diff --git a/mm/list_lru.c b/mm/list_lru.c index a05e5bef3b4007..8cc56a84b55442 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -140,6 +140,32 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item) } EXPORT_SYMBOL_GPL(list_lru_add); +bool list_lru_add_page(struct list_lru *lru, struct page *page, struct list_head *item) +{ + int nid = page_to_nid(page); + struct list_lru_node *nlru = &lru->node[nid]; + struct list_lru_one *l; + struct mem_cgroup *memcg; + unsigned long flags; + + spin_lock_irqsave(&nlru->lock, flags); + if (list_empty(item)) { + memcg = page_memcg(page); + l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); + list_add_tail(item, &l->list); + /* Set shrinker bit if the first element was added */ + if (!l->nr_items++) + set_shrinker_bit(memcg, nid, + lru_shrinker_id(lru)); + nlru->nr_items++; + spin_unlock_irqrestore(&nlru->lock, flags); + return true; + } + spin_unlock_irqrestore(&nlru->lock, flags); + return false; +} +EXPORT_SYMBOL_GPL(list_lru_add_page); + bool list_lru_del(struct list_lru *lru, struct list_head *item) { int nid = page_to_nid(virt_to_page(item)); @@ -160,6 +186,29 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) } EXPORT_SYMBOL_GPL(list_lru_del); +bool list_lru_del_page(struct list_lru *lru, struct page *page, struct list_head *item) +{ + int nid = page_to_nid(page); + struct list_lru_node *nlru = &lru->node[nid]; + struct list_lru_one *l; + struct mem_cgroup *memcg; + unsigned long flags; + + spin_lock_irqsave(&nlru->lock, flags); + if (!list_empty(item)) { + memcg = page_memcg(page); + l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); + list_del_init(item); + l->nr_items--; + nlru->nr_items--; + spin_unlock_irqrestore(&nlru->lock, flags); + return true; + } + spin_unlock_irqrestore(&nlru->lock, flags); + return false; +} +EXPORT_SYMBOL_GPL(list_lru_del_page); + void list_lru_isolate(struct list_lru_one *list, struct list_head *item) { list_del_init(item); diff --git a/mm/migrate.c b/mm/migrate.c index dff333593a8ae2..2764b14d338374 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -168,13 +169,62 @@ void putback_movable_pages(struct list_head *l) } } +static bool try_to_unmap_clean(struct page_vma_mapped_walk *pvmw, struct page *page) +{ + void *addr; + bool dirty; + pte_t newpte; + + VM_BUG_ON_PAGE(PageCompound(page), page); + VM_BUG_ON_PAGE(!PageAnon(page), page); + VM_BUG_ON_PAGE(!PageLocked(page), page); + 
VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page); + + if (PageMlocked(page) || (pvmw->vma->vm_flags & VM_LOCKED)) + return false; + + /* + * The pmd entry mapping the old thp was flushed and the pte mapping + * this subpage has been non present. Therefore, this subpage is + * inaccessible. We don't need to remap it if it contains only zeros. + */ + addr = kmap_local_page(page); + dirty = memchr_inv(addr, 0, PAGE_SIZE); + kunmap_local(addr); + + if (dirty) + return false; + + pte_clear_not_present_full(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, false); + + if (userfaultfd_armed(pvmw->vma)) { + newpte = pte_mkspecial(pfn_pte(page_to_pfn(ZERO_PAGE(pvmw->address)), + pvmw->vma->vm_page_prot)); + ptep_clear_flush(pvmw->vma, pvmw->address, pvmw->pte); + set_pte_at(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, newpte); + dec_mm_counter(pvmw->vma->vm_mm, MM_ANONPAGES); + count_vm_event(THP_SPLIT_REMAP_READONLY_ZERO_PAGE); + return true; + } + + dec_mm_counter(pvmw->vma->vm_mm, mm_counter(page)); + count_vm_event(THP_SPLIT_UNMAP); + return true; +} + +struct rmap_walk_arg { + struct folio *folio; + bool unmap_clean; +}; + /* * Restore a potential migration pte to a working pte entry */ static bool remove_migration_pte(struct folio *folio, - struct vm_area_struct *vma, unsigned long addr, void *old) + struct vm_area_struct *vma, unsigned long addr, void *arg) { - DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION); + struct rmap_walk_arg *rmap_walk_arg = arg; + DEFINE_FOLIO_VMA_WALK(pvmw, rmap_walk_arg->folio, vma, addr, PVMW_SYNC | PVMW_MIGRATION); while (page_vma_mapped_walk(&pvmw)) { rmap_t rmap_flags = RMAP_NONE; @@ -197,6 +247,8 @@ static bool remove_migration_pte(struct folio *folio, continue; } #endif + if (rmap_walk_arg->unmap_clean && try_to_unmap_clean(&pvmw, new)) + continue; folio_get(folio); pte = mk_pte(new, READ_ONCE(vma->vm_page_prot)); @@ -272,13 +324,20 @@ static bool remove_migration_pte(struct folio *folio, * Get rid of all migration entries and replace them by * references to the indicated page. */ -void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked) +void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked, bool unmap_clean) { + struct rmap_walk_arg rmap_walk_arg = { + .folio = src, + .unmap_clean = unmap_clean, + }; + struct rmap_walk_control rwc = { .rmap_one = remove_migration_pte, - .arg = src, + .arg = &rmap_walk_arg, }; + VM_BUG_ON_FOLIO(unmap_clean && src != dst, src); + if (locked) rmap_walk_locked(dst, &rwc); else @@ -872,7 +931,7 @@ static int writeout(struct address_space *mapping, struct folio *folio) * At this point we know that the migration attempt cannot * be successful. */ - remove_migration_ptes(folio, folio, false); + remove_migration_ptes(folio, folio, false, false); rc = mapping->a_ops->writepage(&folio->page, &wbc); @@ -1128,7 +1187,7 @@ static int __unmap_and_move(struct folio *src, struct folio *dst, if (page_was_mapped) remove_migration_ptes(src, - rc == MIGRATEPAGE_SUCCESS ? dst : src, false); + rc == MIGRATEPAGE_SUCCESS ? dst : src, false, false); out_unlock_both: folio_unlock(dst); @@ -1338,7 +1397,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (page_was_mapped) remove_migration_ptes(src, - rc == MIGRATEPAGE_SUCCESS ? dst : src, false); + rc == MIGRATEPAGE_SUCCESS ? 
dst : src, false, false);
 
 unlock_put_anon:
 	folio_unlock(dst);
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 721b2365dbca96..59e7d571d91f07 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -425,7 +425,7 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
 			continue;
 
 		folio = page_folio(page);
-		remove_migration_ptes(folio, folio, false);
+		remove_migration_ptes(folio, folio, false, false);
 
 		src_pfns[i] = 0;
 		folio_unlock(folio);
@@ -851,7 +851,7 @@ void migrate_device_finalize(unsigned long *src_pfns,
 		src = page_folio(page);
 		dst = page_folio(newpage);
-		remove_migration_ptes(src, dst, false);
+		remove_migration_ptes(src, dst, false, false);
 
 		folio_unlock(src);
 		if (is_zone_device_page(page))
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6e60657875d328..70e9bfd76e040a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1336,6 +1336,12 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
 		 * deferred_list.next -- ignore value.
 		 */
 		break;
+	case 3:
+		/*
+		 * the third tail page: ->mapping is
+		 * underutilized_thp_list.next -- ignore value.
+		 */
+		break;
 	default:
 		if (page->mapping != TAIL_MAPPING) {
 			bad_page(page, "corrupted mapping in tail page");
diff --git a/mm/thp_utilization.c b/mm/thp_utilization.c
new file mode 100644
index 00000000000000..0cb18f122c5763
--- /dev/null
+++ b/mm/thp_utilization.c
@@ -0,0 +1,222 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2022 Meta, Inc.
+ * Authors: Alexander Zhu, Johannes Weiner, Rik van Riel
+ */
+
+#include
+#include
+#include
+/*
+ * The number of utilization buckets THPs will be grouped in
+ * under /sys/kernel/debug/thp_utilization.
+ */
+#define THP_UTIL_BUCKET_NR 10
+/*
+ * The number of hugepages to scan through on each periodic
+ * run of the scanner that generates /sys/kernel/debug/thp_utilization.
+ */
+#define THP_UTIL_SCAN_SIZE 256
+
+static void thp_utilization_workfn(struct work_struct *work);
+static DECLARE_DELAYED_WORK(thp_utilization_work, thp_utilization_workfn);
+
+struct thp_scan_info_bucket {
+	int nr_thps;
+	int nr_zero_pages;
+};
+
+struct thp_scan_info {
+	struct thp_scan_info_bucket buckets[THP_UTIL_BUCKET_NR];
+	struct zone *scan_zone;
+	struct timespec64 last_scan_duration;
+	struct timespec64 last_scan_time;
+	unsigned long pfn;
+};
+
+/*
+ * thp_scan_debugfs is referred to when /sys/kernel/debug/thp_utilization
+ * is opened. thp_scan is used to keep track of the current scan through
+ * physical memory.
+ */
+static struct thp_scan_info thp_scan_debugfs;
+static struct thp_scan_info thp_scan;
+
+#ifdef CONFIG_DEBUG_FS
+static int thp_utilization_show(struct seq_file *seqf, void *pos)
+{
+	int i;
+	int start;
+	int end;
+
+	for (i = 0; i < THP_UTIL_BUCKET_NR; i++) {
+		start = i * HPAGE_PMD_NR / THP_UTIL_BUCKET_NR;
+		end = (i + 1 == THP_UTIL_BUCKET_NR) ?
HPAGE_PMD_NR + : ((i + 1) * HPAGE_PMD_NR / THP_UTIL_BUCKET_NR - 1); + /* The last bucket will need to contain 100 */ + seq_printf(seqf, "Utilized[%d-%d]: %d %d\n", start, end, + thp_scan_debugfs.buckets[i].nr_thps, + thp_scan_debugfs.buckets[i].nr_zero_pages); + } + + seq_printf(seqf, "Last Scan Time: %lu.%02lus\n", + (unsigned long)thp_scan_debugfs.last_scan_time.tv_sec, + (thp_scan_debugfs.last_scan_time.tv_nsec / (NSEC_PER_SEC / 100))); + + seq_printf(seqf, "Last Scan Duration: %lu.%02lus\n", + (unsigned long)thp_scan_debugfs.last_scan_duration.tv_sec, + (thp_scan_debugfs.last_scan_duration.tv_nsec / (NSEC_PER_SEC / 100))); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(thp_utilization); + +static int __init thp_utilization_debugfs(void) +{ + debugfs_create_file("thp_utilization", 0200, NULL, NULL, + &thp_utilization_fops); + return 0; +} +late_initcall(thp_utilization_debugfs); +#endif + +static int thp_utilization_bucket(int num_utilized_pages) +{ + int bucket; + + if (num_utilized_pages < 0 || num_utilized_pages > HPAGE_PMD_NR) + return -1; + + /* Group THPs into utilization buckets */ + bucket = num_utilized_pages * THP_UTIL_BUCKET_NR / HPAGE_PMD_NR; + return min(bucket, THP_UTIL_BUCKET_NR - 1); +} + +static int thp_number_utilized_pages(struct folio *folio) +{ + int thp_nr_utilized_pages = HPAGE_PMD_NR; + void *kaddr; + int i; + bool zero_page; + + if (!folio || !folio_test_anon(folio) || !folio_test_transhuge(folio)) + return -1; + + for (i = 0; i < folio_nr_pages(folio); i++) { + kaddr = kmap_local_folio(folio, i); + zero_page = !memchr_inv(kaddr, 0, PAGE_SIZE); + + if (zero_page) + thp_nr_utilized_pages--; + + kunmap_local(kaddr); + } + + return thp_nr_utilized_pages; +} + +bool can_shrink_thp(struct folio *folio) +{ + int bucket, num_utilized_pages; + + if (!folio || !folio_test_anon(folio) || !folio_test_transhuge(folio)) + return false; + + num_utilized_pages = thp_number_utilized_pages(folio); + bucket = thp_utilization_bucket(num_utilized_pages); + + return bucket >= 0 && bucket < THP_UTIL_BUCKET_NR - 1; +} + +static void thp_scan_next_zone(void) +{ + struct timespec64 current_time; + bool update_debugfs; + /* + * THP utilization worker thread has reached the end + * of the memory zone. Proceed to the next zone. + */ + thp_scan.scan_zone = next_zone(thp_scan.scan_zone); + update_debugfs = !thp_scan.scan_zone; + thp_scan.scan_zone = update_debugfs ? (first_online_pgdat())->node_zones + : thp_scan.scan_zone; + thp_scan.pfn = (thp_scan.scan_zone->zone_start_pfn + HPAGE_PMD_NR - 1) + & ~(HPAGE_PMD_SIZE - 1); + if (!update_debugfs) + return; + + /* + * If the worker has scanned through all of physical memory then + * update information displayed in /sys/kernel/debug/thp_utilization + */ + ktime_get_ts64(¤t_time); + thp_scan_debugfs.last_scan_duration = timespec64_sub(current_time, + thp_scan_debugfs.last_scan_time); + thp_scan_debugfs.last_scan_time = current_time; + + memcpy(&thp_scan_debugfs.buckets, &thp_scan.buckets, sizeof(thp_scan.buckets)); + memset(&thp_scan.buckets, 0, sizeof(thp_scan.buckets)); +} + +static void thp_util_scan(unsigned long pfn_end) +{ + struct page *page = NULL; + int bucket, current_pfn, num_utilized_pages; + int i; + /* + * Scan through each memory zone in chunks of THP_UTIL_SCAN_SIZE + * PFNs every second looking for anonymous THPs. 
+ */ + for (i = 0; i < THP_UTIL_SCAN_SIZE; i++) { + current_pfn = thp_scan.pfn; + thp_scan.pfn += HPAGE_PMD_NR; + if (current_pfn >= pfn_end) + return; + + page = pfn_to_online_page(current_pfn); + if (!page) + continue; + + num_utilized_pages = thp_number_utilized_pages(page_folio(page)); + bucket = thp_utilization_bucket(num_utilized_pages); + if (bucket < 0) + continue; + + if (bucket < THP_UTIL_BUCKET_NR - 1) + add_underutilized_thp(page); + + thp_scan.buckets[bucket].nr_thps++; + thp_scan.buckets[bucket].nr_zero_pages += (HPAGE_PMD_NR - num_utilized_pages); + } +} + +static void thp_utilization_workfn(struct work_struct *work) +{ + unsigned long pfn_end; + /* + * Worker function that scans through all of physical memory + * for anonymous THPs. + */ + if (!thp_scan.scan_zone) + thp_scan.scan_zone = (first_online_pgdat())->node_zones; + + pfn_end = zone_end_pfn(thp_scan.scan_zone); + /* If we have reached the end of the zone or end of physical memory + * move on to the next zone. Otherwise, scan the next PFNs in the + * current zone. + */ + if (!managed_zone(thp_scan.scan_zone) || thp_scan.pfn >= pfn_end) + thp_scan_next_zone(); + else + thp_util_scan(pfn_end); + + schedule_delayed_work(&thp_utilization_work, HZ); +} + +static int __init thp_scan_init(void) +{ + schedule_delayed_work(&thp_utilization_work, HZ); + return 0; +} +subsys_initcall(thp_scan_init); diff --git a/mm/vmstat.c b/mm/vmstat.c index b2371d745e007f..3d802eb6754d0b 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1359,6 +1359,9 @@ const char * const vmstat_text[] = { #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD "thp_split_pud", #endif + "thp_split_free", + "thp_split_unmap", + "thp_split_remap_readonly_zero_page", "thp_zero_page_alloc", "thp_zero_page_alloc_failed", "thp_swpout", diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c index 76e1c36dd9e577..42f0e79a4508de 100644 --- a/tools/testing/selftests/vm/split_huge_page_test.c +++ b/tools/testing/selftests/vm/split_huge_page_test.c @@ -16,6 +16,9 @@ #include #include #include +#include /* Definition of SYS_* constants */ +#include +#include #include "vm_util.h" uint64_t pagesize; @@ -88,6 +91,115 @@ static void write_debugfs(const char *fmt, ...) } } +static char *allocate_zero_filled_hugepage(size_t len) +{ + char *result; + size_t i; + + result = memalign(pmd_pagesize, len); + if (!result) { + printf("Fail to allocate memory\n"); + exit(EXIT_FAILURE); + } + + madvise(result, len, MADV_HUGEPAGE); + + for (i = 0; i < len; i++) + result[i] = (char)0; + + return result; +} + +static void verify_rss_anon_split_huge_page_all_zeroes(char *one_page, int nr_hpages, size_t len) +{ + uint64_t rss_anon_before, rss_anon_after; + size_t i; + + if (!check_huge_anon(one_page, 4, pmd_pagesize)) { + printf("No THP is allocated\n"); + exit(EXIT_FAILURE); + } + + rss_anon_before = rss_anon(); + if (!rss_anon_before) { + printf("No RssAnon is allocated before split\n"); + exit(EXIT_FAILURE); + } + + /* split all THPs */ + write_debugfs(PID_FMT, getpid(), (uint64_t)one_page, + (uint64_t)one_page + len); + + for (i = 0; i < len; i++) + if (one_page[i] != (char)0) { + printf("%ld byte corrupted\n", i); + exit(EXIT_FAILURE); + } + + if (!check_huge_anon(one_page, 0, pmd_pagesize)) { + printf("Still AnonHugePages not split\n"); + exit(EXIT_FAILURE); + } + + rss_anon_after = rss_anon(); + if (rss_anon_after >= rss_anon_before) { + printf("Incorrect RssAnon value. 
Before: %ld After: %ld\n", + rss_anon_before, rss_anon_after); + exit(EXIT_FAILURE); + } +} + +void split_pmd_zero_pages(void) +{ + char *one_page; + int nr_hpages = 4; + size_t len = nr_hpages * pmd_pagesize; + + one_page = allocate_zero_filled_hugepage(len); + verify_rss_anon_split_huge_page_all_zeroes(one_page, nr_hpages, len); + printf("Split zero filled huge pages successful\n"); + free(one_page); +} + +void split_pmd_zero_pages_uffd(void) +{ + char *one_page; + int nr_hpages = 4; + size_t len = nr_hpages * pmd_pagesize; + long uffd; /* userfaultfd file descriptor */ + struct uffdio_api uffdio_api; + struct uffdio_register uffdio_register; + + /* Create and enable userfaultfd object. */ + + uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + if (uffd == -1) { + perror("userfaultfd"); + exit(1); + } + + uffdio_api.api = UFFD_API; + uffdio_api.features = 0; + if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) { + perror("ioctl-UFFDIO_API"); + exit(1); + } + + one_page = allocate_zero_filled_hugepage(len); + + uffdio_register.range.start = (unsigned long)one_page; + uffdio_register.range.len = len; + uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) { + perror("ioctl-UFFDIO_REGISTER"); + exit(1); + } + + verify_rss_anon_split_huge_page_all_zeroes(one_page, nr_hpages, len); + printf("Split zero filled huge pages with uffd successful\n"); + free(one_page); +} + void split_pmd_thp(void) { char *one_page; @@ -121,7 +233,6 @@ void split_pmd_thp(void) exit(EXIT_FAILURE); } - if (check_huge_anon(one_page, 0, pmd_pagesize)) { printf("Still AnonHugePages not split\n"); exit(EXIT_FAILURE); @@ -301,6 +412,8 @@ int main(int argc, char **argv) pageshift = ffs(pagesize) - 1; pmd_pagesize = read_pmd_pagesize(); + split_pmd_zero_pages(); + split_pmd_zero_pages_uffd(); split_pmd_thp(); split_pte_mapped_thp(); split_file_backed_thp(); diff --git a/tools/testing/selftests/vm/vm_util.c b/tools/testing/selftests/vm/vm_util.c index f11f8adda52186..72f3edc64aaf9a 100644 --- a/tools/testing/selftests/vm/vm_util.c +++ b/tools/testing/selftests/vm/vm_util.c @@ -6,6 +6,7 @@ #define PMD_SIZE_FILE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size" #define SMAP_FILE_PATH "/proc/self/smaps" +#define STATUS_FILE_PATH "/proc/self/status" #define MAX_LINE_LENGTH 500 uint64_t pagemap_get_entry(int fd, char *start) @@ -72,6 +73,28 @@ uint64_t read_pmd_pagesize(void) return strtoul(buf, NULL, 10); } +uint64_t rss_anon(void) +{ + uint64_t rss_anon = 0; + int ret; + FILE *fp; + char buffer[MAX_LINE_LENGTH]; + + fp = fopen(STATUS_FILE_PATH, "r"); + if (!fp) + ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, STATUS_FILE_PATH); + + if (!check_for_pattern(fp, "RssAnon:", buffer, sizeof(buffer))) + goto err_out; + + if (sscanf(buffer, "RssAnon:%10ld kB", &rss_anon) != 1) + ksft_exit_fail_msg("Reading status error\n"); + +err_out: + fclose(fp); + return rss_anon; +} + bool __check_huge(void *addr, char *pattern, int nr_hpages, uint64_t hpage_size) { diff --git a/tools/testing/selftests/vm/vm_util.h b/tools/testing/selftests/vm/vm_util.h index 5c35de454e08f3..dd1885f6609716 100644 --- a/tools/testing/selftests/vm/vm_util.h +++ b/tools/testing/selftests/vm/vm_util.h @@ -1,12 +1,15 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include #include +#include +#include uint64_t pagemap_get_entry(int fd, char *start); bool pagemap_is_softdirty(int fd, char *start); void clear_softdirty(void); bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t 
len); uint64_t read_pmd_pagesize(void); +uint64_t rss_anon(void); bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size); bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size); bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size); From 05f84814b69e71e6555a85641f0e483caf0e5389 Mon Sep 17 00:00:00 2001 From: Alexandre Frade Date: Wed, 5 Oct 2022 03:46:34 +0000 Subject: [PATCH 08/37] XANMOD: rcu: Change sched_setscheduler_nocheck() calls to SCHED_RR policy Signed-off-by: Alexandre Frade --- Documentation/admin-guide/kernel-parameters.txt | 2 +- kernel/rcu/Kconfig | 4 ++-- kernel/rcu/rcutorture.c | 2 +- kernel/rcu/tree.c | 6 +++--- kernel/rcu/tree_nocb.h | 4 ++-- kernel/rcu/tree_plugin.h | 4 ++-- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 42af9ca0127e52..e3d0acb92612bc 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4720,7 +4720,7 @@ overwritten. rcutree.kthread_prio= [KNL,BOOT] - Set the SCHED_FIFO priority of the RCU per-CPU + Set the SCHED_RR priority of the RCU per-CPU kthreads (rcuc/N). This value is also used for the priority of the RCU boost threads (rcub/N) and for the RCU grace-period kthreads (rcu_bh, diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index d471d22a5e21b4..e945e522969e07 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -282,9 +282,9 @@ config RCU_NOCB_CPU_CB_BOOST depends on RCU_NOCB_CPU && RCU_BOOST default y if PREEMPT_RT help - Use this option to invoke offloaded callbacks as SCHED_FIFO + Use this option to invoke offloaded callbacks as SCHED_RR to avoid starvation by heavy SCHED_OTHER background load. - Of course, running as SCHED_FIFO during callback floods will + Of course, running as SCHED_RR during callback floods will cause the rcuo[ps] kthreads to monopolize the CPU for hundreds of milliseconds or more. Therefore, when enabling this option, it is your responsibility to ensure that latency-sensitive diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 503c2aa845a4a6..b58e64ac60800e 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2362,7 +2362,7 @@ static int rcutorture_booster_init(unsigned int cpu) t = per_cpu(ksoftirqd, cpu); WARN_ON_ONCE(!t); sp.sched_priority = 2; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + sched_setscheduler_nocheck(t, SCHED_RR, &sp); } /* Don't allow time recalculation while creating a new task. 
*/ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 93416afebd59c7..85c0b1c734bd86 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4401,8 +4401,8 @@ static void __init rcu_start_exp_gp_kworkers(void) return; } - sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, ¶m); - sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO, + sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_RR, ¶m); + sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_RR, ¶m); } @@ -4440,7 +4440,7 @@ static int __init rcu_spawn_gp_kthread(void) return 0; if (kthread_prio) { sp.sched_priority = kthread_prio; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + sched_setscheduler_nocheck(t, SCHED_RR, &sp); } rnp = rcu_get_root(); raw_spin_lock_irqsave_rcu_node(rnp, flags); diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 0a5f0ef4148451..035c5fe003702d 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1319,7 +1319,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) } WRITE_ONCE(rdp_gp->nocb_gp_kthread, t); if (kthread_prio) - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + sched_setscheduler_nocheck(t, SCHED_RR, &sp); } mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); @@ -1330,7 +1330,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) goto end; if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_CB_BOOST) && kthread_prio) - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + sched_setscheduler_nocheck(t, SCHED_RR, &sp); WRITE_ONCE(rdp->nocb_cb_kthread, t); WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e3142ee35fc6ac..f543a01cdd5828 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1007,7 +1007,7 @@ static void rcu_cpu_kthread_setup(unsigned int cpu) struct sched_param sp; sp.sched_priority = kthread_prio; - sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); + sched_setscheduler_nocheck(current, SCHED_RR, &sp); #endif /* #ifdef CONFIG_RCU_BOOST */ WRITE_ONCE(rdp->rcuc_activity, jiffies); @@ -1206,7 +1206,7 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) rnp->boost_kthread_task = t; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); sp.sched_priority = kthread_prio; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + sched_setscheduler_nocheck(t, SCHED_RR, &sp); wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ out: From fed1201b45a2dea27d4afddf3b2f94a95b052b3d Mon Sep 17 00:00:00 2001 From: Steven Barrett Date: Sat, 30 Jan 2021 11:34:18 -0600 Subject: [PATCH 09/37] ZEN: Add ACS override support Source: https://gitlab.com/Queuecumber/linux-acs-override/-/raw/master/workspaces/5.10.4/acso.patch --- .../admin-guide/kernel-parameters.txt | 9 ++ drivers/pci/quirks.c | 101 ++++++++++++++++++ 2 files changed, 110 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index e3d0acb92612bc..de9aec04321aed 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4145,6 +4145,15 @@ nomsi [MSI] If the PCI_MSI kernel config parameter is enabled, this kernel boot option can be used to disable the use of MSI interrupts system-wide. 
+	pcie_acs_override=
+			[PCIE] Override missing PCIe ACS support for:
+		downstream
+			All downstream ports - full ACS capabilities
+		multifunction
+			All multifunction devices - multifunction ACS subset
+		id:nnnn:nnnn
+			Specific device - full ACS capabilities
+			Specified as vid:did (vendor/device ID) in hex
 	noioapicquirk	[APIC] Disable all boot interrupt quirks.
 			Safety option to keep boot IRQs enabled. This
 			should never be necessary.
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 285acc4aaccc1e..492e88a99c0727 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -3612,6 +3612,106 @@ static void quirk_no_bus_reset(struct pci_dev *dev)
 	dev->dev_flags |= PCI_DEV_FLAGS_NO_BUS_RESET;
 }
 
+static bool acs_on_downstream;
+static bool acs_on_multifunction;
+
+#define NUM_ACS_IDS 16
+struct acs_on_id {
+	unsigned short vendor;
+	unsigned short device;
+};
+static struct acs_on_id acs_on_ids[NUM_ACS_IDS];
+static u8 max_acs_id;
+
+static __init int pcie_acs_override_setup(char *p)
+{
+	if (!p)
+		return -EINVAL;
+
+	while (*p) {
+		if (!strncmp(p, "downstream", 10))
+			acs_on_downstream = true;
+		if (!strncmp(p, "multifunction", 13))
+			acs_on_multifunction = true;
+		if (!strncmp(p, "id:", 3)) {
+			char opt[5];
+			int ret;
+			long val;
+
+			if (max_acs_id >= NUM_ACS_IDS - 1) {
+				pr_warn("Out of PCIe ACS override slots (%d)\n",
+					NUM_ACS_IDS);
+				goto next;
+			}
+
+			p += 3;
+			snprintf(opt, 5, "%s", p);
+			ret = kstrtol(opt, 16, &val);
+			if (ret) {
+				pr_warn("PCIe ACS ID parse error %d\n", ret);
+				goto next;
+			}
+			acs_on_ids[max_acs_id].vendor = val;
+
+			p += strcspn(p, ":");
+			if (*p != ':') {
+				pr_warn("PCIe ACS invalid ID\n");
+				goto next;
+			}
+
+			p++;
+			snprintf(opt, 5, "%s", p);
+			ret = kstrtol(opt, 16, &val);
+			if (ret) {
+				pr_warn("PCIe ACS ID parse error %d\n", ret);
+				goto next;
+			}
+			acs_on_ids[max_acs_id].device = val;
+			max_acs_id++;
+		}
+next:
+		p += strcspn(p, ",");
+		if (*p == ',')
+			p++;
+	}
+
+	if (acs_on_downstream || acs_on_multifunction || max_acs_id)
+		pr_warn("Warning: PCIe ACS overrides enabled; This may allow non-IOMMU protected peer-to-peer DMA\n");
+
+	return 0;
+}
+early_param("pcie_acs_override", pcie_acs_override_setup);
+
+static int pcie_acs_overrides(struct pci_dev *dev, u16 acs_flags)
+{
+	int i;
+
+	/* Never override ACS for legacy devices or devices with ACS caps */
+	if (!pci_is_pcie(dev) ||
+	    pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ACS))
+		return -ENOTTY;
+
+	for (i = 0; i < max_acs_id; i++)
+		if (acs_on_ids[i].vendor == dev->vendor &&
+		    acs_on_ids[i].device == dev->device)
+			return 1;
+
+	switch (pci_pcie_type(dev)) {
+	case PCI_EXP_TYPE_DOWNSTREAM:
+	case PCI_EXP_TYPE_ROOT_PORT:
+		if (acs_on_downstream)
+			return 1;
+		break;
+	case PCI_EXP_TYPE_ENDPOINT:
+	case PCI_EXP_TYPE_UPSTREAM:
+	case PCI_EXP_TYPE_LEG_END:
+	case PCI_EXP_TYPE_RC_END:
+		if (acs_on_multifunction && dev->multifunction)
+			return 1;
+	}
+
+	return -ENOTTY;
+}
+
 /*
  * Some NVIDIA GPU devices do not work with bus reset, SBR needs to be
  * prevented for those affected devices.
@@ -4980,6 +5080,7 @@ static const struct pci_dev_acs_enabled { { PCI_VENDOR_ID_NXP, 0x8d9b, pci_quirk_nxp_rp_acs }, /* Zhaoxin Root/Downstream Ports */ { PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs }, + { PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides }, { 0 } }; From 8b9d4947de23f6eb20720aa8a9abf796f471e7bc Mon Sep 17 00:00:00 2001 From: "Jan Alexander Steffens (heftig)" Date: Mon, 3 Aug 2020 21:18:53 +0200 Subject: [PATCH 10/37] ZEN: Add OpenRGB patches Squashed commit of the following: commit 942ded8f8652a4b4e6b46d04938bb66f1eac4c78 Author: Steven Barrett Date: Sat Jul 4 21:28:54 2020 -0500 openrgb: Deduplicate piix4 setup for HUDSON2/KERNCZ SMBUS In the original OpenRGB patches, the initialization code for CZ secondary SMBus controllers was copied. Later, an upstream commit landed in 5.7 stable that combined the initialization code for both primary/secondary smbus channels. Combine the initialization code ahead of time so upstream merges correctly with OpenRGB. Fixes: https://github.com/zen-kernel/zen-kernel/issues/176 commit a65e3ecf90b24fd44689cc0713af602965ffaf4e Author: Steven Barrett Date: Wed Jun 17 14:24:20 2020 -0500 Add OpenRGB patch - 0c45e26c Source: https://gitlab.com/CalcProgrammer1/OpenRGB/-/raw/master/OpenRGB.patch History: https://gitlab.com/CalcProgrammer1/OpenRGB/-/commits/master/OpenRGB.patch --- drivers/i2c/busses/Kconfig | 9 + drivers/i2c/busses/Makefile | 1 + drivers/i2c/busses/i2c-nct6775.c | 647 +++++++++++++++++++++++++++++++ drivers/i2c/busses/i2c-piix4.c | 4 +- 4 files changed, 659 insertions(+), 2 deletions(-) create mode 100644 drivers/i2c/busses/i2c-nct6775.c diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index e50f9603d189e8..ef652f896599e3 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -229,6 +229,15 @@ config I2C_CHT_WC combined with a FUSB302 Type-C port-controller as such it is advised to also select CONFIG_TYPEC_FUSB302=m. +config I2C_NCT6775 + tristate "Nuvoton NCT6775 and compatible SMBus controller" + help + If you say yes to this option, support will be included for the + Nuvoton NCT6775 and compatible SMBus controllers. + + This driver can also be built as a module. If so, the module + will be called i2c-nct6775. 
+ config I2C_NFORCE2 tristate "Nvidia nForce2, nForce3 and nForce4" depends on PCI diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile index e73cdb1d2b5a85..052ccd05c13c31 100644 --- a/drivers/i2c/busses/Makefile +++ b/drivers/i2c/busses/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_I2C_CHT_WC) += i2c-cht-wc.o obj-$(CONFIG_I2C_I801) += i2c-i801.o obj-$(CONFIG_I2C_ISCH) += i2c-isch.o obj-$(CONFIG_I2C_ISMT) += i2c-ismt.o +obj-$(CONFIG_I2C_NCT6775) += i2c-nct6775.o obj-$(CONFIG_I2C_NFORCE2) += i2c-nforce2.o obj-$(CONFIG_I2C_NFORCE2_S4985) += i2c-nforce2-s4985.o obj-$(CONFIG_I2C_NVIDIA_GPU) += i2c-nvidia-gpu.o diff --git a/drivers/i2c/busses/i2c-nct6775.c b/drivers/i2c/busses/i2c-nct6775.c new file mode 100644 index 00000000000000..0462f095204311 --- /dev/null +++ b/drivers/i2c/busses/i2c-nct6775.c @@ -0,0 +1,647 @@ +/* + * i2c-nct6775 - Driver for the SMBus master functionality of + * Nuvoton NCT677x Super-I/O chips + * + * Copyright (C) 2019 Adam Honse + * + * Derived from nct6775 hwmon driver + * Copyright (C) 2012 Guenter Roeck + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DRVNAME "i2c-nct6775" + +/* Nuvoton SMBus address offsets */ +#define SMBHSTDAT (0 + nuvoton_nct6793d_smba) +#define SMBBLKSZ (1 + nuvoton_nct6793d_smba) +#define SMBHSTCMD (2 + nuvoton_nct6793d_smba) +#define SMBHSTIDX (3 + nuvoton_nct6793d_smba) //Index field is the Command field on other controllers +#define SMBHSTCTL (4 + nuvoton_nct6793d_smba) +#define SMBHSTADD (5 + nuvoton_nct6793d_smba) +#define SMBHSTERR (9 + nuvoton_nct6793d_smba) +#define SMBHSTSTS (0xE + nuvoton_nct6793d_smba) + +/* Command register */ +#define NCT6793D_READ_BYTE 0 +#define NCT6793D_READ_WORD 1 +#define NCT6793D_READ_BLOCK 2 +#define NCT6793D_BLOCK_WRITE_READ_PROC_CALL 3 +#define NCT6793D_PROC_CALL 4 +#define NCT6793D_WRITE_BYTE 8 +#define NCT6793D_WRITE_WORD 9 +#define NCT6793D_WRITE_BLOCK 10 + +/* Control register */ +#define NCT6793D_MANUAL_START 128 +#define NCT6793D_SOFT_RESET 64 + +/* Error register */ +#define NCT6793D_NO_ACK 32 + +/* Status register */ +#define NCT6793D_FIFO_EMPTY 1 +#define NCT6793D_FIFO_FULL 2 +#define NCT6793D_MANUAL_ACTIVE 4 + +#define NCT6775_LD_SMBUS 0x0B + +/* Other settings */ +#define MAX_RETRIES 400 + +enum kinds { nct6106, nct6775, nct6776, nct6779, nct6791, nct6792, nct6793, + nct6795, nct6796, nct6798 }; + +struct nct6775_sio_data { + int sioreg; + enum kinds kind; +}; + +/* used to set data->name = nct6775_device_names[data->sio_kind] */ +static const char * const nct6775_device_names[] = { + "nct6106", + "nct6775", + "nct6776", + "nct6779", + "nct6791", + "nct6792", + "nct6793", + "nct6795", + "nct6796", + "nct6798", +}; + +static const 
char * const nct6775_sio_names[] __initconst = { + "NCT6106D", + "NCT6775F", + "NCT6776D/F", + "NCT6779D", + "NCT6791D", + "NCT6792D", + "NCT6793D", + "NCT6795D", + "NCT6796D", + "NCT6798D", +}; + +#define SIO_REG_LDSEL 0x07 /* Logical device select */ +#define SIO_REG_DEVID 0x20 /* Device ID (2 bytes) */ +#define SIO_REG_SMBA 0x62 /* SMBus base address register */ + +#define SIO_NCT6106_ID 0xc450 +#define SIO_NCT6775_ID 0xb470 +#define SIO_NCT6776_ID 0xc330 +#define SIO_NCT6779_ID 0xc560 +#define SIO_NCT6791_ID 0xc800 +#define SIO_NCT6792_ID 0xc910 +#define SIO_NCT6793_ID 0xd120 +#define SIO_NCT6795_ID 0xd350 +#define SIO_NCT6796_ID 0xd420 +#define SIO_NCT6798_ID 0xd428 +#define SIO_ID_MASK 0xFFF0 + +static inline void +superio_outb(int ioreg, int reg, int val) +{ + outb(reg, ioreg); + outb(val, ioreg + 1); +} + +static inline int +superio_inb(int ioreg, int reg) +{ + outb(reg, ioreg); + return inb(ioreg + 1); +} + +static inline void +superio_select(int ioreg, int ld) +{ + outb(SIO_REG_LDSEL, ioreg); + outb(ld, ioreg + 1); +} + +static inline int +superio_enter(int ioreg) +{ + /* + * Try to reserve and for exclusive access. + */ + if (!request_muxed_region(ioreg, 2, DRVNAME)) + return -EBUSY; + + outb(0x87, ioreg); + outb(0x87, ioreg); + + return 0; +} + +static inline void +superio_exit(int ioreg) +{ + outb(0xaa, ioreg); + outb(0x02, ioreg); + outb(0x02, ioreg + 1); + release_region(ioreg, 2); +} + +/* + * ISA constants + */ + +#define IOREGION_ALIGNMENT (~7) +#define IOREGION_LENGTH 2 +#define ADDR_REG_OFFSET 0 +#define DATA_REG_OFFSET 1 + +#define NCT6775_REG_BANK 0x4E +#define NCT6775_REG_CONFIG 0x40 + +static struct i2c_adapter *nct6775_adapter; + +struct i2c_nct6775_adapdata { + unsigned short smba; +}; + +/* Return negative errno on error. 
*/ +static s32 nct6775_access(struct i2c_adapter * adap, u16 addr, + unsigned short flags, char read_write, + u8 command, int size, union i2c_smbus_data * data) +{ + struct i2c_nct6775_adapdata *adapdata = i2c_get_adapdata(adap); + unsigned short nuvoton_nct6793d_smba = adapdata->smba; + int i, len, cnt; + union i2c_smbus_data tmp_data; + int timeout = 0; + + tmp_data.word = 0; + cnt = 0; + len = 0; + + outb_p(NCT6793D_SOFT_RESET, SMBHSTCTL); + + switch (size) { + case I2C_SMBUS_QUICK: + outb_p((addr << 1) | read_write, + SMBHSTADD); + break; + case I2C_SMBUS_BYTE_DATA: + tmp_data.byte = data->byte; + case I2C_SMBUS_BYTE: + outb_p((addr << 1) | read_write, + SMBHSTADD); + outb_p(command, SMBHSTIDX); + if (read_write == I2C_SMBUS_WRITE) { + outb_p(tmp_data.byte, SMBHSTDAT); + outb_p(NCT6793D_WRITE_BYTE, SMBHSTCMD); + } + else { + outb_p(NCT6793D_READ_BYTE, SMBHSTCMD); + } + break; + case I2C_SMBUS_WORD_DATA: + outb_p((addr << 1) | read_write, + SMBHSTADD); + outb_p(command, SMBHSTIDX); + if (read_write == I2C_SMBUS_WRITE) { + outb_p(data->word & 0xff, SMBHSTDAT); + outb_p((data->word & 0xff00) >> 8, SMBHSTDAT); + outb_p(NCT6793D_WRITE_WORD, SMBHSTCMD); + } + else { + outb_p(NCT6793D_READ_WORD, SMBHSTCMD); + } + break; + case I2C_SMBUS_BLOCK_DATA: + outb_p((addr << 1) | read_write, + SMBHSTADD); + outb_p(command, SMBHSTIDX); + if (read_write == I2C_SMBUS_WRITE) { + len = data->block[0]; + if (len == 0 || len > I2C_SMBUS_BLOCK_MAX) + return -EINVAL; + outb_p(len, SMBBLKSZ); + + cnt = 1; + if (len >= 4) { + for (i = cnt; i <= 4; i++) { + outb_p(data->block[i], SMBHSTDAT); + } + + len -= 4; + cnt += 4; + } + else { + for (i = cnt; i <= len; i++ ) { + outb_p(data->block[i], SMBHSTDAT); + } + + len = 0; + } + + outb_p(NCT6793D_WRITE_BLOCK, SMBHSTCMD); + } + else { + return -ENOTSUPP; + } + break; + default: + dev_warn(&adap->dev, "Unsupported transaction %d\n", size); + return -EOPNOTSUPP; + } + + outb_p(NCT6793D_MANUAL_START, SMBHSTCTL); + + while ((size == I2C_SMBUS_BLOCK_DATA) && (len > 0)) { + if (read_write == I2C_SMBUS_WRITE) { + timeout = 0; + while ((inb_p(SMBHSTSTS) & NCT6793D_FIFO_EMPTY) == 0) + { + if(timeout > MAX_RETRIES) + { + return -ETIMEDOUT; + } + usleep_range(250, 500); + timeout++; + } + + //Load more bytes into FIFO + if (len >= 4) { + for (i = cnt; i <= (cnt + 4); i++) { + outb_p(data->block[i], SMBHSTDAT); + } + + len -= 4; + cnt += 4; + } + else { + for (i = cnt; i <= (cnt + len); i++) { + outb_p(data->block[i], SMBHSTDAT); + } + + len = 0; + } + } + else { + return -ENOTSUPP; + } + + } + + //wait for manual mode to complete + timeout = 0; + while ((inb_p(SMBHSTSTS) & NCT6793D_MANUAL_ACTIVE) != 0) + { + if(timeout > MAX_RETRIES) + { + return -ETIMEDOUT; + } + usleep_range(250, 500); + timeout++; + } + + if ((inb_p(SMBHSTERR) & NCT6793D_NO_ACK) != 0) { + return -ENXIO; + } + else if ((read_write == I2C_SMBUS_WRITE) || (size == I2C_SMBUS_QUICK)) { + return 0; + } + + switch (size) { + case I2C_SMBUS_QUICK: + case I2C_SMBUS_BYTE_DATA: + data->byte = inb_p(SMBHSTDAT); + break; + case I2C_SMBUS_WORD_DATA: + data->word = inb_p(SMBHSTDAT) + (inb_p(SMBHSTDAT) << 8); + break; + } + return 0; +} + +static u32 nct6775_func(struct i2c_adapter *adapter) +{ + return I2C_FUNC_SMBUS_QUICK | I2C_FUNC_SMBUS_BYTE | + I2C_FUNC_SMBUS_BYTE_DATA | I2C_FUNC_SMBUS_WORD_DATA | + I2C_FUNC_SMBUS_BLOCK_DATA; +} + +static const struct i2c_algorithm smbus_algorithm = { + .smbus_xfer = nct6775_access, + .functionality = nct6775_func, +}; + +static int nct6775_add_adapter(unsigned short smba, const char 
*name, struct i2c_adapter **padap) +{ + struct i2c_adapter *adap; + struct i2c_nct6775_adapdata *adapdata; + int retval; + + adap = kzalloc(sizeof(*adap), GFP_KERNEL); + if (adap == NULL) { + return -ENOMEM; + } + + adap->owner = THIS_MODULE; + adap->class = I2C_CLASS_HWMON | I2C_CLASS_SPD; + adap->algo = &smbus_algorithm; + + adapdata = kzalloc(sizeof(*adapdata), GFP_KERNEL); + if (adapdata == NULL) { + kfree(adap); + return -ENOMEM; + } + + adapdata->smba = smba; + + snprintf(adap->name, sizeof(adap->name), + "SMBus NCT67xx adapter%s at %04x", name, smba); + + i2c_set_adapdata(adap, adapdata); + + retval = i2c_add_adapter(adap); + if (retval) { + kfree(adapdata); + kfree(adap); + return retval; + } + + *padap = adap; + return 0; +} + +static void nct6775_remove_adapter(struct i2c_adapter *adap) +{ + struct i2c_nct6775_adapdata *adapdata = i2c_get_adapdata(adap); + + if (adapdata->smba) { + i2c_del_adapter(adap); + kfree(adapdata); + kfree(adap); + } +} + +//static SIMPLE_DEV_PM_OPS(nct6775_dev_pm_ops, nct6775_suspend, nct6775_resume); + +/* + * when Super-I/O functions move to a separate file, the Super-I/O + * bus will manage the lifetime of the device and this module will only keep + * track of the nct6775 driver. But since we use platform_device_alloc(), we + * must keep track of the device + */ +static struct platform_device *pdev[2]; + +static int nct6775_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct nct6775_sio_data *sio_data = dev_get_platdata(dev); + struct resource *res; + + res = platform_get_resource(pdev, IORESOURCE_IO, 0); + if (!devm_request_region(&pdev->dev, res->start, IOREGION_LENGTH, + DRVNAME)) + return -EBUSY; + + switch (sio_data->kind) { + case nct6791: + case nct6792: + case nct6793: + case nct6795: + case nct6796: + case nct6798: + nct6775_add_adapter(res->start, "", &nct6775_adapter); + break; + default: + return -ENODEV; + } + + return 0; +} +/* +static void nct6791_enable_io_mapping(int sioaddr) +{ + int val; + + val = superio_inb(sioaddr, NCT6791_REG_HM_IO_SPACE_LOCK_ENABLE); + if (val & 0x10) { + pr_info("Enabling hardware monitor logical device mappings.\n"); + superio_outb(sioaddr, NCT6791_REG_HM_IO_SPACE_LOCK_ENABLE, + val & ~0x10); + } +}*/ + +static struct platform_driver i2c_nct6775_driver = { + .driver = { + .name = DRVNAME, +// .pm = &nct6775_dev_pm_ops, + }, + .probe = nct6775_probe, +}; + +static void __exit i2c_nct6775_exit(void) +{ + int i; + + if(nct6775_adapter) + nct6775_remove_adapter(nct6775_adapter); + + for (i = 0; i < ARRAY_SIZE(pdev); i++) { + if (pdev[i]) + platform_device_unregister(pdev[i]); + } + platform_driver_unregister(&i2c_nct6775_driver); +} + +/* nct6775_find() looks for a '627 in the Super-I/O config space */ +static int __init nct6775_find(int sioaddr, struct nct6775_sio_data *sio_data) +{ + u16 val; + int err; + int addr; + + err = superio_enter(sioaddr); + if (err) + return err; + + val = (superio_inb(sioaddr, SIO_REG_DEVID) << 8) | + superio_inb(sioaddr, SIO_REG_DEVID + 1); + + switch (val & SIO_ID_MASK) { + case SIO_NCT6106_ID: + sio_data->kind = nct6106; + break; + case SIO_NCT6775_ID: + sio_data->kind = nct6775; + break; + case SIO_NCT6776_ID: + sio_data->kind = nct6776; + break; + case SIO_NCT6779_ID: + sio_data->kind = nct6779; + break; + case SIO_NCT6791_ID: + sio_data->kind = nct6791; + break; + case SIO_NCT6792_ID: + sio_data->kind = nct6792; + break; + case SIO_NCT6793_ID: + sio_data->kind = nct6793; + break; + case SIO_NCT6795_ID: + sio_data->kind = nct6795; + break; + case 
SIO_NCT6796_ID: + sio_data->kind = nct6796; + break; + case SIO_NCT6798_ID: + sio_data->kind = nct6798; + break; + default: + if (val != 0xffff) + pr_debug("unsupported chip ID: 0x%04x\n", val); + superio_exit(sioaddr); + return -ENODEV; + } + + /* We have a known chip, find the SMBus I/O address */ + superio_select(sioaddr, NCT6775_LD_SMBUS); + val = (superio_inb(sioaddr, SIO_REG_SMBA) << 8) + | superio_inb(sioaddr, SIO_REG_SMBA + 1); + addr = val & IOREGION_ALIGNMENT; + if (addr == 0) { + pr_err("Refusing to enable a Super-I/O device with a base I/O port 0\n"); + superio_exit(sioaddr); + return -ENODEV; + } + + //if (sio_data->kind == nct6791 || sio_data->kind == nct6792 || + // sio_data->kind == nct6793 || sio_data->kind == nct6795 || + // sio_data->kind == nct6796) + // nct6791_enable_io_mapping(sioaddr); + + superio_exit(sioaddr); + pr_info("Found %s or compatible chip at %#x:%#x\n", + nct6775_sio_names[sio_data->kind], sioaddr, addr); + sio_data->sioreg = sioaddr; + + return addr; +} + +static int __init i2c_nct6775_init(void) +{ + int i, err; + bool found = false; + int address; + struct resource res; + struct nct6775_sio_data sio_data; + int sioaddr[2] = { 0x2e, 0x4e }; + + err = platform_driver_register(&i2c_nct6775_driver); + if (err) + return err; + + /* + * initialize sio_data->kind and sio_data->sioreg. + * + * when Super-I/O functions move to a separate file, the Super-I/O + * driver will probe 0x2e and 0x4e and auto-detect the presence of a + * nct6775 hardware monitor, and call probe() + */ + for (i = 0; i < ARRAY_SIZE(pdev); i++) { + address = nct6775_find(sioaddr[i], &sio_data); + if (address <= 0) + continue; + + found = true; + + pdev[i] = platform_device_alloc(DRVNAME, address); + if (!pdev[i]) { + err = -ENOMEM; + goto exit_device_unregister; + } + + err = platform_device_add_data(pdev[i], &sio_data, + sizeof(struct nct6775_sio_data)); + if (err) + goto exit_device_put; + + memset(&res, 0, sizeof(res)); + res.name = DRVNAME; + res.start = address; + res.end = address + IOREGION_LENGTH - 1; + res.flags = IORESOURCE_IO; + + err = acpi_check_resource_conflict(&res); + if (err) { + platform_device_put(pdev[i]); + pdev[i] = NULL; + continue; + } + + err = platform_device_add_resources(pdev[i], &res, 1); + if (err) + goto exit_device_put; + + /* platform_device_add calls probe() */ + err = platform_device_add(pdev[i]); + if (err) + goto exit_device_put; + } + if (!found) { + err = -ENODEV; + goto exit_unregister; + } + + return 0; + +exit_device_put: + platform_device_put(pdev[i]); +exit_device_unregister: + while (--i >= 0) { + if (pdev[i]) + platform_device_unregister(pdev[i]); + } +exit_unregister: + platform_driver_unregister(&i2c_nct6775_driver); + return err; +} + +MODULE_AUTHOR("Adam Honse "); +MODULE_DESCRIPTION("SMBus driver for NCT6775F and compatible chips"); +MODULE_LICENSE("GPL"); + +module_init(i2c_nct6775_init); +module_exit(i2c_nct6775_exit); diff --git a/drivers/i2c/busses/i2c-piix4.c b/drivers/i2c/busses/i2c-piix4.c index 809fbd014cd683..d54b35b147ee97 100644 --- a/drivers/i2c/busses/i2c-piix4.c +++ b/drivers/i2c/busses/i2c-piix4.c @@ -568,11 +568,11 @@ static int piix4_transaction(struct i2c_adapter *piix4_adapter) if (srvrworks_csb5_delay) /* Extra delay for SERVERWORKS_CSB5 */ usleep_range(2000, 2100); else - usleep_range(250, 500); + usleep_range(25, 50); while ((++timeout < MAX_TIMEOUT) && ((temp = inb_p(SMBHSTSTS)) & 0x01)) - usleep_range(250, 500); + usleep_range(25, 50); /* If the SMBus is still busy, we give up */ if (timeout == MAX_TIMEOUT) { 
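
A note on the timing change above: both the NCT67xx driver added here and the piix4 tweak poll a host status register in a sleep loop, so the usleep_range() bounds put a floor under every SMBus transaction's latency. Dropping piix4 from 250-500us to 25-50us per poll lowers that floor by roughly an order of magnitude. A minimal sketch of the pattern (illustrative only; SMBHSTSTS_BUSY and MAX_RETRIES are stand-in names, not the drivers' exact symbols):

static int smbus_wait_idle(void)
{
	int tries = 0;

	/* Poll the host-status register until the busy bit clears. */
	while (inb_p(SMBHSTSTS) & SMBHSTSTS_BUSY) {
		if (++tries > MAX_RETRIES)
			return -ETIMEDOUT;
		/* The sleep bounds trade CPU wakeups against latency:
		 * 25-50us per iteration instead of 250-500us lets short
		 * transactions complete up to ~10x sooner. */
		usleep_range(25, 50);
	}
	return 0;
}
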
From b0141466e0333c3ecb54b28415f27793dcf7d166 Mon Sep 17 00:00:00 2001
From: Peter Jung
Date: Thu, 10 Nov 2022 21:15:20 +0100
Subject: [PATCH 11/37] Makefile.debug: support for -gz=zstd

Make DEBUG_INFO_COMPRESSED a choice; DEBUG_INFO_COMPRESSED_NONE is the
default, DEBUG_INFO_COMPRESSED_ZLIB uses zlib, DEBUG_INFO_COMPRESSED_ZSTD
uses zstd.

This renames the existing Kconfig option DEBUG_INFO_COMPRESSED to
DEBUG_INFO_COMPRESSED_ZLIB, so users upgrading may need to reset the new
Kconfigs.

Some quick N=1 measurements with du, /usr/bin/time -v, and bloaty:

clang-16, x86_64 defconfig plus
CONFIG_DEBUG_INFO=y CONFIG_DEBUG_INFO_COMPRESSED_NONE=y:
Elapsed (wall clock) time (h:mm:ss or m:ss): 0:55.43
488M vmlinux
27.6% 136Mi 0.0% 0 .debug_info
6.1% 30.2Mi 0.0% 0 .debug_str_offsets
3.5% 17.2Mi 0.0% 0 .debug_line
3.3% 16.3Mi 0.0% 0 .debug_loclists
0.9% 4.62Mi 0.0% 0 .debug_str

clang-16, x86_64 defconfig plus
CONFIG_DEBUG_INFO=y CONFIG_DEBUG_INFO_COMPRESSED_ZLIB=y:
Elapsed (wall clock) time (h:mm:ss or m:ss): 1:00.35
385M vmlinux
21.8% 85.4Mi 0.0% 0 .debug_info
2.1% 8.26Mi 0.0% 0 .debug_str_offsets
2.1% 8.24Mi 0.0% 0 .debug_loclists
1.9% 7.48Mi 0.0% 0 .debug_line
0.5% 1.94Mi 0.0% 0 .debug_str

clang-16, x86_64 defconfig plus
CONFIG_DEBUG_INFO=y CONFIG_DEBUG_INFO_COMPRESSED_ZSTD=y:
Elapsed (wall clock) time (h:mm:ss or m:ss): 0:59.69
373M vmlinux
21.4% 81.4Mi 0.0% 0 .debug_info
2.3% 8.85Mi 0.0% 0 .debug_loclists
1.5% 5.71Mi 0.0% 0 .debug_line
0.5% 1.95Mi 0.0% 0 .debug_str_offsets
0.4% 1.62Mi 0.0% 0 .debug_str

That's only a 3.11% overall binary size savings over zlib, but with no
performance regression.

Link: https://maskray.me/blog/2022-09-09-zstd-compressed-debug-sections
Link: https://maskray.me/blog/2022-01-23-compressed-debug-sections
Suggested-by: Sedat Dilek (DHL Supply Chain)
Reviewed-by: Nathan Chancellor
Signed-off-by: Nick Desaulniers
---
Changes v2 -> v3:
* Fix scripts/Makefile.debug as per Sedat.
* Update commit message as per Nicolas.
Changes v1 -> v2:
* Remove `depends on DEBUG_KERNEL` as per Nathan.
* Rename Kconfigs as per Sedat and Masahiro.
* Add note about renamed Kconfigs to commit message.
* Add more help text to DEBUG_INFO_COMPRESSED_ZSTD.

Signed-off-by: Peter Jung
---
 lib/Kconfig.debug      | 29 +++++++++++++++++++++++++--
 scripts/Makefile.debug |  6 +++++-
 2 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 3638b3424be531..12cff76aeb9381 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -312,8 +312,21 @@ config DEBUG_INFO_REDUCED
 	  DEBUG_INFO build and compile times are reduced too.
 	  Only works with newer gcc versions.
 
-config DEBUG_INFO_COMPRESSED
-	bool "Compressed debugging information"
+choice
+	prompt "Compressed Debug information"
+	help
+	  Compress the resulting debug info. Results in smaller debug info sections,
+	  but requires that consumers are able to decompress the results.
+
+	  If unsure, choose DEBUG_INFO_COMPRESSED_NONE.
+
+config DEBUG_INFO_COMPRESSED_NONE
+	bool "Don't compress debug information"
+	help
+	  Don't compress debug info sections.
+
+config DEBUG_INFO_COMPRESSED_ZLIB
+	bool "Compress debugging information with zlib"
 	depends on $(cc-option,-gz=zlib)
 	depends on $(ld-option,--compress-debug-sections=zlib)
 	help
@@ -327,6 +340,18 @@ config DEBUG_INFO_COMPRESSED
 	  preferable to setting $KDEB_COMPRESS to "none" which would
 	  be even larger.
+config DEBUG_INFO_COMPRESSED_ZSTD + bool "Compress debugging information with zstd" + depends on $(cc-option,-gz=zstd) + depends on $(ld-option,--compress-debug-sections=zstd) + help + Compress the debug information using zstd. This may provide better + compression than zlib, for about the same time costs, but requires newer + toolchain support. Requires GCC 13.0+ or Clang 16.0+, binutils 2.40+, and + zstd. + +endchoice # "Compressed Debug information" + config DEBUG_INFO_SPLIT bool "Produce split debuginfo in .dwo files" depends on $(cc-option,-gsplit-dwarf) diff --git a/scripts/Makefile.debug b/scripts/Makefile.debug index 332c486f705f75..059ff38fe0cb31 100644 --- a/scripts/Makefile.debug +++ b/scripts/Makefile.debug @@ -27,10 +27,14 @@ else DEBUG_RUSTFLAGS += -Cdebuginfo=2 endif -ifdef CONFIG_DEBUG_INFO_COMPRESSED +ifdef CONFIG_DEBUG_INFO_COMPRESSED_ZLIB DEBUG_CFLAGS += -gz=zlib KBUILD_AFLAGS += -gz=zlib KBUILD_LDFLAGS += --compress-debug-sections=zlib +else ifdef CONFIG_DEBUG_INFO_COMPRESSED_ZSTD +DEBUG_CFLAGS += -gz=zstd +KBUILD_AFLAGS += -gz=zstd +KBUILD_LDFLAGS += --compress-debug-sections=zstd endif KBUILD_CFLAGS += $(DEBUG_CFLAGS) From 25205e4a3dd76c0858870bc8e7427accad0fb123 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Sun, 8 Mar 2020 00:31:35 -0800 Subject: [PATCH 12/37] ZEN: Disable stack conservation for GCC There's plenty of room on the stack for a few more inlined bytes here and there. The measured stack usage at runtime is still safe without this, and performance is surely improved at a microscopic level, so remove it. Signed-off-by: Sultan Alsawaf --- Makefile | 5 ----- 1 file changed, 5 deletions(-) diff --git a/Makefile b/Makefile index 98c9a974b8c2ae..006680a6f13d51 100644 --- a/Makefile +++ b/Makefile @@ -1062,11 +1062,6 @@ KBUILD_CFLAGS += -fno-strict-overflow # Make sure -fstack-check isn't enabled (like gentoo apparently did) KBUILD_CFLAGS += -fno-stack-check -# conserve stack if available -ifdef CONFIG_CC_IS_GCC -KBUILD_CFLAGS += -fconserve-stack -endif - # Prohibit date/time macros, which would make the build non-deterministic KBUILD_CFLAGS += -Werror=date-time From 618840572ddc618d092b7c2e56070086dae5e4cd Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Tue, 6 Dec 2022 23:13:24 +0100 Subject: [PATCH 13/37] module/decompress: Support zstd in-kernel decompression Add support for zstd compressed modules to the in-kernel decompression code. This allows zstd compressed modules to be decompressed by the kernel, similar to the existing support for gzip and xz compressed modules. 
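
Condensed, the streaming flow the patch implements looks as follows. This is a sketch only, assuming the kernel's zstd wrappers from <linux/zstd.h>; the real code below writes into per-module pages via kmap_local_page() rather than one flat buffer, and zstd_decompress_sketch() is a hypothetical helper name, not part of the patch:

#include <linux/slab.h>
#include <linux/zstd.h>

static ssize_t zstd_decompress_sketch(const void *src, size_t src_len,
				      void *dst, size_t dst_len)
{
	zstd_in_buffer in = { .src = src, .size = src_len, .pos = 0 };
	zstd_out_buffer out = { .dst = dst, .size = dst_len, .pos = 0 };
	zstd_frame_header header;
	zstd_dstream *ds;
	size_t wksp_size, ret;
	void *wksp;

	/* Size a decompression workspace from the frame's declared window. */
	if (zstd_get_frame_header(&header, src, src_len) != 0)
		return -EINVAL;
	wksp_size = zstd_dstream_workspace_bound(header.windowSize);
	wksp = kmalloc(wksp_size, GFP_KERNEL);
	if (!wksp)
		return -ENOMEM;

	/* Build the stream over the caller-provided workspace. */
	ds = zstd_init_dstream(header.windowSize, wksp, wksp_size);
	if (!ds) {
		kfree(wksp);
		return -ENOMEM;
	}

	/* Pump chunks until the frame is fully decoded (return value 0). */
	do {
		ret = zstd_decompress_stream(ds, &out, &in);
		if (zstd_is_error(ret)) {
			kfree(wksp);
			return -EINVAL;
		}
	} while (ret != 0 && out.pos < out.size);

	kfree(wksp);
	return out.pos;
}
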
Cc: Dmitry Torokhov
Cc: Piotr Gorski
Cc: Nick Terrell
Signed-off-by: Stephen Boyd
Signed-off-by: Peter Jung
---
 kernel/module/Kconfig      |  3 +-
 kernel/module/decompress.c | 92 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 92 insertions(+), 3 deletions(-)

diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
index 26ea5d04f56c2d..424b3bc58f3f51 100644
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -221,9 +221,10 @@ endchoice
 
 config MODULE_DECOMPRESS
 	bool "Support in-kernel module decompression"
-	depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ
+	depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ || MODULE_COMPRESS_ZSTD
 	select ZLIB_INFLATE if MODULE_COMPRESS_GZIP
 	select XZ_DEC if MODULE_COMPRESS_XZ
+	select ZSTD_DECOMPRESS if MODULE_COMPRESS_ZSTD
 	help
 	  Support for decompressing kernel modules by the kernel itself

diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c
index c033572d83f0e8..44f14643d01430 100644
--- a/kernel/module/decompress.c
+++ b/kernel/module/decompress.c
@@ -50,7 +50,7 @@ static struct page *module_get_next_page(struct load_info *info)
 	return page;
 }
 
-#ifdef CONFIG_MODULE_COMPRESS_GZIP
+#if defined(CONFIG_MODULE_COMPRESS_GZIP)
 #include <linux/zlib.h>
 #define MODULE_COMPRESSION	gzip
 #define MODULE_DECOMPRESS_FN	module_gzip_decompress
@@ -141,7 +141,7 @@ static ssize_t module_gzip_decompress(struct load_info *info,
 	kfree(s.workspace);
 	return retval;
 }
-#elif CONFIG_MODULE_COMPRESS_XZ
+#elif defined(CONFIG_MODULE_COMPRESS_XZ)
 #include <linux/xz.h>
 #define MODULE_COMPRESSION	xz
 #define MODULE_DECOMPRESS_FN	module_xz_decompress
@@ -199,6 +199,94 @@ static ssize_t module_xz_decompress(struct load_info *info,
 	xz_dec_end(xz_dec);
 	return retval;
 }
+#elif defined(CONFIG_MODULE_COMPRESS_ZSTD)
+#include <linux/zstd.h>
+#define MODULE_COMPRESSION	zstd
+#define MODULE_DECOMPRESS_FN	module_zstd_decompress
+
+static ssize_t module_zstd_decompress(struct load_info *info,
+				      const void *buf, size_t size)
+{
+	static const u8 signature[] = { 0x28, 0xb5, 0x2f, 0xfd };
+	ZSTD_outBuffer zstd_dec;
+	ZSTD_inBuffer zstd_buf;
+	zstd_frame_header header;
+	size_t wksp_size;
+	void *wksp = NULL;
+	ZSTD_DStream *dstream;
+	size_t ret;
+	size_t new_size = 0;
+	int retval;
+
+	if (size < sizeof(signature) ||
+	    memcmp(buf, signature, sizeof(signature))) {
+		pr_err("not a zstd compressed module\n");
+		return -EINVAL;
+	}
+
+	zstd_buf.src = buf;
+	zstd_buf.pos = 0;
+	zstd_buf.size = size;
+
+	ret = zstd_get_frame_header(&header, zstd_buf.src, zstd_buf.size);
+	if (ret != 0) {
+		pr_err("ZSTD-compressed data has an incomplete frame header\n");
+		retval = -EINVAL;
+		goto out;
+	}
+	if (header.windowSize > (1 << ZSTD_WINDOWLOG_MAX)) {
+		pr_err("ZSTD-compressed data has too large a window size\n");
+		retval = -EINVAL;
+		goto out;
+	}
+
+	wksp_size = zstd_dstream_workspace_bound(header.windowSize);
+	wksp = kmalloc(wksp_size, GFP_KERNEL);
+	if (!wksp) {
+		retval = -ENOMEM;
+		goto out;
+	}
+
+	dstream = zstd_init_dstream(header.windowSize, wksp, wksp_size);
+	if (!dstream) {
+		pr_err("Can't initialize ZSTD stream\n");
+		retval = -ENOMEM;
+		goto out;
+	}
+
+	do {
+		struct page *page = module_get_next_page(info);
+
+		if (IS_ERR(page)) {
+			retval = PTR_ERR(page);
+			goto out;
+		}
+
+		zstd_dec.dst = kmap_local_page(page);
+		zstd_dec.pos = 0;
+		zstd_dec.size = PAGE_SIZE;
+
+		ret = zstd_decompress_stream(dstream, &zstd_dec, &zstd_buf);
+		kunmap_local(zstd_dec.dst);
+		retval = zstd_get_error_code(ret);
+		if (retval)
+			break;
+
+		new_size += zstd_dec.pos;
+	} while (zstd_dec.pos == PAGE_SIZE && ret != 0);
+
+	if (retval) {
pr_err("ZSTD-decompression failed with status %d\n", retval); + retval = -EINVAL; + goto out; + } + + retval = new_size; + + out: + kfree(wksp); + return retval; +} #else #error "Unexpected configuration for CONFIG_MODULE_DECOMPRESS" #endif From 9eeae3b566dd87c88c05b3c1a4d621c6f9810ea1 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Sun, 11 Dec 2022 16:31:54 +0100 Subject: [PATCH 14/37] x86: Avoid relocation information in final vmlinux The Linux build process on x86 roughly consists of compiling all input files, statically linking them into a vmlinux ELF file, and then taking and turning this file into an actual bzImage bootable file. vmlinux has in this process two main purposes: 1) It is an intermediate build target on the way to produce the final bootable image. 2) It is a file that is expected to be used by debuggers and standard ELF tooling to work with the built kernel. For the second purpose, a vmlinux file is typically collected by various package build recipes, such as distribution spec files, including the kernel's own binrpm-pkg target. When building a kernel supporting KASLR with CONFIG_X86_NEED_RELOCS, vmlinux contains also relocation information produced by using the --emit-relocs linker option. This is utilized by subsequent build steps to create vmlinux.relocs and produce a relocatable image. However, the information is not needed by debuggers and other standard ELF tooling. The issue is then that the collected vmlinux file and hence distribution packages end up unnecessarily large because of this extra data. The following is a size comparison of vmlinux v6.0 with and without the relocation information: | Configuration | With relocs | Stripped relocs | | x86_64_defconfig | 70 MB | 43 MB | | +CONFIG_DEBUG_INFO | 818 MB | 367 MB | Optimize a resulting vmlinux by adding a postlink step that splits the relocation information into vmlinux.relocs and then strips it from the vmlinux binary. Signed-off-by: Petr Pavlu --- Changes since v2 [1]: - Ignore only the moved vmlinux.relocs, add it to .gitignore and Documentation/dontdiff. - Clean up the patch description. Changes since v1 [2]: - Fix the command to remove relocations to work with llvm-objcopy too. 
[1] https://lore.kernel.org/lkml/20220927084632.14531-1-petr.pavlu@suse.com/ [2] https://lore.kernel.org/lkml/20220913132911.6850-1-petr.pavlu@suse.com/ Signed-off-by: Peter Jung --- .gitignore | 1 + Documentation/dontdiff | 1 + arch/x86/Makefile.postlink | 41 +++++++++++++++++++++++++++++ arch/x86/boot/compressed/.gitignore | 1 - arch/x86/boot/compressed/Makefile | 10 +++---- 5 files changed, 47 insertions(+), 7 deletions(-) create mode 100644 arch/x86/Makefile.postlink diff --git a/.gitignore b/.gitignore index 5da004814678d0..cb59d89372c014 100644 --- a/.gitignore +++ b/.gitignore @@ -61,6 +61,7 @@ modules.order /vmlinux /vmlinux.32 /vmlinux.map +/vmlinux.relocs /vmlinux.symvers /vmlinux-gdb.py /vmlinuz diff --git a/Documentation/dontdiff b/Documentation/dontdiff index 352ff53a2306ad..7c210744d84c6e 100644 --- a/Documentation/dontdiff +++ b/Documentation/dontdiff @@ -255,6 +255,7 @@ vmlinux.aout vmlinux.bin.all vmlinux.lds vmlinux.map +vmlinux.relocs vmlinux.symvers vmlinuz voffset.h diff --git a/arch/x86/Makefile.postlink b/arch/x86/Makefile.postlink new file mode 100644 index 00000000000000..b38ffa4defb3df --- /dev/null +++ b/arch/x86/Makefile.postlink @@ -0,0 +1,41 @@ +# SPDX-License-Identifier: GPL-2.0 +# =========================================================================== +# Post-link x86 pass +# =========================================================================== +# +# 1. Separate relocations from vmlinux into vmlinux.relocs. +# 2. Strip relocations from vmlinux. + +PHONY := __archpost +__archpost: + +-include include/config/auto.conf +include scripts/Kbuild.include + +CMD_RELOCS = arch/x86/tools/relocs +quiet_cmd_relocs = RELOCS $@.relocs + cmd_relocs = $(CMD_RELOCS) $@ > $@.relocs;$(CMD_RELOCS) --abs-relocs $@ + +quiet_cmd_strip_relocs = RSTRIP $@ + cmd_strip_relocs = $(OBJCOPY) --remove-section='.rel.*' --remove-section='.rel__*' --remove-section='.rela.*' --remove-section='.rela__*' $@ + +# `@true` prevents complaint when there is nothing to be done + +vmlinux: FORCE + @true +ifeq ($(CONFIG_X86_NEED_RELOCS),y) + $(call cmd,relocs) + $(call cmd,strip_relocs) +endif + +%.ko: FORCE + @true + +clean: + @rm -f vmlinux.relocs + +PHONY += FORCE clean + +FORCE: + +.PHONY: $(PHONY) diff --git a/arch/x86/boot/compressed/.gitignore b/arch/x86/boot/compressed/.gitignore index 25805199a50611..b2968175fc2757 100644 --- a/arch/x86/boot/compressed/.gitignore +++ b/arch/x86/boot/compressed/.gitignore @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only relocs vmlinux.bin.all -vmlinux.relocs vmlinux.lds mkpiggy piggy.S diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 3a261abb6d158d..75a467a408d2c9 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -121,14 +121,12 @@ $(obj)/vmlinux.bin: vmlinux FORCE targets += $(patsubst $(obj)/%,%,$(vmlinux-objs-y)) vmlinux.bin.all vmlinux.relocs -CMD_RELOCS = arch/x86/tools/relocs -quiet_cmd_relocs = RELOCS $@ - cmd_relocs = $(CMD_RELOCS) $< > $@;$(CMD_RELOCS) --abs-relocs $< -$(obj)/vmlinux.relocs: vmlinux FORCE - $(call if_changed,relocs) +# vmlinux.relocs is created by the vmlinux postlink step. +vmlinux.relocs: vmlinux + @true vmlinux.bin.all-y := $(obj)/vmlinux.bin -vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs +vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += vmlinux.relocs $(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE $(call if_changed,gzip) From 3b8ada43c9985920e3abe7507c1a50d152b67187 Mon Sep 17 00:00:00 2001 From: "Guilherme G. 
Piccoli" Date: Mon, 24 Oct 2022 17:02:54 -0300 Subject: [PATCH 15/37] x86/split_lock: Add sysctl to control the misery mode Commit b041b525dab9 ("x86/split_lock: Make life miserable for split lockers") changed the way the split lock detector works when in "warn" mode; basically, it not only shows the warn message, but also intentionally introduces a slowdown through sleeping plus serialization mechanism on such task. Based on discussions in [0], seems the warning alone wasn't enough motivation for userspace developers to fix their applications. This slowdown is enough to totally break some proprietary (aka. unfixable) userspace[1]. Happens that originally the proposal in [0] was to add a new mode which would warns + slowdown the "split locking" task, keeping the old warn mode untouched. In the end, that idea was discarded and the regular/default "warn" mode now slows down the applications. This is quite aggressive with regards proprietary/legacy programs that basically are unable to properly run in kernel with this change. While it is understandable that a malicious application could DoS by split locking, it seems unacceptable to regress old/proprietary userspace programs through a default configuration that previously worked. An example of such breakage was reported in [1]. Add a sysctl to allow controlling the "misery mode" behavior, as per Thomas suggestion on [2]. This way, users running legacy and/or proprietary software are allowed to still execute them with a decent performance while still observing the warning messages on kernel log. [0] https://lore.kernel.org/lkml/20220217012721.9694-1-tony.luck@intel.com/ [1] https://github.com/doitsujin/dxvk/issues/2938 [2] https://lore.kernel.org/lkml/87pmf4bter.ffs@tglx/ [ dhansen: minor changelog tweaks, including clarifying the actual problem ] Fixes: b041b525dab9 ("x86/split_lock: Make life miserable for split lockers") Suggested-by: Thomas Gleixner Signed-off-by: Guilherme G. Piccoli Signed-off-by: Dave Hansen Reviewed-by: Tony Luck Tested-by: Andre Almeida Link: https://lore.kernel.org/all/20221024200254.635256-1-gpiccoli%40igalia.com --- Documentation/admin-guide/sysctl/kernel.rst | 23 ++++++++ arch/x86/kernel/cpu/intel.c | 63 +++++++++++++++++---- 2 files changed, 76 insertions(+), 10 deletions(-) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 98d1b198b2b4c1..c2c64c1b706ff6 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -1314,6 +1314,29 @@ watchdog work to be queued by the watchdog timer function, otherwise the NMI watchdog — if enabled — can detect a hard lockup condition. +split_lock_mitigate (x86 only) +============================== + +On x86, each "split lock" imposes a system-wide performance penalty. On larger +systems, large numbers of split locks from unprivileged users can result in +denials of service to well-behaved and potentially more important users. + +The kernel mitigates these bad users by detecting split locks and imposing +penalties: forcing them to wait and only allowing one core to execute split +locks at a time. + +These mitigations can make those bad applications unbearably slow. Setting +split_lock_mitigate=0 may restore some application performance, but will also +increase system exposure to denial of service attacks from split lock users. 
+ += =================================================================== +0 Disable the mitigation mode - just warns the split lock on kernel log + and exposes the system to denials of service from the split lockers. +1 Enable the mitigation mode (this is the default) - penalizes the split + lockers with intentional performance degradation. += =================================================================== + + stack_erasing ============= diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 2d7ea5480ec339..42789965048339 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -1034,8 +1034,32 @@ static const struct { static struct ratelimit_state bld_ratelimit; +static unsigned int sysctl_sld_mitigate = 1; static DEFINE_SEMAPHORE(buslock_sem); +#ifdef CONFIG_PROC_SYSCTL +static struct ctl_table sld_sysctls[] = { + { + .procname = "split_lock_mitigate", + .data = &sysctl_sld_mitigate, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static int __init sld_mitigate_sysctl_init(void) +{ + register_sysctl_init("kernel", sld_sysctls); + return 0; +} + +late_initcall(sld_mitigate_sysctl_init); +#endif + static inline bool match_option(const char *arg, int arglen, const char *opt) { int len = strlen(opt), ratelimit; @@ -1146,12 +1170,20 @@ static void split_lock_init(void) split_lock_verify_msr(sld_state != sld_off); } -static void __split_lock_reenable(struct work_struct *work) +static void __split_lock_reenable_unlock(struct work_struct *work) { sld_update_msr(true); up(&buslock_sem); } +static DECLARE_DELAYED_WORK(sl_reenable_unlock, __split_lock_reenable_unlock); + +static void __split_lock_reenable(struct work_struct *work) +{ + sld_update_msr(true); +} +static DECLARE_DELAYED_WORK(sl_reenable, __split_lock_reenable); + /* * If a CPU goes offline with pending delayed work to re-enable split lock * detection then the delayed work will be executed on some other CPU. That @@ -1169,10 +1201,9 @@ static int splitlock_cpu_offline(unsigned int cpu) return 0; } -static DECLARE_DELAYED_WORK(split_lock_reenable, __split_lock_reenable); - static void split_lock_warn(unsigned long ip) { + struct delayed_work *work; int cpu; if (!current->reported_split_lock) @@ -1180,14 +1211,26 @@ static void split_lock_warn(unsigned long ip) current->comm, current->pid, ip); current->reported_split_lock = 1; - /* misery factor #1, sleep 10ms before trying to execute split lock */ - if (msleep_interruptible(10) > 0) - return; - /* Misery factor #2, only allow one buslocked disabled core at a time */ - if (down_interruptible(&buslock_sem) == -EINTR) - return; + if (sysctl_sld_mitigate) { + /* + * misery factor #1: + * sleep 10ms before trying to execute split lock. + */ + if (msleep_interruptible(10) > 0) + return; + /* + * Misery factor #2: + * only allow one buslocked disabled core at a time. 
+ */ + if (down_interruptible(&buslock_sem) == -EINTR) + return; + work = &sl_reenable_unlock; + } else { + work = &sl_reenable; + } + cpu = get_cpu(); - schedule_delayed_work_on(cpu, &split_lock_reenable, 2); + schedule_delayed_work_on(cpu, work, 2); /* Disable split lock detection on this CPU to make progress */ sld_update_msr(false); From 858ce2c64fc57219d4b794bdb8f94d0b9eeec979 Mon Sep 17 00:00:00 2001 From: Piotr Gorski Date: Thu, 25 Aug 2022 16:28:14 +0200 Subject: [PATCH 16/37] kbuild-modules-6.1: allow setting zstd compression level for modules and the kernel image Signed-off-by: Piotr Gorski --- init/Kconfig | 13 +++++++++++++ kernel/module/Kconfig | 25 +++++++++++++++++++++++++ scripts/Makefile.lib | 13 ++++++++++--- scripts/Makefile.modinst | 7 ++++++- 4 files changed, 54 insertions(+), 4 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index 89a3e52edcb9f9..6aa69cf5af7c47 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -350,6 +350,19 @@ config KERNEL_UNCOMPRESSED endchoice +menu "ZSTD compression options" + depends on KERNEL_ZSTD + +config ZSTD_COMP_VAL + int "Compression level (1-22)" + range 1 22 + default "22" + help + Choose a compression level for zstd kernel compression. + Default is 22, which is the maximum. + +endmenu + config DEFAULT_INIT string "Default init path" default "" diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index 424b3bc58f3f51..ecf2798c5ccf75 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -219,6 +219,31 @@ config MODULE_COMPRESS_ZSTD endchoice +menu "ZSTD module compression options" + depends on MODULE_COMPRESS_ZSTD + +config MODULE_COMPRESS_ZSTD_LEVEL + int "Compression level (1-19)" + range 1 19 + default 9 + help + Compression level used by zstd for compressing modules. + +config MODULE_COMPRESS_ZSTD_ULTRA + bool "Enable ZSTD ultra compression" + help + Compress modules with ZSTD using the highest possible compression. + +config MODULE_COMPRESS_ZSTD_LEVEL_ULTRA + int "Compression level (20-22)" + depends on MODULE_COMPRESS_ZSTD_ULTRA + range 20 22 + default 20 + help + Ultra compression level used by zstd for compressing modules. + +endmenu + config MODULE_DECOMPRESS bool "Support in-kernel module decompression" depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ || MODULE_COMPRESS_ZSTD diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 3aa384cec76b8b..2e16fcca038d38 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -526,14 +526,21 @@ quiet_cmd_xzmisc = XZMISC $@ # decompression is used, like initramfs decompression, zstd22 should likely not # be used because it would require zstd to allocate a 128 MB buffer. 
+ifdef CONFIG_ZSTD_COMP_VAL +zstd_comp_val := $(CONFIG_ZSTD_COMP_VAL) +ifeq ($(shell test $(zstd_comp_val) -gt 19; echo $$?),0) +zstd_comp_val += --ultra +endif +endif + quiet_cmd_zstd = ZSTD $@ - cmd_zstd = cat $(real-prereqs) | $(ZSTD) -19 > $@ + cmd_zstd = cat $(real-prereqs) | $(ZSTD) -T0 -19 > $@ quiet_cmd_zstd22 = ZSTD22 $@ - cmd_zstd22 = cat $(real-prereqs) | $(ZSTD) -22 --ultra > $@ + cmd_zstd22 = cat $(real-prereqs) | $(ZSTD) -T0 -22 --ultra > $@ quiet_cmd_zstd22_with_size = ZSTD22 $@ - cmd_zstd22_with_size = { cat $(real-prereqs) | $(ZSTD) -22 --ultra; $(size_append); } > $@ + cmd_zstd22_with_size = { cat $(real-prereqs) | $(ZSTD) -T0 -$(zstd_comp_val); $(size_append); } > $@ # ASM offsets # --------------------------------------------------------------------------- diff --git a/scripts/Makefile.modinst b/scripts/Makefile.modinst index a4c987c23750f6..132863cf3183ce 100644 --- a/scripts/Makefile.modinst +++ b/scripts/Makefile.modinst @@ -96,8 +96,13 @@ quiet_cmd_gzip = GZIP $@ cmd_gzip = $(KGZIP) -n -f $< quiet_cmd_xz = XZ $@ cmd_xz = $(XZ) --lzma2=dict=2MiB -f $< +ifdef CONFIG_MODULE_COMPRESS_ZSTD_ULTRA quiet_cmd_zstd = ZSTD $@ - cmd_zstd = $(ZSTD) -T0 --rm -f -q $< + cmd_zstd = $(ZSTD) -$(CONFIG_MODULE_COMPRESS_ZSTD_LEVEL_ULTRA) --ultra --zstd=wlog=21 -T0 --rm -f -q $< +else +quiet_cmd_zstd = ZSTD $@ + cmd_zstd = $(ZSTD) -$(CONFIG_MODULE_COMPRESS_ZSTD_LEVEL) --zstd=wlog=21 -T0 --rm -f -q $< +endif $(dst)/%.ko.gz: $(dst)/%.ko FORCE $(call cmd,gzip) From 2cbb991fc6ec17f702b551a259a9f9aca86f4306 Mon Sep 17 00:00:00 2001 From: Piotr Gorski Date: Tue, 6 Sep 2022 11:06:39 +0200 Subject: [PATCH 17/37] cachyos-6.1: mm/swap: Disable swap-in readahead Signed-off-by: Piotr Gorski --- mm/swap.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/swap.c b/mm/swap.c index 955930f41d20c6..6e0976bf6b7db0 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1113,6 +1113,10 @@ EXPORT_SYMBOL(pagevec_lookup_range_tag); */ void __init swap_setup(void) { +#ifdef CONFIG_CACHY + /* Only swap-in pages requested, avoid readahead */ + page_cluster = 0; +#else unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT); /* Use a smaller cluster for small-memory machines */ @@ -1124,4 +1128,5 @@ void __init swap_setup(void) * Right now other parts of the system means that we * _really_ don't want to cluster much more */ +#endif } From d0f753267031f6b5192956255e5f12097bf1309b Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 17 Oct 2022 01:59:54 +0200 Subject: [PATCH 18/37] bfq-6.1: set CachyOS branding Signed-off-by: Piotr Gorski Signed-off-by: Peter Jung --- block/bfq-iosched.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 7ea427817f7f5f..2d739a163ba044 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -7461,6 +7461,7 @@ MODULE_ALIAS("bfq-iosched"); static int __init bfq_init(void) { int ret; + char msg[60] = "BFQ I/O-scheduler: BFQ-CachyOS v6.0"; #ifdef CONFIG_BFQ_GROUP_IOSCHED ret = blkcg_policy_register(&blkcg_policy_bfq); @@ -7492,6 +7493,11 @@ static int __init bfq_init(void) if (ret) goto slab_kill; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + strcat(msg, " (with cgroups support)"); +#endif + pr_info("%s", msg); + return 0; slab_kill: From 8ec3d6344be13ba69703ee93a060270a4418f035 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Wed, 5 Oct 2022 19:55:34 +0200 Subject: [PATCH 19/37] x86/build: Add more x86_64 optimizations Signed-off-by: Peter Jung --- arch/x86/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/arch/x86/Makefile b/arch/x86/Makefile
index 3f5fe79f03abff..2d799a69918ecf 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -67,7 +67,8 @@ export BITS
 #
 #    https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383
 #
-KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx
+KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -mno-avx2 \
+		 -mno-avx512f -O3
 KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2
 
 ifeq ($(CONFIG_X86_KERNEL_IBT),y)

From 4bb62809f0addb77a73258ada73964ee4fb39ab3 Mon Sep 17 00:00:00 2001
From: Peter Jung
Date: Fri, 16 Dec 2022 14:21:35 +0100
Subject: [PATCH 20/37] Revert "bfq-6.1: set CachyOS branding"

This reverts commit d0f753267031f6b5192956255e5f12097bf1309b.
---
 block/bfq-iosched.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 2d739a163ba044..7ea427817f7f5f 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -7461,7 +7461,6 @@ MODULE_ALIAS("bfq-iosched");
 static int __init bfq_init(void)
 {
 	int ret;
-	char msg[60] = "BFQ I/O-scheduler: BFQ-CachyOS v6.0";
 
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
 	ret = blkcg_policy_register(&blkcg_policy_bfq);
@@ -7493,11 +7492,6 @@ static int __init bfq_init(void)
 	if (ret)
 		goto slab_kill;
 
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	strcat(msg, " (with cgroups support)");
-#endif
-	pr_info("%s", msg);
-
 	return 0;
 
 slab_kill:

From a5f9643d2892f496183284d74c8ea50d81d87686 Mon Sep 17 00:00:00 2001
From: Piotr Gorski
Date: Wed, 21 Dec 2022 10:21:17 +0100
Subject: [PATCH 21/37] Fix some cfs/cachy tweaks

Signed-off-by: Piotr Gorski
---
 drivers/md/dm-crypt.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index cdf9d8c7b556af..973fe8f80051b3 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -3137,11 +3137,6 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar
 		}
 	}
 
-#ifdef CONFIG_CACHY
-	set_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags);
-	set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags);
-#endif
-
 	return 0;
 }
 
@@ -3212,6 +3207,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad;
 	}
 
+#ifdef CONFIG_CACHY
+	set_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags);
+	set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags);
+#endif
+
 	ret = crypt_ctr_cipher(ti, argv[0], argv[1]);
 	if (ret < 0)
 		goto bad;

From 9b06c8a6da0355aaf62f654e7435ccdb9cd7c546 Mon Sep 17 00:00:00 2001
From: Peter Jung
Date: Thu, 29 Dec 2022 12:11:30 +0100
Subject: [PATCH 22/37] Cachy: Initialize ata before graphics

ATA init is the long pole in the boot process, and it's asynchronous;
move the graphics init after it so that ata and graphics initialize in parallel Signed-off-by: Peter Jung --- drivers/Makefile | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/Makefile b/drivers/Makefile index bdf1c66141c9bd..1e1a0832fb48a1 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -59,15 +59,8 @@ obj-y += char/ # iommu/ comes before gpu as gpu are using iommu controllers obj-y += iommu/ -# gpu/ comes after char for AGP vs DRM startup and after iommu -obj-y += gpu/ - obj-$(CONFIG_CONNECTOR) += connector/ -# i810fb and intelfb depend on char/agp/ -obj-$(CONFIG_FB_I810) += video/fbdev/i810/ -obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ - obj-$(CONFIG_PARPORT) += parport/ obj-y += base/ block/ misc/ mfd/ nfc/ obj-$(CONFIG_LIBNVDIMM) += nvdimm/ @@ -79,6 +72,14 @@ obj-y += macintosh/ obj-y += scsi/ obj-y += nvme/ obj-$(CONFIG_ATA) += ata/ + +# gpu/ comes after char for AGP vs DRM startup and after iommu +obj-y += gpu/ + +# i810fb and intelfb depend on char/agp/ +obj-$(CONFIG_FB_I810) += video/fbdev/i810/ +obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ + obj-$(CONFIG_TARGET_CORE) += target/ obj-$(CONFIG_MTD) += mtd/ obj-$(CONFIG_SPI) += spi/ From f95c4f023a0d1e0c77a4b3d965f528d03437c178 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Thu, 29 Dec 2022 12:12:32 +0100 Subject: [PATCH 23/37] Cachy: mm: Disable unevictable compaction Signed-off-by: Peter Jung --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/Kconfig b/mm/Kconfig index 57e1d8c5b50528..d66ec503a7ca84 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -576,7 +576,7 @@ config COMPACTION config COMPACT_UNEVICTABLE_DEFAULT int depends on COMPACTION - default 0 if PREEMPT_RT + default 0 if PREEMPT_RT || CACHY default 1 # From e9e8a1c6c2fcdfd13ccacecbb091cf58b2049eca Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Thu, 29 Dec 2022 12:13:44 +0100 Subject: [PATCH 24/37] Cachy: Tune mgLRU to protect cache used in the last second Although not identical to the le9 patches that protect a byte-amount of cache through tunables, multigenerational LRU now supports protecting cache accessed in the last X milliseconds. Signed-off-by: Peter Jung --- mm/vmscan.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/vmscan.c b/mm/vmscan.c index c058a45dd43887..77c2bb3bccf8b9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4526,7 +4526,11 @@ static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned } /* to protect the working set of the last N jiffies */ +#ifdef CONFIG_CACHY +static unsigned long lru_gen_min_ttl __read_mostly = HZ; +#else static unsigned long lru_gen_min_ttl __read_mostly; +#endif static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) { From 5663ef07f96779fa48f1a2b07b101e04c8ab46d5 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Sat, 31 Dec 2022 14:28:49 +0100 Subject: [PATCH 25/37] Revert "THP Shrinker" This reverts commit eef4465102fd7b86e21eb3101d7355a214f588d0. 
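
For context on what goes away: the reverted series walked physical memory, checked how many 4K subpages of each anonymous THP were zero-filled, bucketed THPs by that utilization, and let a shrinker split the mostly-empty ones. The core check amounts to the following, a simplified paraphrase of thp_number_utilized_pages() and thp_utilization_bucket() from the deleted mm/thp_utilization.c quoted below, not their exact code:

/* Count the subpages of an anon THP that hold any non-zero byte. */
static int thp_utilized_subpages(struct folio *folio)
{
	long nr = folio_nr_pages(folio);
	int utilized = nr;
	void *kaddr;
	long i;

	for (i = 0; i < nr; i++) {
		kaddr = kmap_local_folio(folio, i * PAGE_SIZE);
		if (!memchr_inv(kaddr, 0, PAGE_SIZE))
			utilized--;	/* subpage is entirely zero-filled */
		kunmap_local(kaddr);
	}
	return utilized;
}

/* Map the count into one of THP_UTIL_BUCKET_NR buckets; only THPs in
 * the top bucket counted as fully used and were exempt from splitting. */
static int thp_utilization_bucket(int utilized)
{
	return min(utilized * THP_UTIL_BUCKET_NR / HPAGE_PMD_NR,
		   THP_UTIL_BUCKET_NR - 1);
}
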
--- Documentation/admin-guide/mm/transhuge.rst | 9 - include/linux/huge_mm.h | 9 - include/linux/list_lru.h | 24 -- include/linux/mm_types.h | 5 - include/linux/rmap.h | 2 +- include/linux/vm_event_item.h | 3 - mm/Makefile | 2 +- mm/huge_memory.c | 156 +----------- mm/list_lru.c | 49 ---- mm/migrate.c | 73 +----- mm/migrate_device.c | 4 +- mm/page_alloc.c | 6 - mm/thp_utilization.c | 222 ------------------ mm/vmstat.c | 3 - .../selftests/vm/split_huge_page_test.c | 115 +-------- tools/testing/selftests/vm/vm_util.c | 23 -- tools/testing/selftests/vm/vm_util.h | 3 - 17 files changed, 18 insertions(+), 690 deletions(-) delete mode 100644 mm/thp_utilization.c diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 21d86303c97ef7..8ee78ec232ebcf 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -304,15 +304,6 @@ To identify what applications are mapping file transparent huge pages, it is necessary to read ``/proc/PID/smaps`` and count the FileHugeMapped fields for each mapping. -The utilization of transparent hugepages can be viewed by reading -``/sys/kernel/debug/thp_utilization``. The utilization of a THP is defined -as the ratio of non zero filled 4kb pages to the total number of pages in a -THP. The buckets are labelled by the range of total utilized 4kb pages with -one line per utilization bucket. Each line contains the total number of -THPs in that bucket and the total number of zero filled 4kb pages summed -over all THPs in that bucket. The last two lines show the timestamp and -duration respectively of the most recent scan over all of physical memory. - Note that reading the smaps file is expensive and reading it frequently will incur overhead. diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 1745c94eb1039b..a1341fdcf666d0 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -178,8 +178,6 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); -bool can_shrink_thp(struct folio *folio); - void prep_transhuge_page(struct page *page); void free_transhuge_page(struct page *page); @@ -191,8 +189,6 @@ static inline int split_huge_page(struct page *page) } void deferred_split_huge_page(struct page *page); -void add_underutilized_thp(struct page *page); - void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze, struct folio *folio); @@ -306,11 +302,6 @@ static inline struct list_head *page_deferred_list(struct page *page) return &page[2].deferred_list; } -static inline struct list_head *page_underutilized_thp_list(struct page *page) -{ - return &page[3].underutilized_thp_list; -} - #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index c2cf146ea88002..b35968ee9fb508 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -89,18 +89,6 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren */ bool list_lru_add(struct list_lru *lru, struct list_head *item); -/** - * list_lru_add_page: add an element to the lru list's tail - * @list_lru: the lru pointer - * @page: the page containing the item - * @item: the item to be deleted. 
- * - * This function works the same as list_lru_add in terms of list - * manipulation. Used for non slab objects contained in the page. - * - * Return value: true if the list was updated, false otherwise - */ -bool list_lru_add_page(struct list_lru *lru, struct page *page, struct list_head *item); /** * list_lru_del: delete an element to the lru list * @list_lru: the lru pointer @@ -114,18 +102,6 @@ bool list_lru_add_page(struct list_lru *lru, struct page *page, struct list_head */ bool list_lru_del(struct list_lru *lru, struct list_head *item); -/** - * list_lru_del_page: delete an element to the lru list - * @list_lru: the lru pointer - * @page: the page containing the item - * @item: the item to be deleted. - * - * This function works the same as list_lru_del in terms of list - * manipulation. Used for non slab objects contained in the page. - * - * Return value: true if the list was updated, false otherwise - */ -bool list_lru_del_page(struct list_lru *lru, struct page *page, struct list_head *item); /** * list_lru_count_one: return the number of objects currently held by @lru * @lru: the lru pointer. diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index da1d1cf4215871..500e536796ca4a 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -152,11 +152,6 @@ struct page { /* For both global and memcg */ struct list_head deferred_list; }; - struct { /* Third tail page of compound page */ - unsigned long _compound_pad_3; /* compound_head */ - unsigned long _compound_pad_4; - struct list_head underutilized_thp_list; - }; struct { /* Page table pages */ unsigned long _pt_pad_1; /* compound_head */ pgtable_t pmd_huge_pte; /* protected by page->ptl */ diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 3f83bbcf133367..bd3504d11b1559 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -428,7 +428,7 @@ int folio_mkclean(struct folio *); int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, struct vm_area_struct *vma); -void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked, bool unmap_clean); +void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked); int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 3618b10ddec9ce..3518dba1e02f4b 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -111,9 +111,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD THP_SPLIT_PUD, #endif - THP_SPLIT_FREE, - THP_SPLIT_UNMAP, - THP_SPLIT_REMAP_READONLY_ZERO_PAGE, THP_ZERO_PAGE_ALLOC, THP_ZERO_PAGE_ALLOC_FAILED, THP_SWPOUT, diff --git a/mm/Makefile b/mm/Makefile index 5f76dc6ce044cf..8e105e5b3e2938 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -95,7 +95,7 @@ obj-$(CONFIG_MEMTEST) += memtest.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_NUMA) += memory-tiers.o obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o -obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o thp_utilization.o +obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o obj-$(CONFIG_PAGE_COUNTER) += page_counter.o obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o ifdef CONFIG_SWAP diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9f2f15d8e02ef0..811d19b5c4f606 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -71,8 +71,6 @@ static atomic_t huge_zero_refcount; struct page *huge_zero_page __read_mostly; 
unsigned long huge_zero_pfn __read_mostly = ~0UL; -static struct list_lru huge_low_util_page_lru; - bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, bool smaps, bool in_pf, bool enforce_sysfs) { @@ -236,53 +234,6 @@ static struct shrinker huge_zero_page_shrinker = { .seeks = DEFAULT_SEEKS, }; -static enum lru_status low_util_free_page(struct list_head *item, - struct list_lru_one *lru, - spinlock_t *lru_lock, - void *cb_arg) -{ - struct folio *folio = page_folio(list_entry(item, struct page, underutilized_thp_list)); - struct page *head = &folio->page; - - if (get_page_unless_zero(head)) { - /* Inverse lock order from add_underutilized_thp() */ - if (!trylock_page(head)) { - put_page(head); - return LRU_SKIP; - } - list_lru_isolate(lru, item); - spin_unlock_irq(lru_lock); - if (can_shrink_thp(folio)) - split_huge_page(head); - spin_lock_irq(lru_lock); - unlock_page(head); - put_page(head); - } - - return LRU_REMOVED_RETRY; -} - -static unsigned long shrink_huge_low_util_page_count(struct shrinker *shrink, - struct shrink_control *sc) -{ - return HPAGE_PMD_NR * list_lru_shrink_count(&huge_low_util_page_lru, sc); -} - -static unsigned long shrink_huge_low_util_page_scan(struct shrinker *shrink, - struct shrink_control *sc) -{ - return HPAGE_PMD_NR * list_lru_shrink_walk_irq(&huge_low_util_page_lru, - sc, low_util_free_page, NULL); -} - -static struct shrinker huge_low_util_page_shrinker = { - .count_objects = shrink_huge_low_util_page_count, - .scan_objects = shrink_huge_low_util_page_scan, - .seeks = DEFAULT_SEEKS, - .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE | - SHRINKER_NONSLAB, -}; - #ifdef CONFIG_SYSFS static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -534,9 +485,6 @@ static int __init hugepage_init(void) if (err) goto err_slab; - err = register_shrinker(&huge_low_util_page_shrinker, "thp-low-util"); - if (err) - goto err_low_util_shrinker; err = register_shrinker(&huge_zero_page_shrinker, "thp-zero"); if (err) goto err_hzp_shrinker; @@ -544,9 +492,6 @@ static int __init hugepage_init(void) if (err) goto err_split_shrinker; - err = list_lru_init_memcg(&huge_low_util_page_lru, &huge_low_util_page_shrinker); - if (err) - goto err_low_util_list_lru; /* * By default disable transparent hugepages on smaller systems, * where the extra memory used could hurt more than TLB overhead @@ -563,14 +508,10 @@ static int __init hugepage_init(void) return 0; err_khugepaged: - list_lru_destroy(&huge_low_util_page_lru); -err_low_util_list_lru: unregister_shrinker(&deferred_split_shrinker); err_split_shrinker: unregister_shrinker(&huge_zero_page_shrinker); err_hzp_shrinker: - unregister_shrinker(&huge_low_util_page_shrinker); -err_low_util_shrinker: khugepaged_destroy(); err_slab: hugepage_exit_sysfs(hugepage_kobj); @@ -645,7 +586,6 @@ void prep_transhuge_page(struct page *page) */ INIT_LIST_HEAD(page_deferred_list(page)); - INIT_LIST_HEAD(page_underutilized_thp_list(page)); set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); } @@ -2436,7 +2376,7 @@ static void unmap_folio(struct folio *folio) try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK); } -static void remap_page(struct folio *folio, unsigned long nr, bool unmap_clean) +static void remap_page(struct folio *folio, unsigned long nr) { int i = 0; @@ -2444,7 +2384,7 @@ static void remap_page(struct folio *folio, unsigned long nr, bool unmap_clean) if (!folio_test_anon(folio)) return; for (;;) { - remove_migration_ptes(folio, folio, true, unmap_clean); + 
remove_migration_ptes(folio, folio, true); i += folio_nr_pages(folio); if (i >= nr) break; @@ -2514,7 +2454,8 @@ static void __split_huge_page_tail(struct page *head, int tail, LRU_GEN_MASK | LRU_REFS_MASK)); /* ->mapping in first tail page is compound_mapcount */ - VM_BUG_ON_PAGE(tail > 3 && page_tail->mapping != TAIL_MAPPING, page_tail); + VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, + page_tail); page_tail->mapping = head->mapping; page_tail->index = head->index + tail; @@ -2567,8 +2508,6 @@ static void __split_huge_page(struct page *page, struct list_head *list, struct address_space *swap_cache = NULL; unsigned long offset = 0; unsigned int nr = thp_nr_pages(head); - LIST_HEAD(pages_to_free); - int nr_pages_to_free = 0; int i; /* complete memcg works before add pages to LRU */ @@ -2631,7 +2570,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, } local_irq_enable(); - remap_page(folio, nr, PageAnon(head)); + remap_page(folio, nr); if (PageSwapCache(head)) { swp_entry_t entry = { .val = page_private(head) }; @@ -2645,34 +2584,6 @@ static void __split_huge_page(struct page *page, struct list_head *list, continue; unlock_page(subpage); - /* - * If a tail page has only two references left, one inherited - * from the isolation of its head and the other from - * lru_add_page_tail() which we are about to drop, it means this - * tail page was concurrently zapped. Then we can safely free it - * and save page reclaim or migration the trouble of trying it. - */ - if (list && page_ref_freeze(subpage, 2)) { - VM_BUG_ON_PAGE(PageLRU(subpage), subpage); - VM_BUG_ON_PAGE(PageCompound(subpage), subpage); - VM_BUG_ON_PAGE(page_mapped(subpage), subpage); - - ClearPageActive(subpage); - ClearPageUnevictable(subpage); - list_move(&subpage->lru, &pages_to_free); - nr_pages_to_free++; - continue; - } - - /* - * If a tail page has only one reference left, it will be freed - * by the call to free_page_and_swap_cache below. Since zero - * subpages are no longer remapped, there will only be one - * reference left in cases outside of reclaim or migration. 
- */ - if (page_ref_count(subpage) == 1) - nr_pages_to_free++; - /* * Subpages may be freed if there wasn't any mapping * like if add_to_swap() is running on a lru page that @@ -2682,13 +2593,6 @@ static void __split_huge_page(struct page *page, struct list_head *list, */ free_page_and_swap_cache(subpage); } - - if (!nr_pages_to_free) - return; - - mem_cgroup_uncharge_list(&pages_to_free); - free_unref_page_list(&pages_to_free); - count_vm_events(THP_SPLIT_FREE, nr_pages_to_free); } /* Racy check whether the huge page can be split */ @@ -2731,7 +2635,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) struct folio *folio = page_folio(page); struct deferred_split *ds_queue = get_deferred_split_queue(&folio->page); XA_STATE(xas, &folio->mapping->i_pages, folio->index); - struct list_head *underutilized_thp_list = page_underutilized_thp_list(&folio->page); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; int extra_pins, ret; @@ -2839,10 +2742,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) list_del(page_deferred_list(&folio->page)); } spin_unlock(&ds_queue->split_queue_lock); - /* Frozen refs lock out additions, test can be lockless */ - if (!list_empty(underutilized_thp_list)) - list_lru_del_page(&huge_low_util_page_lru, &folio->page, - underutilized_thp_list); if (mapping) { int nr = folio_nr_pages(folio); @@ -2865,7 +2764,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) if (mapping) xas_unlock(&xas); local_irq_enable(); - remap_page(folio, folio_nr_pages(folio), false); + remap_page(folio, folio_nr_pages(folio)); ret = -EBUSY; } @@ -2885,7 +2784,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) void free_transhuge_page(struct page *page) { struct deferred_split *ds_queue = get_deferred_split_queue(page); - struct list_head *underutilized_thp_list = page_underutilized_thp_list(page); unsigned long flags; spin_lock_irqsave(&ds_queue->split_queue_lock, flags); @@ -2894,13 +2792,6 @@ void free_transhuge_page(struct page *page) list_del(page_deferred_list(page)); } spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); - /* A dead page cannot be re-added to the THP shrinker, test can be lockless */ - if (!list_empty(underutilized_thp_list)) - list_lru_del_page(&huge_low_util_page_lru, page, underutilized_thp_list); - - if (PageLRU(page)) - __folio_clear_lru_flags(page_folio(page)); - free_compound_page(page); } @@ -2941,41 +2832,6 @@ void deferred_split_huge_page(struct page *page) spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); } -void add_underutilized_thp(struct page *page) -{ - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - VM_BUG_ON_PAGE(!PageAnon(page), page); - - /* hugetlbfs pages do not have an associated memcgroup */ - if (PageHuge(page)) - return; - - /* - * Need to take a reference on the page to prevent the page from getting free'd from - * under us while we are adding the THP to the shrinker. 
- */ - if (!get_page_unless_zero(page)) - return; - - if (is_huge_zero_page(page)) - goto out_put; - - /* Stabilize page->memcg to allocate and add to the same list */ - lock_page(page); - -#ifdef CONFIG_MEMCG_KMEM - if (memcg_list_lru_alloc(page_memcg(page), &huge_low_util_page_lru, GFP_KERNEL)) - goto out_unlock; -#endif - - list_lru_add_page(&huge_low_util_page_lru, page, page_underutilized_thp_list(page)); - -out_unlock: - unlock_page(page); -out_put: - put_page(page); -} - static unsigned long deferred_split_count(struct shrinker *shrink, struct shrink_control *sc) { diff --git a/mm/list_lru.c b/mm/list_lru.c index 8cc56a84b55442..a05e5bef3b4007 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -140,32 +140,6 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item) } EXPORT_SYMBOL_GPL(list_lru_add); -bool list_lru_add_page(struct list_lru *lru, struct page *page, struct list_head *item) -{ - int nid = page_to_nid(page); - struct list_lru_node *nlru = &lru->node[nid]; - struct list_lru_one *l; - struct mem_cgroup *memcg; - unsigned long flags; - - spin_lock_irqsave(&nlru->lock, flags); - if (list_empty(item)) { - memcg = page_memcg(page); - l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); - list_add_tail(item, &l->list); - /* Set shrinker bit if the first element was added */ - if (!l->nr_items++) - set_shrinker_bit(memcg, nid, - lru_shrinker_id(lru)); - nlru->nr_items++; - spin_unlock_irqrestore(&nlru->lock, flags); - return true; - } - spin_unlock_irqrestore(&nlru->lock, flags); - return false; -} -EXPORT_SYMBOL_GPL(list_lru_add_page); - bool list_lru_del(struct list_lru *lru, struct list_head *item) { int nid = page_to_nid(virt_to_page(item)); @@ -186,29 +160,6 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) } EXPORT_SYMBOL_GPL(list_lru_del); -bool list_lru_del_page(struct list_lru *lru, struct page *page, struct list_head *item) -{ - int nid = page_to_nid(page); - struct list_lru_node *nlru = &lru->node[nid]; - struct list_lru_one *l; - struct mem_cgroup *memcg; - unsigned long flags; - - spin_lock_irqsave(&nlru->lock, flags); - if (!list_empty(item)) { - memcg = page_memcg(page); - l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); - list_del_init(item); - l->nr_items--; - nlru->nr_items--; - spin_unlock_irqrestore(&nlru->lock, flags); - return true; - } - spin_unlock_irqrestore(&nlru->lock, flags); - return false; -} -EXPORT_SYMBOL_GPL(list_lru_del_page); - void list_lru_isolate(struct list_lru_one *list, struct list_head *item) { list_del_init(item); diff --git a/mm/migrate.c b/mm/migrate.c index 2764b14d338374..dff333593a8ae2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include @@ -169,62 +168,13 @@ void putback_movable_pages(struct list_head *l) } } -static bool try_to_unmap_clean(struct page_vma_mapped_walk *pvmw, struct page *page) -{ - void *addr; - bool dirty; - pte_t newpte; - - VM_BUG_ON_PAGE(PageCompound(page), page); - VM_BUG_ON_PAGE(!PageAnon(page), page); - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page); - - if (PageMlocked(page) || (pvmw->vma->vm_flags & VM_LOCKED)) - return false; - - /* - * The pmd entry mapping the old thp was flushed and the pte mapping - * this subpage has been non present. Therefore, this subpage is - * inaccessible. We don't need to remap it if it contains only zeros. 
- */ - addr = kmap_local_page(page); - dirty = memchr_inv(addr, 0, PAGE_SIZE); - kunmap_local(addr); - - if (dirty) - return false; - - pte_clear_not_present_full(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, false); - - if (userfaultfd_armed(pvmw->vma)) { - newpte = pte_mkspecial(pfn_pte(page_to_pfn(ZERO_PAGE(pvmw->address)), - pvmw->vma->vm_page_prot)); - ptep_clear_flush(pvmw->vma, pvmw->address, pvmw->pte); - set_pte_at(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, newpte); - dec_mm_counter(pvmw->vma->vm_mm, MM_ANONPAGES); - count_vm_event(THP_SPLIT_REMAP_READONLY_ZERO_PAGE); - return true; - } - - dec_mm_counter(pvmw->vma->vm_mm, mm_counter(page)); - count_vm_event(THP_SPLIT_UNMAP); - return true; -} - -struct rmap_walk_arg { - struct folio *folio; - bool unmap_clean; -}; - /* * Restore a potential migration pte to a working pte entry */ static bool remove_migration_pte(struct folio *folio, - struct vm_area_struct *vma, unsigned long addr, void *arg) + struct vm_area_struct *vma, unsigned long addr, void *old) { - struct rmap_walk_arg *rmap_walk_arg = arg; - DEFINE_FOLIO_VMA_WALK(pvmw, rmap_walk_arg->folio, vma, addr, PVMW_SYNC | PVMW_MIGRATION); + DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION); while (page_vma_mapped_walk(&pvmw)) { rmap_t rmap_flags = RMAP_NONE; @@ -247,8 +197,6 @@ static bool remove_migration_pte(struct folio *folio, continue; } #endif - if (rmap_walk_arg->unmap_clean && try_to_unmap_clean(&pvmw, new)) - continue; folio_get(folio); pte = mk_pte(new, READ_ONCE(vma->vm_page_prot)); @@ -324,20 +272,13 @@ static bool remove_migration_pte(struct folio *folio, * Get rid of all migration entries and replace them by * references to the indicated page. */ -void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked, bool unmap_clean) +void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked) { - struct rmap_walk_arg rmap_walk_arg = { - .folio = src, - .unmap_clean = unmap_clean, - }; - struct rmap_walk_control rwc = { .rmap_one = remove_migration_pte, - .arg = &rmap_walk_arg, + .arg = src, }; - VM_BUG_ON_FOLIO(unmap_clean && src != dst, src); - if (locked) rmap_walk_locked(dst, &rwc); else @@ -931,7 +872,7 @@ static int writeout(struct address_space *mapping, struct folio *folio) * At this point we know that the migration attempt cannot * be successful. */ - remove_migration_ptes(folio, folio, false, false); + remove_migration_ptes(folio, folio, false); rc = mapping->a_ops->writepage(&folio->page, &wbc); @@ -1187,7 +1128,7 @@ static int __unmap_and_move(struct folio *src, struct folio *dst, if (page_was_mapped) remove_migration_ptes(src, - rc == MIGRATEPAGE_SUCCESS ? dst : src, false, false); + rc == MIGRATEPAGE_SUCCESS ? dst : src, false); out_unlock_both: folio_unlock(dst); @@ -1397,7 +1338,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (page_was_mapped) remove_migration_ptes(src, - rc == MIGRATEPAGE_SUCCESS ? dst : src, false, false); + rc == MIGRATEPAGE_SUCCESS ? 
dst : src, false); unlock_put_anon: folio_unlock(dst); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 59e7d571d91f07..721b2365dbca96 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -425,7 +425,7 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns, continue; folio = page_folio(page); - remove_migration_ptes(folio, folio, false, false); + remove_migration_ptes(folio, folio, false); src_pfns[i] = 0; folio_unlock(folio); @@ -851,7 +851,7 @@ void migrate_device_finalize(unsigned long *src_pfns, src = page_folio(page); dst = page_folio(newpage); - remove_migration_ptes(src, dst, false, false); + remove_migration_ptes(src, dst, false); folio_unlock(src); if (is_zone_device_page(page)) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 70e9bfd76e040a..6e60657875d328 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1336,12 +1336,6 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) * deferred_list.next -- ignore value. */ break; - case 3: - /* - * the third tail page: ->mapping is - * underutilized_thp_list.next -- ignore value. - */ - break; default: if (page->mapping != TAIL_MAPPING) { bad_page(page, "corrupted mapping in tail page"); diff --git a/mm/thp_utilization.c b/mm/thp_utilization.c deleted file mode 100644 index 0cb18f122c5763..00000000000000 --- a/mm/thp_utilization.c +++ /dev/null @@ -1,222 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2022 Meta, Inc. - * Authors: Alexander Zhu, Johannes Weiner, Rik van Riel - */ - -#include -#include -#include -/* - * The number of utilization buckets THPs will be grouped in - * under /sys/kernel/debug/thp_utilization. - */ -#define THP_UTIL_BUCKET_NR 10 -/* - * The number of hugepages to scan through on each periodic - * run of the scanner that generates /sys/kernel/debug/thp_utilization. - */ -#define THP_UTIL_SCAN_SIZE 256 - -static void thp_utilization_workfn(struct work_struct *work); -static DECLARE_DELAYED_WORK(thp_utilization_work, thp_utilization_workfn); - -struct thp_scan_info_bucket { - int nr_thps; - int nr_zero_pages; -}; - -struct thp_scan_info { - struct thp_scan_info_bucket buckets[THP_UTIL_BUCKET_NR]; - struct zone *scan_zone; - struct timespec64 last_scan_duration; - struct timespec64 last_scan_time; - unsigned long pfn; -}; - -/* - * thp_scan_debugfs is referred to when /sys/kernel/debug/thp_utilization - * is opened. thp_scan is used to keep track fo the current scan through - * physical memory. - */ -static struct thp_scan_info thp_scan_debugfs; -static struct thp_scan_info thp_scan; - -#ifdef CONFIG_DEBUG_FS -static int thp_utilization_show(struct seq_file *seqf, void *pos) -{ - int i; - int start; - int end; - - for (i = 0; i < THP_UTIL_BUCKET_NR; i++) { - start = i * HPAGE_PMD_NR / THP_UTIL_BUCKET_NR; - end = (i + 1 == THP_UTIL_BUCKET_NR) - ? 
HPAGE_PMD_NR - : ((i + 1) * HPAGE_PMD_NR / THP_UTIL_BUCKET_NR - 1); - /* The last bucket will need to contain 100 */ - seq_printf(seqf, "Utilized[%d-%d]: %d %d\n", start, end, - thp_scan_debugfs.buckets[i].nr_thps, - thp_scan_debugfs.buckets[i].nr_zero_pages); - } - - seq_printf(seqf, "Last Scan Time: %lu.%02lus\n", - (unsigned long)thp_scan_debugfs.last_scan_time.tv_sec, - (thp_scan_debugfs.last_scan_time.tv_nsec / (NSEC_PER_SEC / 100))); - - seq_printf(seqf, "Last Scan Duration: %lu.%02lus\n", - (unsigned long)thp_scan_debugfs.last_scan_duration.tv_sec, - (thp_scan_debugfs.last_scan_duration.tv_nsec / (NSEC_PER_SEC / 100))); - - return 0; -} -DEFINE_SHOW_ATTRIBUTE(thp_utilization); - -static int __init thp_utilization_debugfs(void) -{ - debugfs_create_file("thp_utilization", 0200, NULL, NULL, - &thp_utilization_fops); - return 0; -} -late_initcall(thp_utilization_debugfs); -#endif - -static int thp_utilization_bucket(int num_utilized_pages) -{ - int bucket; - - if (num_utilized_pages < 0 || num_utilized_pages > HPAGE_PMD_NR) - return -1; - - /* Group THPs into utilization buckets */ - bucket = num_utilized_pages * THP_UTIL_BUCKET_NR / HPAGE_PMD_NR; - return min(bucket, THP_UTIL_BUCKET_NR - 1); -} - -static int thp_number_utilized_pages(struct folio *folio) -{ - int thp_nr_utilized_pages = HPAGE_PMD_NR; - void *kaddr; - int i; - bool zero_page; - - if (!folio || !folio_test_anon(folio) || !folio_test_transhuge(folio)) - return -1; - - for (i = 0; i < folio_nr_pages(folio); i++) { - kaddr = kmap_local_folio(folio, i); - zero_page = !memchr_inv(kaddr, 0, PAGE_SIZE); - - if (zero_page) - thp_nr_utilized_pages--; - - kunmap_local(kaddr); - } - - return thp_nr_utilized_pages; -} - -bool can_shrink_thp(struct folio *folio) -{ - int bucket, num_utilized_pages; - - if (!folio || !folio_test_anon(folio) || !folio_test_transhuge(folio)) - return false; - - num_utilized_pages = thp_number_utilized_pages(folio); - bucket = thp_utilization_bucket(num_utilized_pages); - - return bucket >= 0 && bucket < THP_UTIL_BUCKET_NR - 1; -} - -static void thp_scan_next_zone(void) -{ - struct timespec64 current_time; - bool update_debugfs; - /* - * THP utilization worker thread has reached the end - * of the memory zone. Proceed to the next zone. - */ - thp_scan.scan_zone = next_zone(thp_scan.scan_zone); - update_debugfs = !thp_scan.scan_zone; - thp_scan.scan_zone = update_debugfs ? (first_online_pgdat())->node_zones - : thp_scan.scan_zone; - thp_scan.pfn = (thp_scan.scan_zone->zone_start_pfn + HPAGE_PMD_NR - 1) - & ~(HPAGE_PMD_SIZE - 1); - if (!update_debugfs) - return; - - /* - * If the worker has scanned through all of physical memory then - * update information displayed in /sys/kernel/debug/thp_utilization - */ - ktime_get_ts64(¤t_time); - thp_scan_debugfs.last_scan_duration = timespec64_sub(current_time, - thp_scan_debugfs.last_scan_time); - thp_scan_debugfs.last_scan_time = current_time; - - memcpy(&thp_scan_debugfs.buckets, &thp_scan.buckets, sizeof(thp_scan.buckets)); - memset(&thp_scan.buckets, 0, sizeof(thp_scan.buckets)); -} - -static void thp_util_scan(unsigned long pfn_end) -{ - struct page *page = NULL; - int bucket, current_pfn, num_utilized_pages; - int i; - /* - * Scan through each memory zone in chunks of THP_UTIL_SCAN_SIZE - * PFNs every second looking for anonymous THPs. 
- */ - for (i = 0; i < THP_UTIL_SCAN_SIZE; i++) { - current_pfn = thp_scan.pfn; - thp_scan.pfn += HPAGE_PMD_NR; - if (current_pfn >= pfn_end) - return; - - page = pfn_to_online_page(current_pfn); - if (!page) - continue; - - num_utilized_pages = thp_number_utilized_pages(page_folio(page)); - bucket = thp_utilization_bucket(num_utilized_pages); - if (bucket < 0) - continue; - - if (bucket < THP_UTIL_BUCKET_NR - 1) - add_underutilized_thp(page); - - thp_scan.buckets[bucket].nr_thps++; - thp_scan.buckets[bucket].nr_zero_pages += (HPAGE_PMD_NR - num_utilized_pages); - } -} - -static void thp_utilization_workfn(struct work_struct *work) -{ - unsigned long pfn_end; - /* - * Worker function that scans through all of physical memory - * for anonymous THPs. - */ - if (!thp_scan.scan_zone) - thp_scan.scan_zone = (first_online_pgdat())->node_zones; - - pfn_end = zone_end_pfn(thp_scan.scan_zone); - /* If we have reached the end of the zone or end of physical memory - * move on to the next zone. Otherwise, scan the next PFNs in the - * current zone. - */ - if (!managed_zone(thp_scan.scan_zone) || thp_scan.pfn >= pfn_end) - thp_scan_next_zone(); - else - thp_util_scan(pfn_end); - - schedule_delayed_work(&thp_utilization_work, HZ); -} - -static int __init thp_scan_init(void) -{ - schedule_delayed_work(&thp_utilization_work, HZ); - return 0; -} -subsys_initcall(thp_scan_init); diff --git a/mm/vmstat.c b/mm/vmstat.c index 3d802eb6754d0b..b2371d745e007f 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1359,9 +1359,6 @@ const char * const vmstat_text[] = { #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD "thp_split_pud", #endif - "thp_split_free", - "thp_split_unmap", - "thp_split_remap_readonly_zero_page", "thp_zero_page_alloc", "thp_zero_page_alloc_failed", "thp_swpout", diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c index 42f0e79a4508de..76e1c36dd9e577 100644 --- a/tools/testing/selftests/vm/split_huge_page_test.c +++ b/tools/testing/selftests/vm/split_huge_page_test.c @@ -16,9 +16,6 @@ #include #include #include -#include /* Definition of SYS_* constants */ -#include -#include #include "vm_util.h" uint64_t pagesize; @@ -91,115 +88,6 @@ static void write_debugfs(const char *fmt, ...) } } -static char *allocate_zero_filled_hugepage(size_t len) -{ - char *result; - size_t i; - - result = memalign(pmd_pagesize, len); - if (!result) { - printf("Fail to allocate memory\n"); - exit(EXIT_FAILURE); - } - - madvise(result, len, MADV_HUGEPAGE); - - for (i = 0; i < len; i++) - result[i] = (char)0; - - return result; -} - -static void verify_rss_anon_split_huge_page_all_zeroes(char *one_page, int nr_hpages, size_t len) -{ - uint64_t rss_anon_before, rss_anon_after; - size_t i; - - if (!check_huge_anon(one_page, 4, pmd_pagesize)) { - printf("No THP is allocated\n"); - exit(EXIT_FAILURE); - } - - rss_anon_before = rss_anon(); - if (!rss_anon_before) { - printf("No RssAnon is allocated before split\n"); - exit(EXIT_FAILURE); - } - - /* split all THPs */ - write_debugfs(PID_FMT, getpid(), (uint64_t)one_page, - (uint64_t)one_page + len); - - for (i = 0; i < len; i++) - if (one_page[i] != (char)0) { - printf("%ld byte corrupted\n", i); - exit(EXIT_FAILURE); - } - - if (!check_huge_anon(one_page, 0, pmd_pagesize)) { - printf("Still AnonHugePages not split\n"); - exit(EXIT_FAILURE); - } - - rss_anon_after = rss_anon(); - if (rss_anon_after >= rss_anon_before) { - printf("Incorrect RssAnon value. 
Before: %ld After: %ld\n", - rss_anon_before, rss_anon_after); - exit(EXIT_FAILURE); - } -} - -void split_pmd_zero_pages(void) -{ - char *one_page; - int nr_hpages = 4; - size_t len = nr_hpages * pmd_pagesize; - - one_page = allocate_zero_filled_hugepage(len); - verify_rss_anon_split_huge_page_all_zeroes(one_page, nr_hpages, len); - printf("Split zero filled huge pages successful\n"); - free(one_page); -} - -void split_pmd_zero_pages_uffd(void) -{ - char *one_page; - int nr_hpages = 4; - size_t len = nr_hpages * pmd_pagesize; - long uffd; /* userfaultfd file descriptor */ - struct uffdio_api uffdio_api; - struct uffdio_register uffdio_register; - - /* Create and enable userfaultfd object. */ - - uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); - if (uffd == -1) { - perror("userfaultfd"); - exit(1); - } - - uffdio_api.api = UFFD_API; - uffdio_api.features = 0; - if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) { - perror("ioctl-UFFDIO_API"); - exit(1); - } - - one_page = allocate_zero_filled_hugepage(len); - - uffdio_register.range.start = (unsigned long)one_page; - uffdio_register.range.len = len; - uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) { - perror("ioctl-UFFDIO_REGISTER"); - exit(1); - } - - verify_rss_anon_split_huge_page_all_zeroes(one_page, nr_hpages, len); - printf("Split zero filled huge pages with uffd successful\n"); - free(one_page); -} - void split_pmd_thp(void) { char *one_page; @@ -233,6 +121,7 @@ void split_pmd_thp(void) exit(EXIT_FAILURE); } + if (check_huge_anon(one_page, 0, pmd_pagesize)) { printf("Still AnonHugePages not split\n"); exit(EXIT_FAILURE); @@ -412,8 +301,6 @@ int main(int argc, char **argv) pageshift = ffs(pagesize) - 1; pmd_pagesize = read_pmd_pagesize(); - split_pmd_zero_pages(); - split_pmd_zero_pages_uffd(); split_pmd_thp(); split_pte_mapped_thp(); split_file_backed_thp(); diff --git a/tools/testing/selftests/vm/vm_util.c b/tools/testing/selftests/vm/vm_util.c index 72f3edc64aaf9a..f11f8adda52186 100644 --- a/tools/testing/selftests/vm/vm_util.c +++ b/tools/testing/selftests/vm/vm_util.c @@ -6,7 +6,6 @@ #define PMD_SIZE_FILE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size" #define SMAP_FILE_PATH "/proc/self/smaps" -#define STATUS_FILE_PATH "/proc/self/status" #define MAX_LINE_LENGTH 500 uint64_t pagemap_get_entry(int fd, char *start) @@ -73,28 +72,6 @@ uint64_t read_pmd_pagesize(void) return strtoul(buf, NULL, 10); } -uint64_t rss_anon(void) -{ - uint64_t rss_anon = 0; - int ret; - FILE *fp; - char buffer[MAX_LINE_LENGTH]; - - fp = fopen(STATUS_FILE_PATH, "r"); - if (!fp) - ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, STATUS_FILE_PATH); - - if (!check_for_pattern(fp, "RssAnon:", buffer, sizeof(buffer))) - goto err_out; - - if (sscanf(buffer, "RssAnon:%10ld kB", &rss_anon) != 1) - ksft_exit_fail_msg("Reading status error\n"); - -err_out: - fclose(fp); - return rss_anon; -} - bool __check_huge(void *addr, char *pattern, int nr_hpages, uint64_t hpage_size) { diff --git a/tools/testing/selftests/vm/vm_util.h b/tools/testing/selftests/vm/vm_util.h index dd1885f6609716..5c35de454e08f3 100644 --- a/tools/testing/selftests/vm/vm_util.h +++ b/tools/testing/selftests/vm/vm_util.h @@ -1,15 +1,12 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include #include -#include -#include uint64_t pagemap_get_entry(int fd, char *start); bool pagemap_is_softdirty(int fd, char *start); void clear_softdirty(void); bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t 
len); uint64_t read_pmd_pagesize(void); -uint64_t rss_anon(void); bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size); bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size); bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size); From 64887dcc1add7695d59429ec9aa5052ff0036408 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Wed, 4 Jan 2023 12:20:53 +0100 Subject: [PATCH 26/37] Revert "lib-string.c-Optimize-memchr" This reverts commit 3d59f4a13c2ce240b2994ba28e876244dd94cb45. --- lib/string.c | 62 ++++++++++++++-------------------------------------- 1 file changed, 17 insertions(+), 45 deletions(-) diff --git a/lib/string.c b/lib/string.c index db397ee8b0ad0d..3371d26a0e390c 100644 --- a/lib/string.c +++ b/lib/string.c @@ -874,61 +874,24 @@ char *strnstr(const char *s1, const char *s2, size_t len) EXPORT_SYMBOL(strnstr); #endif -#if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64 - -#define MEMCHR_MASK_GEN(mask) (mask *= 0x0101010101010101ULL) - -#elif defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) - -#define MEMCHR_MASK_GEN(mask) \ - do { \ - mask *= 0x01010101; \ - mask |= mask << 32; \ - } while (0) - -#else - -#define MEMCHR_MASK_GEN(mask) \ - do { \ - mask |= mask << 8; \ - mask |= mask << 16; \ - mask |= mask << 32; \ - } while (0) - -#endif - #ifndef __HAVE_ARCH_MEMCHR /** * memchr - Find a character in an area of memory. - * @p: The memory area + * @s: The memory area * @c: The byte to search for - * @length: The size of the area. + * @n: The size of the area. * * returns the address of the first occurrence of @c, or %NULL * if @c is not found */ -void *memchr(const void *p, int c, unsigned long length) +void *memchr(const void *s, int c, size_t n) { - u64 mask, val; - const void *end = p + length; - - c &= 0xff; - if (p <= end - 8) { - mask = c; - MEMCHR_MASK_GEN(mask); - - for (; p <= end - 8; p += 8) { - val = *(u64 *)p ^ mask; - if ((val + 0xfefefefefefefeffu) & - (~val & 0x8080808080808080u)) - break; + const unsigned char *p = s; + while (n-- != 0) { + if ((unsigned char)c == *p++) { + return (void *)(p - 1); } } - - for (; p < end; p++) - if (*(unsigned char *)p == c) - return (void *)p; - return NULL; } EXPORT_SYMBOL(memchr); @@ -964,7 +927,16 @@ void *memchr_inv(const void *start, int c, size_t bytes) return check_bytes8(start, value, bytes); value64 = value; - MEMCHR_MASK_GEN(value64); +#if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64 + value64 *= 0x0101010101010101ULL; +#elif defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) + value64 *= 0x01010101; + value64 |= value64 << 32; +#else + value64 |= value64 << 8; + value64 |= value64 << 16; + value64 |= value64 << 32; +#endif prefix = (unsigned long)start % 8; if (prefix) { From 63cc8a0fb0d7c679b6fff23f5fb256826ca866de Mon Sep 17 00:00:00 2001 From: Piotr Gorski Date: Thu, 12 Jan 2023 18:17:25 +0100 Subject: [PATCH 27/37] update fs-patches Signed-off-by: Piotr Gorski --- fs/btrfs/tree-log.c | 30 ++++++++++++++++++++++++------ fs/xfs/libxfs/xfs_btree.c | 7 ++++++- fs/xfs/xfs_extent_busy.c | 1 + fs/xfs/xfs_icache.c | 10 ++++++++++ fs/xfs/xfs_ioctl.c | 4 ++-- 5 files changed, 43 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c3cf3dabe0b1b6..9da5cc3b6546f2 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3857,7 +3857,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, path->slots[0]); if (tmp.type == BTRFS_DIR_INDEX_KEY) last_old_dentry_offset = tmp.offset; + } else if (ret < 0) { + err 
= ret; } + goto done; } @@ -3877,19 +3880,34 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, */ if (tmp.type == BTRFS_DIR_INDEX_KEY) last_old_dentry_offset = tmp.offset; + } else if (ret < 0) { + err = ret; + goto done; } + btrfs_release_path(path); /* - * Find the first key from this transaction again. See the note for - * log_new_dir_dentries, if we're logging a directory recursively we - * won't be holding its i_mutex, which means we can modify the directory - * while we're logging it. If we remove an entry between our first - * search and this search we'll not find the key again and can just - * bail. + * Find the first key from this transaction again or the one we were at + * in the loop below in case we had to reschedule. We may be logging the + * directory without holding its VFS lock, which happen when logging new + * dentries (through log_new_dir_dentries()) or in some cases when we + * need to log the parent directory of an inode. This means a dir index + * key might be deleted from the inode's root, and therefore we may not + * find it anymore. If we can't find it, just move to the next key. We + * can not bail out and ignore, because if we do that we will simply + * not log dir index keys that come after the one that was just deleted + * and we can end up logging a dir index range that ends at (u64)-1 + * (@last_offset is initialized to that), resulting in removing dir + * entries we should not remove at log replay time. */ search: ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); + if (ret > 0) + ret = btrfs_next_item(root, path); + if (ret < 0) + err = ret; + /* If ret is 1, there are no more keys in the inode's root. */ if (ret != 0) goto done; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 4c16c8c31fcbcd..35f574421670da 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -4666,7 +4666,12 @@ xfs_btree_space_to_height( const unsigned int *limits, unsigned long long leaf_blocks) { - unsigned long long node_blocks = limits[1]; + /* + * The root btree block can have fewer than minrecs pointers in it + * because the tree might not be big enough to require that amount of + * fanout. Hence it has a minimum size of 2 pointers, not limits[1]. + */ + unsigned long long node_blocks = 2; unsigned long long blocks_left = leaf_blocks - 1; unsigned int height = 1; diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index ad22a003f9595c..f3d328e4a4408b 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -236,6 +236,7 @@ xfs_extent_busy_update_extent( * */ busyp->bno = fend; + busyp->length = bend - fend; } else if (bbno < fbno) { /* * Case 8: diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index eae7427062cf9a..778488effa9320 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1847,12 +1847,20 @@ xfs_inodegc_worker( struct xfs_inodegc, work); struct llist_node *node = llist_del_all(&gc->list); struct xfs_inode *ip, *n; + unsigned int nofs_flag; WRITE_ONCE(gc->items, 0); if (!node) return; + /* + * We can allocate memory here while doing writeback on behalf of + * memory reclaim. To avoid memory allocation deadlocks set the + * task-wide nofs context for the following operations. 
+ */ + nofs_flag = memalloc_nofs_save(); + ip = llist_entry(node, struct xfs_inode, i_gclist); trace_xfs_inodegc_worker(ip->i_mount, READ_ONCE(gc->shrinker_hits)); @@ -1861,6 +1869,8 @@ xfs_inodegc_worker( xfs_iflags_set(ip, XFS_INACTIVATING); xfs_inodegc_inactivate(ip); } + + memalloc_nofs_restore(nofs_flag); } /* diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 1f783e97962968..85fbb3b71d1c67 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -754,7 +754,7 @@ xfs_bulkstat_fmt( static int xfs_bulk_ireq_setup( struct xfs_mount *mp, - struct xfs_bulk_ireq *hdr, + const struct xfs_bulk_ireq *hdr, struct xfs_ibulk *breq, void __user *ubuffer) { @@ -780,7 +780,7 @@ xfs_bulk_ireq_setup( switch (hdr->ino) { case XFS_BULK_IREQ_SPECIAL_ROOT: - hdr->ino = mp->m_sb.sb_rootino; + breq->startino = mp->m_sb.sb_rootino; break; default: return -EINVAL; From 4b5835edd40f760756a8e6229112f03adf99c260 Mon Sep 17 00:00:00 2001 From: Piotr Gorski Date: Thu, 12 Jan 2023 18:20:17 +0100 Subject: [PATCH 28/37] Revert "update fs-patches" This reverts commit 63cc8a0fb0d7c679b6fff23f5fb256826ca866de. Signed-off-by: Piotr Gorski --- fs/btrfs/tree-log.c | 30 ++++++------------------------ fs/xfs/libxfs/xfs_btree.c | 7 +------ fs/xfs/xfs_extent_busy.c | 1 - fs/xfs/xfs_icache.c | 10 ---------- fs/xfs/xfs_ioctl.c | 4 ++-- 5 files changed, 9 insertions(+), 43 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9da5cc3b6546f2..c3cf3dabe0b1b6 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3857,10 +3857,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, path->slots[0]); if (tmp.type == BTRFS_DIR_INDEX_KEY) last_old_dentry_offset = tmp.offset; - } else if (ret < 0) { - err = ret; } - goto done; } @@ -3880,34 +3877,19 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, */ if (tmp.type == BTRFS_DIR_INDEX_KEY) last_old_dentry_offset = tmp.offset; - } else if (ret < 0) { - err = ret; - goto done; } - btrfs_release_path(path); /* - * Find the first key from this transaction again or the one we were at - * in the loop below in case we had to reschedule. We may be logging the - * directory without holding its VFS lock, which happen when logging new - * dentries (through log_new_dir_dentries()) or in some cases when we - * need to log the parent directory of an inode. This means a dir index - * key might be deleted from the inode's root, and therefore we may not - * find it anymore. If we can't find it, just move to the next key. We - * can not bail out and ignore, because if we do that we will simply - * not log dir index keys that come after the one that was just deleted - * and we can end up logging a dir index range that ends at (u64)-1 - * (@last_offset is initialized to that), resulting in removing dir - * entries we should not remove at log replay time. + * Find the first key from this transaction again. See the note for + * log_new_dir_dentries, if we're logging a directory recursively we + * won't be holding its i_mutex, which means we can modify the directory + * while we're logging it. If we remove an entry between our first + * search and this search we'll not find the key again and can just + * bail. */ search: ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); - if (ret > 0) - ret = btrfs_next_item(root, path); - if (ret < 0) - err = ret; - /* If ret is 1, there are no more keys in the inode's root. 
*/ if (ret != 0) goto done; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 35f574421670da..4c16c8c31fcbcd 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -4666,12 +4666,7 @@ xfs_btree_space_to_height( const unsigned int *limits, unsigned long long leaf_blocks) { - /* - * The root btree block can have fewer than minrecs pointers in it - * because the tree might not be big enough to require that amount of - * fanout. Hence it has a minimum size of 2 pointers, not limits[1]. - */ - unsigned long long node_blocks = 2; + unsigned long long node_blocks = limits[1]; unsigned long long blocks_left = leaf_blocks - 1; unsigned int height = 1; diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index f3d328e4a4408b..ad22a003f9595c 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -236,7 +236,6 @@ xfs_extent_busy_update_extent( * */ busyp->bno = fend; - busyp->length = bend - fend; } else if (bbno < fbno) { /* * Case 8: diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 778488effa9320..eae7427062cf9a 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1847,20 +1847,12 @@ xfs_inodegc_worker( struct xfs_inodegc, work); struct llist_node *node = llist_del_all(&gc->list); struct xfs_inode *ip, *n; - unsigned int nofs_flag; WRITE_ONCE(gc->items, 0); if (!node) return; - /* - * We can allocate memory here while doing writeback on behalf of - * memory reclaim. To avoid memory allocation deadlocks set the - * task-wide nofs context for the following operations. - */ - nofs_flag = memalloc_nofs_save(); - ip = llist_entry(node, struct xfs_inode, i_gclist); trace_xfs_inodegc_worker(ip->i_mount, READ_ONCE(gc->shrinker_hits)); @@ -1869,8 +1861,6 @@ xfs_inodegc_worker( xfs_iflags_set(ip, XFS_INACTIVATING); xfs_inodegc_inactivate(ip); } - - memalloc_nofs_restore(nofs_flag); } /* diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 85fbb3b71d1c67..1f783e97962968 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -754,7 +754,7 @@ xfs_bulkstat_fmt( static int xfs_bulk_ireq_setup( struct xfs_mount *mp, - const struct xfs_bulk_ireq *hdr, + struct xfs_bulk_ireq *hdr, struct xfs_ibulk *breq, void __user *ubuffer) { @@ -780,7 +780,7 @@ xfs_bulk_ireq_setup( switch (hdr->ino) { case XFS_BULK_IREQ_SPECIAL_ROOT: - breq->startino = mp->m_sb.sb_rootino; + hdr->ino = mp->m_sb.sb_rootino; break; default: return -EINVAL; From 1a22f73fe301806bae92394a42daf2dbe322e7e1 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Sat, 28 Jan 2023 21:19:07 +0100 Subject: [PATCH 29/37] Add a sysctl to skip tcp collapse processing when the receive buffer is full. For context and additional information about this patch, see the blog post at https://blog.cloudflare.com/optimizing-tcp-for-high-throughput-and-low-latency/ sysctl: net.ipv4.tcp_collapse_max_bytes If tcp_collapse_max_bytes is non-zero, attempt to collapse the queue to free up memory if the current amount of memory allocated is less than tcp_collapse_max_bytes. Otherwise, the packet is dropped without attempting to collapse the queue. If tcp_collapse_max_bytes is zero, this feature is disabled and the default Linux behavior is used. The default Linux behavior is to always perform the attempt to collapse the queue to free up memory. When the receive queue is small, we want to collapse the queue. 
There are two reasons for this: (a) the latency of performing the collapse will be small on a small queue, and (b) we want to avoid sending a congestion signal (via a packet drop) to the sender when the receive queue is small. The result is that we avoid latency spikes caused by the time it takes to perform the collapse logic when the receive queue is large and full, while preserving existing behavior and performance for all other cases. Signed-off-by: Peter Jung --- include/net/netns/ipv4.h | 1 + include/trace/events/tcp.h | 7 +++++++ net/ipv4/sysctl_net_ipv4.c | 7 +++++++ net/ipv4/tcp_input.c | 36 ++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_ipv4.c | 2 ++ 5 files changed, 53 insertions(+) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 1b800467944517..d6d7f9942b970a 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -188,6 +188,7 @@ struct netns_ipv4 { int sysctl_udp_rmem_min; u8 sysctl_fib_notify_on_flag_change; + unsigned int sysctl_tcp_collapse_max_bytes; #ifdef CONFIG_NET_L3_MASTER_DEV u8 sysctl_udp_l3mdev_accept; diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 901b440238d5fc..7026df84a0f616 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -187,6 +187,13 @@ DEFINE_EVENT(tcp_event_sk, tcp_rcv_space_adjust, TP_ARGS(sk) ); +DEFINE_EVENT(tcp_event_sk, tcp_collapse_max_bytes_exceeded, + + TP_PROTO(struct sock *sk), + + TP_ARGS(sk) +); + TRACE_EVENT(tcp_retransmit_synack, TP_PROTO(const struct sock *sk, const struct request_sock *req), diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 9b8a6db7a66b31..0e364b98c4b3ba 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1384,6 +1384,13 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, + { + .procname = "tcp_collapse_max_bytes", + .data = &init_net.ipv4.sysctl_tcp_collapse_max_bytes, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + }, { } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0640453fce54b6..f2b70b05dc33a4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5384,6 +5384,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk) static int tcp_prune_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED); @@ -5395,6 +5396,39 @@ static int tcp_prune_queue(struct sock *sk) if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) return 0; + /* For context and additional information about this patch, see the + * blog post at + * + * sysctl: net.ipv4.tcp_collapse_max_bytes + * + * If tcp_collapse_max_bytes is non-zero, attempt to collapse the + * queue to free up memory if the current amount of memory allocated + * is less than tcp_collapse_max_bytes. Otherwise, the packet is + * dropped without attempting to collapse the queue. + * + * If tcp_collapse_max_bytes is zero, this feature is disabled + * and the default Linux behavior is used. The default Linux + * behavior is to always perform the attempt to collapse the + * queue to free up memory. + * + * When the receive queue is small, we want to collapse the + * queue. There are two reasons for this: (a) the latency of + * performing the collapse will be small on a small queue, and + * (b) we want to avoid sending a congestion signal (via a + * packet drop) to the sender when the receive queue is small. 
+ * + * The result is that we avoid latency spikes caused by the + * time it takes to perform the collapse logic when the receive + * queue is large and full, while preserving existing behavior + * and performance for all other cases. + */ + if (net->ipv4.sysctl_tcp_collapse_max_bytes && + (atomic_read(&sk->sk_rmem_alloc) > net->ipv4.sysctl_tcp_collapse_max_bytes)) { + /* We are dropping the packet */ + trace_tcp_collapse_max_bytes_exceeded(sk); + goto do_not_collapse; + } + tcp_collapse_ofo_queue(sk); if (!skb_queue_empty(&sk->sk_receive_queue)) tcp_collapse(sk, &sk->sk_receive_queue, NULL, @@ -5413,6 +5447,8 @@ static int tcp_prune_queue(struct sock *sk) if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) return 0; +do_not_collapse: + /* If we are really being abused, tell the caller to silently * drop receive data on the floor. It will get retransmitted * and hopefully then we'll have sufficient space. diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index da46357f501b3f..77967b2db29efa 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3215,6 +3215,8 @@ static int __net_init tcp_sk_init(struct net *net) else net->ipv4.tcp_congestion_control = &tcp_reno; + net->ipv4.sysctl_tcp_collapse_max_bytes = 0; + return 0; } From 74976185d0987ffd6bd21ca41ed1bde4268d85fe Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Thu, 9 Feb 2023 11:51:32 +0100 Subject: [PATCH 30/37] Revert "Cachy: Tune mgLRU to protect cache used in the last second" This reverts commit e9e8a1c6c2fcdfd13ccacecbb091cf58b2049eca. --- mm/vmscan.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 77c2bb3bccf8b9..c058a45dd43887 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4526,11 +4526,7 @@ static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned } /* to protect the working set of the last N jiffies */ -#ifdef CONFIG_CACHY -static unsigned long lru_gen_min_ttl __read_mostly = HZ; -#else static unsigned long lru_gen_min_ttl __read_mostly; -#endif static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) { From d9dbe3f112c25403d3cad7637504dae5bdfceaec Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Thu, 29 Dec 2022 12:13:44 +0100 Subject: [PATCH 31/37] Cachy: Tune mgLRU to protect cache used in the last second Although not identical to the le9 patches that protect a byte-amount of cache through tunables, multigenerational LRU now supports protecting cache accessed in the last X milliseconds. 
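For reference, upstream mgLRU exposes the same window as a runtime knob, so the effect of this compile-time default can be reproduced without CONFIG_CACHY. A minimal userspace sketch, assuming the standard /sys/kernel/mm/lru_gen/min_ttl_ms interface is available and that 1000 ms corresponds to the HZ-jiffies default chosen below:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Protect the working set of the last second (1000 ms), matching
	 * the CONFIG_CACHY compile-time default of HZ jiffies below. */
	int fd = open("/sys/kernel/mm/lru_gen/min_ttl_ms", O_WRONLY);

	if (fd < 0) {
		perror("open min_ttl_ms");
		return 1;
	}
	if (write(fd, "1000", 4) != 4) {
		perror("write min_ttl_ms");
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}
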
Signed-off-by: Peter Jung --- mm/vmscan.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/vmscan.c b/mm/vmscan.c index c058a45dd43887..77c2bb3bccf8b9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4526,7 +4526,11 @@ static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned } /* to protect the working set of the last N jiffies */ +#ifdef CONFIG_CACHY +static unsigned long lru_gen_min_ttl __read_mostly = HZ; +#else static unsigned long lru_gen_min_ttl __read_mostly; +#endif static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) { From f266ef7fef67f46bbd07b6fdd63f9935707c0d80 Mon Sep 17 00:00:00 2001 From: Piotr Gorski Date: Tue, 21 Feb 2023 09:21:06 +0100 Subject: [PATCH 32/37] Kconfig, zstd: Rename COMP_VAL to COMPRESSION_LEVEL Signed-off-by: Piotr Gorski --- init/Kconfig | 2 +- scripts/Makefile.lib | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index 6aa69cf5af7c47..c90cf6705d4717 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -353,7 +353,7 @@ endchoice menu "ZSTD compression options" depends on KERNEL_ZSTD -config ZSTD_COMP_VAL +config ZSTD_COMPRESSION_LEVEL int "Compression level (1-22)" range 1 22 default "22" diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 2e16fcca038d38..0d5963a9557243 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -526,8 +526,8 @@ quiet_cmd_xzmisc = XZMISC $@ # decompression is used, like initramfs decompression, zstd22 should likely not # be used because it would require zstd to allocate a 128 MB buffer. -ifdef CONFIG_ZSTD_COMP_VAL -zstd_comp_val := $(CONFIG_ZSTD_COMP_VAL) +ifdef CONFIG_ZSTD_COMPRESSION_LEVEL +zstd_comp_val := $(CONFIG_ZSTD_COMPRESSION_LEVEL) ifeq ($(shell test $(zstd_comp_val) -gt 19; echo $$?),0) zstd_comp_val += --ultra endif From 40f18c9304df0bca6924f1146d48d98d528482d1 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Tue, 14 Mar 2023 20:47:56 +0100 Subject: [PATCH 33/37] Revert "ZEN: Add ACS override support" This reverts commit fed1201b45a2dea27d4afddf3b2f94a95b052b3d. --- .../admin-guide/kernel-parameters.txt | 9 -- drivers/pci/quirks.c | 101 ------------------ 2 files changed, 110 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index de9aec04321aed..e3d0acb92612bc 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4145,15 +4145,6 @@ nomsi [MSI] If the PCI_MSI kernel config parameter is enabled, this kernel boot option can be used to disable the use of MSI interrupts system-wide. - pcie_acs_override = - [PCIE] Override missing PCIe ACS support for: - downstream - All downstream ports - full ACS capabilities - multfunction - All multifunction devices - multifunction ACS subset - id:nnnn:nnnn - Specfic device - full ACS capabilities - Specified as vid:did (vendor/device ID) in hex noioapicquirk [APIC] Disable all boot interrupt quirks. Safety option to keep boot IRQs enabled. This should never be necessary. 
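For reference, the option removed here accepted a comma-separated list of the keywords documented above, for example (vendor/device ID hypothetical):

	pcie_acs_override=downstream,multifunction,id:8086:1533

Devices matched this way were reported as having full ACS capabilities even when the hardware advertises none, which is why the parser removed below warns that the override may allow non-IOMMU-protected peer-to-peer DMA.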
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 492e88a99c0727..285acc4aaccc1e 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3612,106 +3612,6 @@ static void quirk_no_bus_reset(struct pci_dev *dev) dev->dev_flags |= PCI_DEV_FLAGS_NO_BUS_RESET; } -static bool acs_on_downstream; -static bool acs_on_multifunction; - -#define NUM_ACS_IDS 16 -struct acs_on_id { - unsigned short vendor; - unsigned short device; -}; -static struct acs_on_id acs_on_ids[NUM_ACS_IDS]; -static u8 max_acs_id; - -static __init int pcie_acs_override_setup(char *p) -{ - if (!p) - return -EINVAL; - - while (*p) { - if (!strncmp(p, "downstream", 10)) - acs_on_downstream = true; - if (!strncmp(p, "multifunction", 13)) - acs_on_multifunction = true; - if (!strncmp(p, "id:", 3)) { - char opt[5]; - int ret; - long val; - - if (max_acs_id >= NUM_ACS_IDS - 1) { - pr_warn("Out of PCIe ACS override slots (%d)\n", - NUM_ACS_IDS); - goto next; - } - - p += 3; - snprintf(opt, 5, "%s", p); - ret = kstrtol(opt, 16, &val); - if (ret) { - pr_warn("PCIe ACS ID parse error %d\n", ret); - goto next; - } - acs_on_ids[max_acs_id].vendor = val; - - p += strcspn(p, ":"); - if (*p != ':') { - pr_warn("PCIe ACS invalid ID\n"); - goto next; - } - - p++; - snprintf(opt, 5, "%s", p); - ret = kstrtol(opt, 16, &val); - if (ret) { - pr_warn("PCIe ACS ID parse error %d\n", ret); - goto next; - } - acs_on_ids[max_acs_id].device = val; - max_acs_id++; - } -next: - p += strcspn(p, ","); - if (*p == ',') - p++; - } - - if (acs_on_downstream || acs_on_multifunction || max_acs_id) - pr_warn("Warning: PCIe ACS overrides enabled; This may allow non-IOMMU protected peer-to-peer DMA\n"); - - return 0; -} -early_param("pcie_acs_override", pcie_acs_override_setup); - -static int pcie_acs_overrides(struct pci_dev *dev, u16 acs_flags) -{ - int i; - - /* Never override ACS for legacy devices or devices with ACS caps */ - if (!pci_is_pcie(dev) || - pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ACS)) - return -ENOTTY; - - for (i = 0; i < max_acs_id; i++) - if (acs_on_ids[i].vendor == dev->vendor && - acs_on_ids[i].device == dev->device) - return 1; - - switch (pci_pcie_type(dev)) { - case PCI_EXP_TYPE_DOWNSTREAM: - case PCI_EXP_TYPE_ROOT_PORT: - if (acs_on_downstream) - return 1; - break; - case PCI_EXP_TYPE_ENDPOINT: - case PCI_EXP_TYPE_UPSTREAM: - case PCI_EXP_TYPE_LEG_END: - case PCI_EXP_TYPE_RC_END: - if (acs_on_multifunction && dev->multifunction) - return 1; - } - - return -ENOTTY; -} /* * Some NVIDIA GPU devices do not work with bus reset, SBR needs to be * prevented for those affected devices. 
@@ -5080,7 +4980,6 @@ static const struct pci_dev_acs_enabled { { PCI_VENDOR_ID_NXP, 0x8d9b, pci_quirk_nxp_rp_acs }, /* Zhaoxin Root/Downstream Ports */ { PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs }, - { PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides }, { 0 } }; From 4a78c3d5605c66feb7b81f870c292ff2af2654f4 Mon Sep 17 00:00:00 2001 From: Piotr Gorski Date: Fri, 30 Jun 2023 23:31:40 +0200 Subject: [PATCH 34/37] Drop zstd compression patches Signed-off-by: Piotr Gorski --- init/Kconfig | 13 ------------- kernel/module/Kconfig | 25 ------------------------- scripts/Makefile.lib | 13 +++---------- scripts/Makefile.modinst | 7 +------ 4 files changed, 4 insertions(+), 54 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index c90cf6705d4717..89a3e52edcb9f9 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -350,19 +350,6 @@ config KERNEL_UNCOMPRESSED endchoice -menu "ZSTD compression options" - depends on KERNEL_ZSTD - -config ZSTD_COMPRESSION_LEVEL - int "Compression level (1-22)" - range 1 22 - default "22" - help - Choose a compression level for zstd kernel compression. - Default is 22, which is the maximum. - -endmenu - config DEFAULT_INIT string "Default init path" default "" diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index ecf2798c5ccf75..424b3bc58f3f51 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -219,31 +219,6 @@ config MODULE_COMPRESS_ZSTD endchoice -menu "ZSTD module compression options" - depends on MODULE_COMPRESS_ZSTD - -config MODULE_COMPRESS_ZSTD_LEVEL - int "Compression level (1-19)" - range 1 19 - default 9 - help - Compression level used by zstd for compressing modules. - -config MODULE_COMPRESS_ZSTD_ULTRA - bool "Enable ZSTD ultra compression" - help - Compress modules with ZSTD using the highest possible compression. - -config MODULE_COMPRESS_ZSTD_LEVEL_ULTRA - int "Compression level (20-22)" - depends on MODULE_COMPRESS_ZSTD_ULTRA - range 20 22 - default 20 - help - Ultra compression level used by zstd for compressing modules. - -endmenu - config MODULE_DECOMPRESS bool "Support in-kernel module decompression" depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ || MODULE_COMPRESS_ZSTD diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 0d5963a9557243..3aa384cec76b8b 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -526,21 +526,14 @@ quiet_cmd_xzmisc = XZMISC $@ # decompression is used, like initramfs decompression, zstd22 should likely not # be used because it would require zstd to allocate a 128 MB buffer. 
-ifdef CONFIG_ZSTD_COMPRESSION_LEVEL -zstd_comp_val := $(CONFIG_ZSTD_COMPRESSION_LEVEL) -ifeq ($(shell test $(zstd_comp_val) -gt 19; echo $$?),0) -zstd_comp_val += --ultra -endif -endif - quiet_cmd_zstd = ZSTD $@ - cmd_zstd = cat $(real-prereqs) | $(ZSTD) -T0 -19 > $@ + cmd_zstd = cat $(real-prereqs) | $(ZSTD) -19 > $@ quiet_cmd_zstd22 = ZSTD22 $@ - cmd_zstd22 = cat $(real-prereqs) | $(ZSTD) -T0 -22 --ultra > $@ + cmd_zstd22 = cat $(real-prereqs) | $(ZSTD) -22 --ultra > $@ quiet_cmd_zstd22_with_size = ZSTD22 $@ - cmd_zstd22_with_size = { cat $(real-prereqs) | $(ZSTD) -T0 -$(zstd_comp_val); $(size_append); } > $@ + cmd_zstd22_with_size = { cat $(real-prereqs) | $(ZSTD) -22 --ultra; $(size_append); } > $@ # ASM offsets # --------------------------------------------------------------------------- diff --git a/scripts/Makefile.modinst b/scripts/Makefile.modinst index 132863cf3183ce..a4c987c23750f6 100644 --- a/scripts/Makefile.modinst +++ b/scripts/Makefile.modinst @@ -96,13 +96,8 @@ quiet_cmd_gzip = GZIP $@ cmd_gzip = $(KGZIP) -n -f $< quiet_cmd_xz = XZ $@ cmd_xz = $(XZ) --lzma2=dict=2MiB -f $< -ifdef CONFIG_MODULE_COMPRESS_ZSTD_ULTRA quiet_cmd_zstd = ZSTD $@ - cmd_zstd = $(ZSTD) -$(CONFIG_MODULE_COMPRESS_ZSTD_LEVEL_ULTRA) --ultra --zstd=wlog=21 -T0 --rm -f -q $< -else -quiet_cmd_zstd = ZSTD $@ - cmd_zstd = $(ZSTD) -$(CONFIG_MODULE_COMPRESS_ZSTD_LEVEL) --zstd=wlog=21 -T0 --rm -f -q $< -endif + cmd_zstd = $(ZSTD) -T0 --rm -f -q $< $(dst)/%.ko.gz: $(dst)/%.ko FORCE $(call cmd,gzip) From e3269edd41598de529e142291d6f54d931c8c83a Mon Sep 17 00:00:00 2001 From: Piotr Gorski Date: Mon, 10 Jul 2023 17:06:46 +0200 Subject: [PATCH 35/37] Revert "x86: Avoid relocation information in final vmlinux" This reverts commit 9eeae3b566dd87c88c05b3c1a4d621c6f9810ea1. Signed-off-by: Piotr Gorski --- .gitignore | 1 - Documentation/dontdiff | 1 - arch/x86/Makefile.postlink | 41 ----------------------------- arch/x86/boot/compressed/.gitignore | 1 + arch/x86/boot/compressed/Makefile | 10 ++++--- 5 files changed, 7 insertions(+), 47 deletions(-) delete mode 100644 arch/x86/Makefile.postlink diff --git a/.gitignore b/.gitignore index cb59d89372c014..5da004814678d0 100644 --- a/.gitignore +++ b/.gitignore @@ -61,7 +61,6 @@ modules.order /vmlinux /vmlinux.32 /vmlinux.map -/vmlinux.relocs /vmlinux.symvers /vmlinux-gdb.py /vmlinuz diff --git a/Documentation/dontdiff b/Documentation/dontdiff index 7c210744d84c6e..352ff53a2306ad 100644 --- a/Documentation/dontdiff +++ b/Documentation/dontdiff @@ -255,7 +255,6 @@ vmlinux.aout vmlinux.bin.all vmlinux.lds vmlinux.map -vmlinux.relocs vmlinux.symvers vmlinuz voffset.h diff --git a/arch/x86/Makefile.postlink b/arch/x86/Makefile.postlink deleted file mode 100644 index b38ffa4defb3df..00000000000000 --- a/arch/x86/Makefile.postlink +++ /dev/null @@ -1,41 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# =========================================================================== -# Post-link x86 pass -# =========================================================================== -# -# 1. Separate relocations from vmlinux into vmlinux.relocs. -# 2. Strip relocations from vmlinux. 
- -PHONY := __archpost -__archpost: - --include include/config/auto.conf -include scripts/Kbuild.include - -CMD_RELOCS = arch/x86/tools/relocs -quiet_cmd_relocs = RELOCS $@.relocs - cmd_relocs = $(CMD_RELOCS) $@ > $@.relocs;$(CMD_RELOCS) --abs-relocs $@ - -quiet_cmd_strip_relocs = RSTRIP $@ - cmd_strip_relocs = $(OBJCOPY) --remove-section='.rel.*' --remove-section='.rel__*' --remove-section='.rela.*' --remove-section='.rela__*' $@ - -# `@true` prevents complaint when there is nothing to be done - -vmlinux: FORCE - @true -ifeq ($(CONFIG_X86_NEED_RELOCS),y) - $(call cmd,relocs) - $(call cmd,strip_relocs) -endif - -%.ko: FORCE - @true - -clean: - @rm -f vmlinux.relocs - -PHONY += FORCE clean - -FORCE: - -.PHONY: $(PHONY) diff --git a/arch/x86/boot/compressed/.gitignore b/arch/x86/boot/compressed/.gitignore index b2968175fc2757..25805199a50611 100644 --- a/arch/x86/boot/compressed/.gitignore +++ b/arch/x86/boot/compressed/.gitignore @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only relocs vmlinux.bin.all +vmlinux.relocs vmlinux.lds mkpiggy piggy.S diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 75a467a408d2c9..3a261abb6d158d 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -121,12 +121,14 @@ $(obj)/vmlinux.bin: vmlinux FORCE targets += $(patsubst $(obj)/%,%,$(vmlinux-objs-y)) vmlinux.bin.all vmlinux.relocs -# vmlinux.relocs is created by the vmlinux postlink step. -vmlinux.relocs: vmlinux - @true +CMD_RELOCS = arch/x86/tools/relocs +quiet_cmd_relocs = RELOCS $@ + cmd_relocs = $(CMD_RELOCS) $< > $@;$(CMD_RELOCS) --abs-relocs $< +$(obj)/vmlinux.relocs: vmlinux FORCE + $(call if_changed,relocs) vmlinux.bin.all-y := $(obj)/vmlinux.bin -vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += vmlinux.relocs +vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs $(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE $(call if_changed,gzip) From 88a1c00bc799094085c8a40dbe71a9b970ed39f8 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Mon, 27 Mar 2023 11:54:06 +0200 Subject: [PATCH 36/37] x86/build: Avoid relocation information in final vmlinux The Linux build process on x86 roughly consists of compiling all input files, statically linking them into a vmlinux ELF file, and then taking and turning this file into an actual bzImage bootable file. vmlinux has in this process two main purposes: 1) It is an intermediate build target on the way to produce the final bootable image. 2) It is a file that is expected to be used by debuggers and standard ELF tooling to work with the built kernel. For the second purpose, a vmlinux file is typically collected by various package build recipes, such as distribution spec files, including the kernel's own tar-pkg target. When building a kernel supporting KASLR with CONFIG_X86_NEED_RELOCS, vmlinux contains also relocation information produced by using the --emit-relocs linker option. This is utilized by subsequent build steps to create vmlinux.relocs and produce a relocatable image. However, the information is not needed by debuggers and other standard ELF tooling. The issue is then that the collected vmlinux file and hence distribution packages end up unnecessarily large because of this extra data. 
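The surplus is easy to observe: on a CONFIG_X86_NEED_RELOCS build, for example, readelf -S vmlinux lists the retained relocation sections (.rela.text and friends), which account for much of the difference shown below.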
The following is a size comparison of vmlinux v6.0 with and without the relocation information:

| Configuration      | With relocs | Stripped relocs |
| x86_64_defconfig   |       70 MB |           43 MB |
| +CONFIG_DEBUG_INFO |      818 MB |          367 MB |

Optimize a resulting vmlinux by adding a postlink step that splits the relocation information into vmlinux.relocs and then strips it from the vmlinux binary. Signed-off-by: Petr Pavlu Signed-off-by: Borislav Petkov (AMD) Tested-by: Nick Desaulniers Link: https://lore.kernel.org/r/20220927084632.14531-1-petr.pavlu@suse.com --- arch/x86/Makefile.postlink | 47 +++++++++++++++++++++++++++++++ arch/x86/boot/compressed/Makefile | 8 ++---- 2 files changed, 50 insertions(+), 5 deletions(-) create mode 100644 arch/x86/Makefile.postlink diff --git a/arch/x86/Makefile.postlink new file mode 100644 index 00000000000000..936093d291605e --- /dev/null +++ b/arch/x86/Makefile.postlink @@ -0,0 +1,47 @@ +# SPDX-License-Identifier: GPL-2.0 +# =========================================================================== +# Post-link x86 pass +# =========================================================================== +# +# 1. Separate relocations from vmlinux into vmlinux.relocs. +# 2. Strip relocations from vmlinux. + +PHONY := __archpost +__archpost: + +-include include/config/auto.conf +include $(srctree)/scripts/Kbuild.include + +CMD_RELOCS = arch/x86/tools/relocs +OUT_RELOCS = arch/x86/boot/compressed +quiet_cmd_relocs = RELOCS $(OUT_RELOCS)/$@.relocs + cmd_relocs = \ + mkdir -p $(OUT_RELOCS); \ + $(CMD_RELOCS) $@ > $(OUT_RELOCS)/$@.relocs; \ + $(CMD_RELOCS) --abs-relocs $@ + +quiet_cmd_strip_relocs = RSTRIP $@ + cmd_strip_relocs = \ + $(OBJCOPY) --remove-section='.rel.*' --remove-section='.rel__*' \ + --remove-section='.rela.*' --remove-section='.rela__*' $@ + +# `@true` prevents complaint when there is nothing to be done + +vmlinux: FORCE + @true +ifeq ($(CONFIG_X86_NEED_RELOCS),y) + $(call cmd,relocs) + $(call cmd,strip_relocs) +endif + +%.ko: FORCE + @true + +clean: + @rm -f $(OUT_RELOCS)/vmlinux.relocs + +PHONY += FORCE clean + +FORCE: + +.PHONY: $(PHONY) diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 3a261abb6d158d..94221417879592 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -121,11 +121,9 @@ $(obj)/vmlinux.bin: vmlinux FORCE targets += $(patsubst $(obj)/%,%,$(vmlinux-objs-y)) vmlinux.bin.all vmlinux.relocs -CMD_RELOCS = arch/x86/tools/relocs -quiet_cmd_relocs = RELOCS $@ - cmd_relocs = $(CMD_RELOCS) $< > $@;$(CMD_RELOCS) --abs-relocs $< -$(obj)/vmlinux.relocs: vmlinux FORCE - $(call if_changed,relocs) +# vmlinux.relocs is created by the vmlinux postlink step. +$(obj)/vmlinux.relocs: vmlinux + @true vmlinux.bin.all-y := $(obj)/vmlinux.bin vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs From 4c056b35581c60e7d0657a35ab44e90f9e702773 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Sun, 12 Nov 2023 21:04:29 +0100 Subject: [PATCH 37/37] Revert "Add a sysctl to skip tcp collapse processing when the receive buffer is full." This reverts commit 1a22f73fe301806bae92394a42daf2dbe322e7e1. 
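With the revert applied, tcp_prune_queue() once again always attempts to collapse the receive queue before resorting to dropping data, restoring the stock kernel behavior.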
--- include/net/netns/ipv4.h | 1 - include/trace/events/tcp.h | 7 ------- net/ipv4/sysctl_net_ipv4.c | 7 ------- net/ipv4/tcp_input.c | 36 ------------------------------------ net/ipv4/tcp_ipv4.c | 2 -- 5 files changed, 53 deletions(-) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index d6d7f9942b970a..1b800467944517 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -188,7 +188,6 @@ struct netns_ipv4 { int sysctl_udp_rmem_min; u8 sysctl_fib_notify_on_flag_change; - unsigned int sysctl_tcp_collapse_max_bytes; #ifdef CONFIG_NET_L3_MASTER_DEV u8 sysctl_udp_l3mdev_accept; diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 7026df84a0f616..901b440238d5fc 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -187,13 +187,6 @@ DEFINE_EVENT(tcp_event_sk, tcp_rcv_space_adjust, TP_ARGS(sk) ); -DEFINE_EVENT(tcp_event_sk, tcp_collapse_max_bytes_exceeded, - - TP_PROTO(struct sock *sk), - - TP_ARGS(sk) -); - TRACE_EVENT(tcp_retransmit_synack, TP_PROTO(const struct sock *sk, const struct request_sock *req), diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 0e364b98c4b3ba..9b8a6db7a66b31 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1384,13 +1384,6 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, - { - .procname = "tcp_collapse_max_bytes", - .data = &init_net.ipv4.sysctl_tcp_collapse_max_bytes, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_douintvec_minmax, - }, { } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f2b70b05dc33a4..0640453fce54b6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5384,7 +5384,6 @@ static bool tcp_prune_ofo_queue(struct sock *sk) static int tcp_prune_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct net *net = sock_net(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED); @@ -5396,39 +5395,6 @@ static int tcp_prune_queue(struct sock *sk) if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) return 0; - /* For context and additional information about this patch, see the - * blog post at - * - * sysctl: net.ipv4.tcp_collapse_max_bytes - * - * If tcp_collapse_max_bytes is non-zero, attempt to collapse the - * queue to free up memory if the current amount of memory allocated - * is less than tcp_collapse_max_bytes. Otherwise, the packet is - * dropped without attempting to collapse the queue. - * - * If tcp_collapse_max_bytes is zero, this feature is disabled - * and the default Linux behavior is used. The default Linux - * behavior is to always perform the attempt to collapse the - * queue to free up memory. - * - * When the receive queue is small, we want to collapse the - * queue. There are two reasons for this: (a) the latency of - * performing the collapse will be small on a small queue, and - * (b) we want to avoid sending a congestion signal (via a - * packet drop) to the sender when the receive queue is small. - * - * The result is that we avoid latency spikes caused by the - * time it takes to perform the collapse logic when the receive - * queue is large and full, while preserving existing behavior - * and performance for all other cases. 
- */ - if (net->ipv4.sysctl_tcp_collapse_max_bytes && - (atomic_read(&sk->sk_rmem_alloc) > net->ipv4.sysctl_tcp_collapse_max_bytes)) { - /* We are dropping the packet */ - trace_tcp_collapse_max_bytes_exceeded(sk); - goto do_not_collapse; - } - tcp_collapse_ofo_queue(sk); if (!skb_queue_empty(&sk->sk_receive_queue)) tcp_collapse(sk, &sk->sk_receive_queue, NULL, @@ -5447,8 +5413,6 @@ static int tcp_prune_queue(struct sock *sk) if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) return 0; -do_not_collapse: - /* If we are really being abused, tell the caller to silently * drop receive data on the floor. It will get retransmitted * and hopefully then we'll have sufficient space. diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 77967b2db29efa..da46357f501b3f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3215,8 +3215,6 @@ static int __net_init tcp_sk_init(struct net *net) else net->ipv4.tcp_congestion_control = &tcp_reno; - net->ipv4.sysctl_tcp_collapse_max_bytes = 0; - return 0; }