diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 458c891a827365..d86eb1ebf59f5b 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -175,6 +175,7 @@ extern unsigned int __read_mostly sysctl_sched_itmt_enabled;
 
 /* Interface to set priority of a cpu */
 void sched_set_itmt_core_prio(int prio, int core_cpu);
+void sched_set_itmt_power_ratio(int power_ratio, int core_cpu);
 
 /* Interface to notify scheduler that system supports ITMT */
 int sched_set_itmt_support(void);
diff --git a/arch/x86/include/asm/vdso/processor.h b/arch/x86/include/asm/vdso/processor.h
index 57b1a7034c640a..e2c45674f98975 100644
--- a/arch/x86/include/asm/vdso/processor.h
+++ b/arch/x86/include/asm/vdso/processor.h
@@ -10,7 +10,7 @@
 /* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
 static __always_inline void rep_nop(void)
 {
-	asm volatile("rep; nop" ::: "memory");
+	asm volatile("lfence" ::: "memory");
 }
 
 static __always_inline void cpu_relax(void)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index f901658d9f7c08..7d931995efdc31 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -143,6 +143,8 @@ obj-$(CONFIG_AMD_MEM_ENCRYPT)		+= sev.o
 
 obj-$(CONFIG_CFI_CLANG)			+= cfi.o
 
+obj-y					+= powerbump.o
+
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c
index fbaf12e43f4160..c8c2d6f1a8aca2 100644
--- a/arch/x86/kernel/cpu/intel_epb.c
+++ b/arch/x86/kernel/cpu/intel_epb.c
@@ -166,6 +166,10 @@ static ssize_t energy_perf_bias_store(struct device *dev,
 	if (ret < 0)
 		return ret;
 
+	/* update the ITMT scheduler logic to use the power policy data */
+	/* scale the val up by 2 so the range is 226 - 256 */
+	sched_set_itmt_power_ratio(256 - val * 2, cpu);
+
 	return count;
 }
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 6a41cee242f6d7..18dc2dd80c890b 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -44,6 +44,8 @@ static struct microcode_ops	*microcode_ops;
 
 static bool dis_ucode_ldr = true;
 
+bool ucode_rollback = false;
+int enable_rollback = 0;
 bool initrd_gone;
 
@@ -80,6 +82,26 @@ static u32 final_levels[] = {
 	0, /* T-101 terminator */
 };
 
+static int __init ucode_setup(char *str)
+{
+	if (!str)
+		return -EINVAL;
+
+	while (*str) {
+		if (!strncmp(str, "rollback", 8)) {
+			enable_rollback = 1;
+			pr_info("Microcode Rollback Enabled\n");
+		}
+		str += strcspn(str, ",");
+		while (*str == ',')
+			str++;
+	}
+	return 0;
+}
+
+__setup("ucode=", ucode_setup);
+
+
 /*
  * Check the current patch level on this CPU.
  *
@@ -513,6 +535,7 @@ static ssize_t reload_store(struct device *dev,
 			    struct device_attribute *attr,
 			    const char *buf, size_t size)
 {
+	struct cpuinfo_x86 *c = &boot_cpu_data;
 	enum ucode_state tmp_ret = UCODE_OK;
 	int bsp = boot_cpu_data.cpu_index;
 	unsigned long val;
@@ -522,7 +545,7 @@ static ssize_t reload_store(struct device *dev,
 	if (ret)
 		return ret;
 
-	if (val != 1)
+	if (!val || val > 2)
 		return size;
 
 	cpus_read_lock();
@@ -530,6 +553,20 @@ static ssize_t reload_store(struct device *dev,
 	ret = check_online_cpus();
 	if (ret)
 		goto put;
+	/*
+	 * Check if the vendor is Intel to permit reloading
+	 * microcode even if the revision is unchanged.
+	 * This is typically used during microcode development,
+	 * where bumping the revision for every build is a pain.
+	 */
+	if ((val == 2) && ((c->x86_vendor != X86_VENDOR_INTEL) ||
+			   !enable_rollback))
+		goto put;
+	else if (val == 2) {
+		mutex_lock(&microcode_mutex);
+		ucode_rollback = true;
+		mutex_unlock(&microcode_mutex);
+	}
 
 	tmp_ret = microcode_ops->request_microcode_fw(bsp, &microcode_pdev->dev, true);
 	if (tmp_ret != UCODE_NEW)
@@ -540,6 +577,7 @@ static ssize_t reload_store(struct device *dev,
 	mutex_unlock(&microcode_mutex);
 
 put:
+	ucode_rollback = false;
 	cpus_read_unlock();
 
 	if (ret == 0)
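The option parser above scans the `ucode=` string comma-separated token by token. For reference, a standalone userspace sketch of the same scan; `parse()` and the sample string are hypothetical illustrations, not part of the patch:

    #include <stdio.h>
    #include <string.h>

    static void parse(char *str)
    {
    	while (*str) {
    		if (!strncmp(str, "rollback", 8))
    			printf("rollback enabled\n");
    		str += strcspn(str, ",");	/* advance to the next ',' or the NUL */
    		while (*str == ',')
    			str++;			/* swallow consecutive commas */
    	}
    }

    int main(void)
    {
    	char opts[] = "foo,rollback,bar";

    	parse(opts);	/* prints "rollback enabled" exactly once */
    	return 0;
    }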
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 1fcbd671f1dffc..11acd1a4ca9166 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -44,6 +44,7 @@ static struct microcode_intel *intel_ucode_patch;
 
 /* last level cache size per core */
 static int llc_size_per_core;
+extern bool ucode_rollback;
 
 /*
  * Returns 1 if update has been found, 0 otherwise.
@@ -80,7 +81,7 @@ static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev)
 {
 	struct microcode_header_intel *mc_hdr = mc;
 
-	if (mc_hdr->rev <= new_rev)
+	if (!ucode_rollback && mc_hdr->rev <= new_rev)
 		return 0;
 
 	return find_matching_signature(mc, csig, cpf);
@@ -120,7 +121,7 @@ static void save_microcode_patch(struct ucode_cpu_info *uci, void *data, unsigned int size)
 		if (find_matching_signature(data, sig, pf)) {
 			prev_found = true;
 
-			if (mc_hdr->rev <= mc_saved_hdr->rev)
+			if (!ucode_rollback && mc_hdr->rev <= mc_saved_hdr->rev)
 				continue;
 
 			p = memdup_patch(data, size);
@@ -649,7 +650,7 @@ static struct microcode_intel *find_patch(struct ucode_cpu_info *uci)
 
 		phdr = (struct microcode_header_intel *)iter->data;
 
-		if (phdr->rev <= uci->cpu_sig.rev)
+		if (!ucode_rollback && phdr->rev <= uci->cpu_sig.rev)
 			continue;
 
 		if (!find_matching_signature(phdr,
@@ -734,10 +735,11 @@ static enum ucode_state apply_microcode_intel(int cpu)
 	 * already.
 	 */
 	rev = intel_get_microcode_revision();
-	if (rev >= mc->hdr.rev) {
+	if (!ucode_rollback && rev >= mc->hdr.rev) {
 		ret = UCODE_OK;
 		goto out;
-	}
+	} else if (ucode_rollback)
+		ret = UCODE_OK;
 
 	/*
 	 * Writeback and invalidate caches before updating microcode to avoid
@@ -756,7 +758,7 @@ static enum ucode_state apply_microcode_intel(int cpu)
 		return UCODE_ERROR;
 	}
 
-	if (bsp && rev != prev_rev) {
+	if (bsp && ((rev != prev_rev) || ucode_rollback)) {
 		pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n",
 			rev,
 			mc->hdr.date & 0xffff,
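With both halves in place, rollback is a two-step opt-in: boot with `ucode=rollback`, then write `2` to the existing microcode reload node. A minimal userspace sketch, assuming this patch is applied (the value `2` only has meaning with it):

    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
    	/* the stock sysfs reload interface; "2" is the rollback value this patch adds */
    	int fd = open("/sys/devices/system/cpu/microcode/reload", O_WRONLY);

    	if (fd < 0)
    		return 1;
    	if (write(fd, "2", 1) != 1) {
    		close(fd);
    		return 1;
    	}
    	close(fd);
    	return 0;
    }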
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
index 9ff480e94511b8..2158b43f7cd9fd 100644
--- a/arch/x86/kernel/itmt.c
+++ b/arch/x86/kernel/itmt.c
@@ -25,6 +25,7 @@ static DEFINE_MUTEX(itmt_update_mutex);
 
 DEFINE_PER_CPU_READ_MOSTLY(int, sched_core_priority);
+DEFINE_PER_CPU_READ_MOSTLY(int, sched_power_ratio);
 
 /* Boolean to track if system has ITMT capabilities */
 static bool __read_mostly sched_itmt_capable;
@@ -169,37 +170,50 @@ void sched_clear_itmt_support(void)
 
 int arch_asym_cpu_priority(int cpu)
 {
-	return per_cpu(sched_core_priority, cpu);
+	int power_ratio = per_cpu(sched_power_ratio, cpu);
+
+	/* a power ratio of 0 (uninitialized) is treated as the default EPB of 6 */
+	if (power_ratio == 0)
+		power_ratio = 256 - 2 * 6;
+	return per_cpu(sched_core_priority, cpu) * power_ratio / 256;
 }
 
 /**
  * sched_set_itmt_core_prio() - Set CPU priority based on ITMT
- * @prio:	Priority of cpu core
- * @core_cpu:	The cpu number associated with the core
+ * @prio:	Priority of @cpu
+ * @cpu:	The CPU number
  *
  * The pstate driver will find out the max boost frequency
  * and call this function to set a priority proportional
- * to the max boost frequency. CPU with higher boost
+ * to the max boost frequency. CPUs with higher boost
  * frequency will receive higher priority.
  *
  * No need to rebuild sched domain after updating
  * the CPU priorities. The sched domains have no
  * dependency on CPU priorities.
  */
-void sched_set_itmt_core_prio(int prio, int core_cpu)
+void sched_set_itmt_core_prio(int prio, int cpu)
+{
+	per_cpu(sched_core_priority, cpu) = prio * 64 - cpu;
+}
+
+/**
+ * sched_set_itmt_power_ratio() - Set CPU priority based on ITMT
+ * @power_ratio:	The power scaling ratio [1..256] for the core
+ * @core_cpu:		The cpu number associated with the core
+ *
+ * Set a scaling to the cpu performance based on long term power
+ * settings (like EPB).
+ *
+ * Note this is for the policy not for the actual dynamic frequency;
+ * the frequency will increase itself as workloads run on a core.
+ */
+
+void sched_set_itmt_power_ratio(int power_ratio, int core_cpu)
 {
-	int cpu, i = 1;
+	int cpu;
 
 	for_each_cpu(cpu, topology_sibling_cpumask(core_cpu)) {
-		int smt_prio;
-
-		/*
-		 * Ensure that the siblings are moved to the end
-		 * of the priority chain and only used when
-		 * all other high priority cpus are out of capacity.
-		 */
-		smt_prio = prio * smp_num_siblings / (i * i);
-		per_cpu(sched_core_priority, cpu) = smt_prio;
-		i++;
+		per_cpu(sched_power_ratio, cpu) = power_ratio;
 	}
 }
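The net effect of arch_asym_cpu_priority() is a fixed-point scaling of the ITMT core priority by the EPB-derived ratio set in intel_epb.c above (EPB 0 gives ratio 256, the default EPB of 6 gives 244, EPB 15 gives 226). A standalone sketch with a hypothetical core priority of 4096:

    #include <stdio.h>

    /* mirrors the arithmetic of arch_asym_cpu_priority(); values are illustrative */
    static int scaled_prio(int core_prio, int epb)
    {
    	int power_ratio = 256 - epb * 2;	/* as stored by sched_set_itmt_power_ratio() */

    	return core_prio * power_ratio / 256;
    }

    int main(void)
    {
    	printf("%d\n", scaled_prio(4096, 0));	/* 4096: full performance bias */
    	printf("%d\n", scaled_prio(4096, 6));	/* 3904: default bias */
    	printf("%d\n", scaled_prio(4096, 15));	/* 3616: powersave bias */
    	return 0;
    }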
diff --git a/arch/x86/kernel/powerbump.c b/arch/x86/kernel/powerbump.c
new file mode 100644
index 00000000000000..c6b3762113bf8c
--- /dev/null
+++ b/arch/x86/kernel/powerbump.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2023 Intel Corporation
+ * Author: Arjan van de Ven
+ *
+ * Kernel power-bump infrastructure
+ */
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/powerbump.h>
+
+static DEFINE_PER_CPU(unsigned long, bump_timeout); /* jiffies at which the lease for the bump times out */
+
+
+
+/*
+ * A note about the use of the current cpu versus preemption.
+ *
+ * Most uses of in_power_bump() are inside local power management code,
+ * and are pinned to that cpu already.
+ *
+ * On the "set" side, interrupt level code is obviously also fully
+ * migration-race free.
+ *
+ * All other cases are exposed to a migration-race.
+ *
+ * The goal of powerbump is statistical rather than deterministic,
+ * e.g. on average the CPU that hits event X will go towards Y more
+ * often than not, and the impact of being wrong is a bit of extra
+ * power potentially for some short durations.
+ * Weighed against the costs in performance and complexity of dealing
+ * with the race, the race condition is acceptable.
+ *
+ * The second known race is where interrupt context might set a bump
+ * time in the middle of process context setting a different but
+ * smaller bump time, with the result that process context will win
+ * incorrectly, and the actual bump time will be less than expected,
+ * but still non-zero. Here also the cost of dealing with the race
+ * outweighs the limited impact.
+ */
+
+
+int in_power_bump(void)
+{
+	int cpu = raw_smp_processor_id();
+
+	if (time_before(jiffies, per_cpu(bump_timeout, cpu)))
+		return 1;
+
+	/* deal with wrap issues by keeping the stored bump value close to current */
+	per_cpu(bump_timeout, cpu) = jiffies;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(in_power_bump);
+
+void give_power_bump(int msecs)
+{
+	unsigned long nextjiffies;
+	int cpu;
+
+	/* we need to round up an extra jiffy */
+	nextjiffies = jiffies + msecs_to_jiffies(msecs) + 1;
+
+	cpu = raw_smp_processor_id();
+	if (time_before(per_cpu(bump_timeout, cpu), nextjiffies))
+		per_cpu(bump_timeout, cpu) = nextjiffies;
+}
+EXPORT_SYMBOL_GPL(give_power_bump);
+
+static __init int powerbump_init(void)
+{
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		per_cpu(bump_timeout, cpu) = jiffies;
+	}
+
+	return 0;
+}
+
+late_initcall(powerbump_init);
\ No newline at end of file
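The API is deliberately tiny: a caller leases a bump and the lease simply expires, there is no release call. A minimal sketch of a hypothetical caller (the helper and its name are illustrative only), leasing a bump just before sleeping on a short event so the idle-governor change further below keeps the CPU in a shallow C-state while the completion is pending:

    #include <linux/completion.h>
    #include <linux/powerbump.h>

    /* hypothetical helper: keep idle shallow for ~3ms around a short wait */
    static void wait_for_fast_event(struct completion *done)
    {
    	give_power_bump(BUMP_FOR_DISK);	/* lease expires on its own */
    	wait_for_completion(done);
    }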
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index cafacb2e58cceb..c2f80184fd3325 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1569,6 +1569,9 @@ unsigned long calibrate_delay_is_known(void)
 	if (!constant_tsc || !mask)
 		return 0;
 
+	if (cpu != 0)
+		return cpu_data(0).loops_per_jiffy;
+
 	sibling = cpumask_any_but(mask, cpu);
 	if (sibling < nr_cpu_ids)
 		return cpu_data(sibling).loops_per_jiffy;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 7b0d4ab894c8bc..1a14f52added00 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -799,9 +799,9 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
 	if (!printk_ratelimit())
 		return;
 
-	printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
+	printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx cpu %i",
 	       loglvl, tsk->comm, task_pid_nr(tsk), address,
-	       (void *)regs->ip, (void *)regs->sp, error_code);
+	       (void *)regs->ip, (void *)regs->sp, error_code, raw_smp_processor_id());
 
 	print_vma_addr(KERN_CONT " in ", regs->ip);
diff --git a/block/bio.c b/block/bio.c
index 57c2f327225bd1..08ba43fe3242b3 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -19,6 +19,7 @@
 #include <linux/sched/sysctl.h>
 #include <linux/blk-crypto.h>
 #include <linux/xarray.h>
+#include <linux/powerbump.h>
 
 #include <trace/events/block.h>
 #include "blk.h"
@@ -1294,6 +1295,7 @@ EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
 
 static void submit_bio_wait_endio(struct bio *bio)
 {
+	give_power_bump(BUMP_FOR_DISK);
 	complete(bio->bi_private);
 }
 
@@ -1319,6 +1321,8 @@ int submit_bio_wait(struct bio *bio)
 	bio->bi_opf |= REQ_SYNC;
 	submit_bio(bio);
 
+	give_power_bump(BUMP_FOR_DISK);
+
 	/* Prevent hang_check timer from firing at us during very long I/O */
 	hang_check = sysctl_hung_task_timeout_secs;
 	if (hang_check)
diff --git a/drivers/Makefile b/drivers/Makefile
index bdf1c66141c9bd..1e1a0832fb48a1 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -59,15 +59,8 @@ obj-y				+= char/
 # iommu/ comes before gpu as gpu are using iommu controllers
 obj-y				+= iommu/
 
-# gpu/ comes after char for AGP vs DRM startup and after iommu
-obj-y				+= gpu/
-
 obj-$(CONFIG_CONNECTOR)		+= connector/
 
-# i810fb and intelfb depend on char/agp/
-obj-$(CONFIG_FB_I810)           += video/fbdev/i810/
-obj-$(CONFIG_FB_INTEL)          += video/fbdev/intelfb/
-
 obj-$(CONFIG_PARPORT)		+= parport/
 obj-y				+= base/ block/ misc/ mfd/ nfc/
 obj-$(CONFIG_LIBNVDIMM)		+= nvdimm/
@@ -79,6 +72,14 @@ obj-y				+= macintosh/
 obj-y				+= scsi/
 obj-y				+= nvme/
 obj-$(CONFIG_ATA)		+= ata/
+
+# gpu/ comes after char for AGP vs DRM startup and after iommu
+obj-y				+= gpu/
+
+# i810fb and intelfb depend on char/agp/
+obj-$(CONFIG_FB_I810)           += video/fbdev/i810/
+obj-$(CONFIG_FB_INTEL)          += video/fbdev/intelfb/
+
 obj-$(CONFIG_TARGET_CORE)	+= target/
 obj-$(CONFIG_MTD)		+= mtd/
 obj-$(CONFIG_SPI)		+= spi/
diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c
index 954386a2b5002b..cef60f30278c53 100644
--- a/drivers/ata/libahci.c
+++ b/drivers/ata/libahci.c
@@ -34,14 +34,14 @@
 #include "libata.h"
 
 static int ahci_skip_host_reset;
-int ahci_ignore_sss;
+int ahci_ignore_sss = 1;
 EXPORT_SYMBOL_GPL(ahci_ignore_sss);
 
 module_param_named(skip_host_reset, ahci_skip_host_reset, int, 0444);
 MODULE_PARM_DESC(skip_host_reset, "skip global host reset (0=don't skip, 1=skip)");
 
 module_param_named(ignore_sss, ahci_ignore_sss, int, 0444);
-MODULE_PARM_DESC(ignore_sss, "Ignore staggered spinup flag (0=don't ignore, 1=ignore)");
+MODULE_PARM_DESC(ignore_sss, "Ignore staggered spinup flag (0=don't ignore, 1=ignore [default])");
 
 static int ahci_set_lpm(struct ata_link *link, enum ata_lpm_policy policy,
 			unsigned hints);
diff --git a/drivers/base/firmware_loader/main.c b/drivers/base/firmware_loader/main.c
index 7c3590fd97c28d..bb4880e10581f5 100644
--- a/drivers/base/firmware_loader/main.c
+++ b/drivers/base/firmware_loader/main.c
@@ -470,6 +470,8 @@ static int fw_decompress_xz(struct device *dev, struct fw_priv *fw_priv,
 static char fw_path_para[256];
 static const char * const fw_path[] = {
 	fw_path_para,
+	"/etc/firmware/" UTS_RELEASE,
+	"/etc/firmware",
 	"/lib/firmware/updates/" UTS_RELEASE,
 	"/lib/firmware/updates",
 	"/lib/firmware/" UTS_RELEASE,
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 6ff73c30769fae..46516074bfd032 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -365,6 +365,13 @@ static void intel_pstate_set_itmt_prio(int cpu)
 	 * update them at any time after it has been called.
 	 */
 	sched_set_itmt_core_prio(cppc_perf.highest_perf, cpu);
+	/*
+	 * On some systems with overclocking enabled, CPPC.highest_perf is
+	 * hardcoded to 0xff and cannot be used to enable ITMT. In that
+	 * case, fall back to MSR_HWP_CAPABILITIES bits [8:0] instead.
+	 */
+	if (cppc_perf.highest_perf == 0xff)
+		cppc_perf.highest_perf = HWP_HIGHEST_PERF(READ_ONCE(all_cpu_data[cpu]->hwp_cap_cached));
 
 	if (max_highest_perf <= min_highest_perf) {
 		if (cppc_perf.highest_perf > max_highest_perf)
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index c4922684f30583..5bc5de2c1c694a 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -18,6 +18,7 @@
 #include <linux/sched/loadavg.h>
 #include <linux/sched/stat.h>
 #include <linux/math64.h>
+#include <linux/powerbump.h>
 
 #define BUCKETS 12
 #define INTERVAL_SHIFT 3
@@ -279,6 +280,9 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 		data->needs_update = 0;
 	}
 
+	if (in_power_bump() && latency_req > BUMP_LATENCY_THRESHOLD)
+		latency_req = BUMP_LATENCY_THRESHOLD;
+
 	/* determine the expected residency time, round up */
 	delta = tick_nohz_get_sleep_length(&delta_tick);
 	if (unlikely(delta < 0)) {
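The clamp works because menu_select() rejects any idle state whose exit latency exceeds latency_req, so an active bump (threshold 2000ns) leaves only C1/C1E-class states eligible. A toy model of that gate, a hypothetical helper with latencies in nanoseconds as in the governor:

    /* toy model of the latency gate in menu_select(): returns the index of the
     * deepest state whose exit latency fits, or -1 if none does */
    static int deepest_allowed(const long long *exit_latency_ns, int n, long long latency_req)
    {
    	int i, idx = -1;

    	for (i = 0; i < n; i++) {
    		if (exit_latency_ns[i] > latency_req)
    			break;		/* states are ordered shallow to deep */
    		idx = i;
    	}
    	return idx;
    }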
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index cfeb24d40d3789..8ac40b7c7b4127 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -578,7 +578,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = {
 		.desc = "MWAIT 0x01",
 		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
 		.exit_latency = 10,
-		.target_residency = 20,
+		.target_residency = 120,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -586,7 +586,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = {
 		.desc = "MWAIT 0x10",
 		.flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 33,
-		.target_residency = 100,
+		.target_residency = 900,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -594,7 +594,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = {
 		.desc = "MWAIT 0x20",
 		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 133,
-		.target_residency = 400,
+		.target_residency = 1000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -602,7 +602,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = {
 		.desc = "MWAIT 0x32",
 		.flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 166,
-		.target_residency = 500,
+		.target_residency = 1500,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -610,7 +610,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = {
 		.desc = "MWAIT 0x40",
 		.flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 300,
-		.target_residency = 900,
+		.target_residency = 2000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -618,7 +618,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = {
 		.desc = "MWAIT 0x50",
 		.flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 600,
-		.target_residency = 1800,
+		.target_residency = 5000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -626,7 +626,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = {
 		.desc = "MWAIT 0x60",
 		.flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 2600,
-		.target_residency = 7700,
+		.target_residency = 9000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -646,7 +646,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = {
 		.desc = "MWAIT 0x01",
 		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
 		.exit_latency = 10,
-		.target_residency = 20,
+		.target_residency = 120,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -654,7 +654,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = {
 		.desc = "MWAIT 0x10",
 		.flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 40,
-		.target_residency = 100,
+		.target_residency = 1000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -662,7 +662,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = {
 		.desc = "MWAIT 0x20",
 		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 133,
-		.target_residency = 400,
+		.target_residency = 1000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -670,7 +670,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = {
 		.desc = "MWAIT 0x32",
 		.flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 166,
-		.target_residency = 500,
+		.target_residency = 2000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -678,7 +678,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = {
 		.desc = "MWAIT 0x40",
 		.flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 300,
-		.target_residency = 900,
+		.target_residency = 4000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -686,7 +686,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = {
 		.desc = "MWAIT 0x50",
 		.flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 600,
-		.target_residency = 1800,
+		.target_residency = 7000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -694,7 +694,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = {
 		.desc = "MWAIT 0x60",
 		.flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 2600,
-		.target_residency = 7700,
+		.target_residency = 9000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -715,7 +715,7 @@ static struct cpuidle_state skl_cstates[] __initdata = {
 		.desc = "MWAIT 0x01",
 		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
 		.exit_latency = 10,
-		.target_residency = 20,
+		.target_residency = 120,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -723,7 +723,7 @@ static struct cpuidle_state skl_cstates[] __initdata = {
 		.desc = "MWAIT 0x10",
 		.flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 70,
-		.target_residency = 100,
+		.target_residency = 1000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -731,7 +731,7 @@ static struct cpuidle_state skl_cstates[] __initdata = {
 		.desc = "MWAIT 0x20",
 		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
 		.exit_latency = 85,
-		.target_residency = 200,
+		.target_residency = 600,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -739,7 +739,7 @@ static struct cpuidle_state skl_cstates[] __initdata = {
 		.desc = "MWAIT 0x33",
 		.flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
 		.exit_latency = 124,
-		.target_residency = 800,
+		.target_residency = 3000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -747,7 +747,7 @@ static struct cpuidle_state skl_cstates[] __initdata = {
 		.desc = "MWAIT 0x40",
 		.flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
 		.exit_latency = 200,
-		.target_residency = 800,
+		.target_residency = 3200,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -755,7 +755,7 @@ static struct cpuidle_state skl_cstates[] __initdata = {
 		.desc = "MWAIT 0x50",
 		.flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
 		.exit_latency = 480,
-		.target_residency = 5000,
+		.target_residency = 9000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -763,7 +763,7 @@ static struct cpuidle_state skl_cstates[] __initdata = {
 		.desc = "MWAIT 0x60",
 		.flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
 		.exit_latency = 890,
-		.target_residency = 5000,
+		.target_residency = 9000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -784,7 +784,7 @@ static struct cpuidle_state skx_cstates[] __initdata = {
 		.desc = "MWAIT 0x01",
 		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
 		.exit_latency = 10,
-		.target_residency = 20,
+		.target_residency = 300,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -813,7 +813,7 @@ static struct cpuidle_state icx_cstates[] __initdata = {
 		.desc = "MWAIT 0x01",
 		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
 		.exit_latency = 4,
-		.target_residency = 4,
+		.target_residency = 40,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -821,7 +821,7 @@ static struct cpuidle_state icx_cstates[] __initdata = {
 		.desc = "MWAIT 0x20",
 		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 170,
-		.target_residency = 600,
+		.target_residency = 900,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -942,7 +942,7 @@ static struct cpuidle_state adl_n_cstates[] __initdata = {
 		.desc = "MWAIT 0x01",
 		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
 		.exit_latency = 2,
-		.target_residency = 4,
+		.target_residency = 40,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
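These residency bumps bias state selection rather than disable states: a menu-style governor picks the deepest state whose target residency still fits the predicted idle duration, so raising the values shifts selection toward shallower states. A toy model with a hypothetical helper, times in microseconds as in these tables:

    /* pick the deepest state whose target_residency fits the prediction;
     * states are ordered shallow to deep, index 0 always qualifies */
    static int pick_state(const unsigned int *target_residency_us, int n, unsigned int predicted_us)
    {
    	int i, idx = 0;

    	for (i = 1; i < n; i++) {
    		if (target_residency_us[i] <= predicted_us)
    			idx = i;
    	}
    	return idx;
    }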
diff --git a/drivers/input/serio/i8042.c b/drivers/input/serio/i8042.c
index 6dac7c1853a541..fab04cd8a7a095 100644
--- a/drivers/input/serio/i8042.c
+++ b/drivers/input/serio/i8042.c
@@ -621,7 +621,7 @@ static int i8042_enable_kbd_port(void)
 	if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) {
 		i8042_ctr &= ~I8042_CTR_KBDINT;
 		i8042_ctr |= I8042_CTR_KBDDIS;
-		pr_err("Failed to enable KBD port\n");
+		pr_info("Failed to enable KBD port\n");
 		return -EIO;
 	}
 
@@ -640,7 +640,7 @@ static int i8042_enable_aux_port(void)
 	if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) {
 		i8042_ctr &= ~I8042_CTR_AUXINT;
 		i8042_ctr |= I8042_CTR_AUXDIS;
-		pr_err("Failed to enable AUX port\n");
+		pr_info("Failed to enable AUX port\n");
 		return -EIO;
 	}
 
@@ -732,7 +732,7 @@ static int i8042_check_mux(void)
 	i8042_ctr &= ~I8042_CTR_AUXINT;
 
 	if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) {
-		pr_err("Failed to disable AUX port, can't use MUX\n");
+		pr_info("Failed to disable AUX port, can't use MUX\n");
 		return -EIO;
 	}
 
@@ -955,7 +955,7 @@ static int i8042_controller_selftest(void)
 
 	do {
 		if (i8042_command(&param, I8042_CMD_CTL_TEST)) {
-			pr_err("i8042 controller selftest timeout\n");
+			pr_info("i8042 controller selftest timeout\n");
 			return -ENODEV;
 		}
 
@@ -977,7 +977,7 @@ static int i8042_controller_selftest(void)
 	pr_info("giving up on controller selftest, continuing anyway...\n");
 	return 0;
 #else
-	pr_err("i8042 controller selftest failed\n");
+	pr_info("i8042 controller selftest failed\n");
 	return -EIO;
 #endif
 }
diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c
index aa0fc00faecbe8..b93a4d71be29e0 100644
--- a/drivers/net/dummy.c
+++ b/drivers/net/dummy.c
@@ -43,7 +43,7 @@
 
 #define DRV_NAME	"dummy"
 
-static int numdummies = 1;
+static int numdummies = 0;
 
 /* fake multicast ability */
 static void set_multicast_list(struct net_device *dev)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 7e3893d06babdf..8984f328a4f6fe 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -58,7 +58,7 @@ static u8 nvme_max_retries = 5;
 module_param_named(max_retries, nvme_max_retries, byte, 0644);
 MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
 
-static unsigned long default_ps_max_latency_us = 100000;
+static unsigned long default_ps_max_latency_us = 200;
 module_param(default_ps_max_latency_us, ulong, 0644);
 MODULE_PARM_DESC(default_ps_max_latency_us,
 		 "max power saving latency for new devices; use PM QOS to change per device");
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 2127aba3550b5d..cf5c72a88c2bae 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -62,7 +62,7 @@ struct pci_pme_device {
 	struct pci_dev *dev;
 };
 
-#define PME_TIMEOUT 1000 /* How long between PME checks */
+#define PME_TIMEOUT 4000 /* How long between PME checks */
 
 static void pci_dev_d3_sleep(struct pci_dev *dev)
 {
diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c
index 26d00b1853b421..3e239d6548b523 100644
--- a/drivers/powercap/intel_rapl_common.c
+++ b/drivers/powercap/intel_rapl_common.c
@@ -1518,7 +1518,7 @@ static int __init rapl_init(void)
 
 	id = x86_match_cpu(rapl_ids);
 	if (!id) {
-		pr_err("driver does not support CPU family %d model %d\n",
+		pr_info("driver does not support CPU family %d model %d\n",
 			boot_cpu_data.x86, boot_cpu_data.x86_model);
 
 		return -ENODEV;
diff --git a/drivers/thermal/intel/intel_powerclamp.c b/drivers/thermal/intel/intel_powerclamp.c
index b80e25ec12615f..187b4ee6e9f5df 100644
--- a/drivers/thermal/intel/intel_powerclamp.c
+++ b/drivers/thermal/intel/intel_powerclamp.c
@@ -627,6 +627,11 @@ static const struct thermal_cooling_device_ops powerclamp_cooling_ops = {
 	.set_cur_state = powerclamp_set_cur_state,
 };
 
+static const struct x86_cpu_id amd_cpu[] = {
+	{ X86_VENDOR_AMD },
+	{},
+};
+
 static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = {
 	X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_MWAIT, NULL),
 	{}
@@ -636,6 +641,11 @@ MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);
 
 static int __init powerclamp_probe(void)
 {
+	if (x86_match_cpu(amd_cpu)) {
+		pr_info("Intel PowerClamp does not support AMD CPUs\n");
+		return -ENODEV;
+	}
+
 	if (!x86_match_cpu(intel_powerclamp_ids)) {
 		pr_err("CPU does not support MWAIT\n");
 		return -ENODEV;
diff --git a/fs/buffer.c b/fs/buffer.c
index d9c6d1fbb6dde5..139a1b18b24014 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -48,6 +48,7 @@
 #include <linux/sched/mm.h>
 #include <trace/events/block.h>
 #include <linux/fscrypt.h>
+#include <linux/powerbump.h>
 
 #include "internal.h"
 
@@ -119,6 +120,7 @@ EXPORT_SYMBOL(buffer_check_dirty_writeback);
  */
 void __wait_on_buffer(struct buffer_head * bh)
 {
+	give_power_bump(BUMP_FOR_DISK);
 	wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(__wait_on_buffer);
@@ -156,6 +158,7 @@ static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
  */
 void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
 {
+	give_power_bump(BUMP_FOR_DISK);
 	__end_buffer_read_notouch(bh, uptodate);
 	put_bh(bh);
 }
@@ -163,6 +166,7 @@ EXPORT_SYMBOL(end_buffer_read_sync);
 
 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 {
+	give_power_bump(BUMP_FOR_DISK);
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 6a404ac1c178f0..f451099d9343a3 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -27,6 +27,7 @@
 #include <linux/bug.h>
 #include <linux/module.h>
 #include <linux/sched/mm.h>
+#include <linux/powerbump.h>
 
 #include <trace/events/jbd2.h>
 
@@ -1104,6 +1105,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
 	if (buffer_shadow(bh)) {
 		JBUFFER_TRACE(jh, "on shadow: sleep");
 		spin_unlock(&jh->b_state_lock);
+		give_power_bump(BUMP_FOR_DISK);
 		wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE);
 		goto repeat;
 	}
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 0b7242370b5673..16b8fc483b3d35 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -45,7 +45,7 @@
 /*
  * The default maximum commit age, in seconds.
  */
-#define JBD2_DEFAULT_MAX_COMMIT_AGE 5
+#define JBD2_DEFAULT_MAX_COMMIT_AGE 30
 
 #ifdef CONFIG_JBD2_DEBUG
 /*
diff --git a/include/linux/powerbump.h b/include/linux/powerbump.h
new file mode 100644
index 00000000000000..a17ed37744668e
--- /dev/null
+++ b/include/linux/powerbump.h
@@ -0,0 +1,16 @@
+#pragma once
+
+/* in nsecs */
+#define BUMP_LATENCY_THRESHOLD	2000
+
+
+/* bump time constants, in msec */
+#define BUMP_FOR_DISK		3
+#define BUMP_FOR_NETWORK	3
+#define BUMP_FOR_FUTEX		3
+
+
+
+/* API prototypes */
+extern void give_power_bump(int msecs);
+extern int in_power_bump(void);
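For a sense of scale of the BUMP_FOR_* constants: a standalone sketch of the jiffies math in give_power_bump(), assuming HZ=250 and using a round-up approximation of the kernel's msecs_to_jiffies() (the real conversion is HZ-dependent):

    #include <stdio.h>

    #define HZ 250	/* assumed tick rate; one jiffy = 4ms */

    static unsigned long msecs_to_jiffies_approx(unsigned int ms)
    {
    	return (ms * HZ + 999) / 1000;	/* round up, like the kernel helper */
    }

    int main(void)
    {
    	/* BUMP_FOR_DISK is 3ms: one 4ms tick, plus the +1 safety tick */
    	printf("%lu\n", msecs_to_jiffies_approx(3) + 1);	/* prints 2, i.e. a 4..8ms lease */
    	return 0;
    }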
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 7f5a51aae0a73d..8e57dcc0469811 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -165,6 +165,7 @@ static inline bool wq_has_sleeper(struct wait_queue_head *wq_head)
 
 extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
 extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
+extern void add_wait_queue_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
 extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
 extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
 
@@ -1192,6 +1193,7 @@ do {										\
  */
 void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
 bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
+void prepare_to_wait_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
 long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
 void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
 long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout);
diff --git a/include/uapi/linux/if_bonding.h b/include/uapi/linux/if_bonding.h
index d174914a837dbf..bf8e2af101a3c8 100644
--- a/include/uapi/linux/if_bonding.h
+++ b/include/uapi/linux/if_bonding.h
@@ -82,7 +82,7 @@
 #define BOND_STATE_ACTIVE	0   /* link is active */
 #define BOND_STATE_BACKUP	1   /* link is backup */
 
-#define BOND_DEFAULT_MAX_BONDS  1   /* Default maximum number of devices to support */
+#define BOND_DEFAULT_MAX_BONDS  0   /* Default maximum number of devices to support */
 
 #define BOND_DEFAULT_TX_QUEUES 16   /* Default number of tx queues per device */
diff --git a/init/do_mounts.c b/init/do_mounts.c
index 811e94daf0a84a..06fef7f97c028e 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -283,8 +283,18 @@ dev_t name_to_dev_t(const char *name)
 	if (strcmp(name, "/dev/ram") == 0)
 		return Root_RAM0;
 #ifdef CONFIG_BLOCK
-	if (strncmp(name, "PARTUUID=", 9) == 0)
-		return devt_from_partuuid(name + 9);
+	if (strncmp(name, "PARTUUID=", 9) == 0) {
+		dev_t res;
+		int needtowait = 40 << 1;
+		res = devt_from_partuuid(name + 9);
+		while (!res && needtowait) {
+			/* waiting 0.5 sec */
+			msleep(500);
+			res = devt_from_partuuid(name + 9);
+			needtowait--;
+		}
+		return res;
+	}
 	if (strncmp(name, "PARTLABEL=", 10) == 0)
 		return devt_from_partlabel(name + 10);
 	if (strncmp(name, "/dev/", 5) == 0)
@@ -612,7 +622,9 @@ void __init prepare_namespace(void)
 	 * For example, it is not atypical to wait 5 seconds here
 	 * for the touchpad of a laptop to initialize.
 	 */
+	async_synchronize_full();
 	wait_for_device_probe();
+	async_synchronize_full();
 
 	md_run_setup();
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
index ba01b94082033b..e4fc09a98cbcd6 100644
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -3,6 +3,7 @@
 #include <linux/sched/task.h>
 #include <linux/sched/signal.h>
 #include <linux/freezer.h>
+#include <linux/powerbump.h>
 
 #include "futex.h"
 
@@ -336,6 +337,7 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
 	 */
 	set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
 	futex_queue(q, hb);
+	give_power_bump(BUMP_FOR_FUTEX);
 
 	/* Arm the timer */
 	if (timeout)
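The PARTUUID change is a plain poll-with-timeout; the same shape pulled out as a hypothetical helper (not part of the patch) makes the 40-second bound explicit:

    #include <linux/delay.h>

    /* poll a probe function every 500ms for up to 80 tries (40 seconds) */
    static dev_t poll_for_dev(dev_t (*probe)(const char *), const char *arg)
    {
    	int tries = 40 << 1;
    	dev_t res = probe(arg);

    	while (!res && tries--) {
    		msleep(500);
    		res = probe(arg);
    	}
    	return res;
    }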
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 44873594de0316..fe62d59f2bdcf4 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -755,6 +755,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem)
 	struct task_struct *new, *owner;
 	unsigned long flags, new_flags;
 	enum owner_state state;
+	int i = 0;
 
 	lockdep_assert_preemption_disabled();
 
@@ -791,7 +792,8 @@ rwsem_spin_on_owner(struct rw_semaphore *sem)
 			break;
 		}
 
-		cpu_relax();
+		if (i++ > 1000)
+			cpu_relax();
 	}
 
 	return state;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e4a0b8bd941c78..155862be793155 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8900,12 +8900,10 @@ static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds,
 				    struct sched_group *sg)
 {
 #ifdef CONFIG_SCHED_SMT
-	bool local_is_smt, sg_is_smt;
+	bool local_is_smt;
 	int sg_busy_cpus;
 
 	local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY;
-	sg_is_smt = sg->flags & SD_SHARE_CPUCAPACITY;
-
 	sg_busy_cpus = sgs->group_weight - sgs->idle_cpus;
 
 	if (!local_is_smt) {
@@ -8926,25 +8924,16 @@ static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds,
 		return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
 	}
 
-	/* @dst_cpu has SMT siblings. */
-
-	if (sg_is_smt) {
-		int local_busy_cpus = sds->local->group_weight -
-				      sds->local_stat.idle_cpus;
-		int busy_cpus_delta = sg_busy_cpus - local_busy_cpus;
-
-		if (busy_cpus_delta == 1)
-			return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
-
-		return false;
-	}
-
 	/*
-	 * @sg does not have SMT siblings. Ensure that @sds::local does not end
-	 * up with more than one busy SMT sibling and only pull tasks if there
-	 * are not busy CPUs (i.e., no CPU has running tasks).
+	 * @dst_cpu has SMT siblings. When both @dst_cpu and the busiest core
+	 * have one or more busy siblings, moving tasks between them results
+	 * in the same throughput. Only if all the siblings of @dst_cpu are
+	 * idle can throughput increase.
+	 *
+	 * If the difference in the number of busy CPUs is two or more, let
+	 * find_busiest_group() take care of it.
 	 */
-	if (!sds->local_stat.sum_nr_running)
+	if (sg_busy_cpus == 1 && !sds->local_stat.sum_nr_running)
 		return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
 
 	return false;
@@ -10162,9 +10151,14 @@ asym_active_balance(struct lb_env *env)
 	 * ASYM_PACKING needs to force migrate tasks from busy but
 	 * lower priority CPUs in order to pack all tasks in the
 	 * highest priority CPUs.
+	 *
+	 * If the busy CPU has higher priority but its other SMT
+	 * siblings are also busy, a lower-priority CPU in a
+	 * separate core can help.
 	 */
 	return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
-	       sched_asym_prefer(env->dst_cpu, env->src_cpu);
+	       (sched_asym_prefer(env->dst_cpu, env->src_cpu) ||
+		!(env->sd->flags & SD_SHARE_CPUCAPACITY));
 }
 
 static inline bool
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 9860bb9a847cf0..c7f045873a5fbd 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -47,6 +47,17 @@ void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
 }
 EXPORT_SYMBOL_GPL(add_wait_queue_priority);
 
+void add_wait_queue_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
+{
+	unsigned long flags;
+
+	wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
+	spin_lock_irqsave(&wq_head->lock, flags);
+	__add_wait_queue(wq_head, wq_entry);
+	spin_unlock_irqrestore(&wq_head->lock, flags);
+}
+EXPORT_SYMBOL(add_wait_queue_exclusive_lifo);
+
 void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
 {
 	unsigned long flags;
@@ -289,6 +300,19 @@ prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state)
 }
 EXPORT_SYMBOL(prepare_to_wait_exclusive);
 
+void prepare_to_wait_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state)
+{
+	unsigned long flags;
+
+	wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
+	spin_lock_irqsave(&wq_head->lock, flags);
+	if (list_empty(&wq_entry->entry))
+		__add_wait_queue(wq_head, wq_entry);
+	set_current_state(state);
+	spin_unlock_irqrestore(&wq_head->lock, flags);
+}
+EXPORT_SYMBOL(prepare_to_wait_exclusive_lifo);
+
 void init_wait_entry(struct wait_queue_entry *wq_entry, int flags)
 {
 	wq_entry->flags = flags;
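These LIFO variants differ from the stock exclusive helpers only in queueing at the list head (__add_wait_queue) instead of the tail, and wake-up stops at the first exclusive waiter it meets, so the most recently queued (cache-warm) accept()er runs first. A standalone toy model of that wake walk, not kernel code:

    #include <stdbool.h>
    #include <stdio.h>

    struct waiter { const char *name; bool exclusive; };

    /* toy model of the exclusive-wakeup walk: start at the head,
     * stop after the first exclusive waiter */
    static void wake_up_toy(const struct waiter *list, int n)
    {
    	for (int i = 0; i < n; i++) {
    		printf("waking %s\n", list[i].name);
    		if (list[i].exclusive)
    			break;	/* one exclusive wakeup per event */
    	}
    }

    int main(void)
    {
    	/* with LIFO queueing, slot 0 is the most recently queued task */
    	struct waiter lifo[] = { { "newest", true }, { "older", true }, { "oldest", true } };

    	wake_up_toy(lifo, 3);	/* wakes only "newest" */
    	return 0;
    }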
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 8e61f21e7e33e2..be1439d38f260c 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -41,7 +41,7 @@ unsigned long __read_mostly watchdog_enabled;
 int __read_mostly watchdog_user_enabled = 1;
 int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT;
 int __read_mostly soft_watchdog_user_enabled = 1;
-int __read_mostly watchdog_thresh = 10;
+int __read_mostly watchdog_thresh = 40;
 static int __read_mostly nmi_watchdog_available;
 
 struct cpumask watchdog_cpumask __read_mostly;
diff --git a/mm/ksm.c b/mm/ksm.c
index c19fcca9bc03dc..7009cf42be76b1 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2427,9 +2427,14 @@ static int ksm_scan_thread(void *nothing)
 
 		if (ksmd_should_run()) {
 			sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs);
-			wait_event_interruptible_timeout(ksm_iter_wait,
-				sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
-				msecs_to_jiffies(sleep_ms));
+			if (sleep_ms >= 1000)
+				wait_event_interruptible_timeout(ksm_iter_wait,
+					sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
+					round_jiffies_relative(msecs_to_jiffies(sleep_ms)));
+			else
+				wait_event_interruptible_timeout(ksm_iter_wait,
+					sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
+					msecs_to_jiffies(sleep_ms));
 		} else {
 			wait_event_freezable(ksm_thread_wait,
 				ksmd_should_run() || kthread_should_stop());
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 266a1ab054341c..25a5289cb334de 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -611,7 +611,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
 	cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
 
 	x = __this_cpu_add_return(stats_updates, abs(val));
-	if (x > MEMCG_CHARGE_BATCH) {
+	if (x > MEMCG_CHARGE_BATCH * 128) {
 		/*
 		 * If stats_flush_threshold exceeds the threshold
 		 * (>num_online_cpus()), cgroup stats update will be triggered
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6e60657875d328..688ebfafd58ecf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7069,11 +7069,11 @@ static int zone_batchsize(struct zone *zone)
 
 	/*
 	 * The number of pages to batch allocate is either ~0.1%
-	 * of the zone or 1MB, whichever is smaller. The batch
+	 * of the zone or 4MB, whichever is smaller. The batch
 	 * size is striking a balance between allocation latency
 	 * and zone lock contention.
 	 */
-	batch = min(zone_managed_pages(zone) >> 10, SZ_1M / PAGE_SIZE);
+	batch = min(zone_managed_pages(zone) >> 10, 4 * SZ_1M / PAGE_SIZE);
 	batch /= 4;		/* We effectively *= 4 below */
 	if (batch < 1)
 		batch = 1;
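A worked example of the new batch sizing with a hypothetical zone size (the kernel uses min(); it is written out as a cap here for clarity):

    #include <stdio.h>

    #define PAGE_SIZE 4096UL
    #define SZ_1M 0x100000UL

    int main(void)
    {
    	unsigned long managed = 4UL << 20;	/* 4M pages, i.e. a 16GB zone */
    	unsigned long batch;

    	batch = managed >> 10;			/* ~0.1% of the zone: 4096 pages */
    	if (batch > 4 * SZ_1M / PAGE_SIZE)	/* cap at 4MB worth of pages */
    		batch = 4 * SZ_1M / PAGE_SIZE;	/* 1024 pages */
    	batch /= 4;				/* effectively *= 4 later */
    	printf("batch = %lu pages\n", batch);	/* prints 256 */
    	return 0;
    }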
diff --git a/net/core/dev.c b/net/core/dev.c
index 3be256051e99b9..bed00f3a22694d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -150,6 +150,7 @@
 #include <linux/pm_runtime.h>
 #include <linux/prandom.h>
 #include <linux/once_lite.h>
+#include <linux/powerbump.h>
 
 #include "dev.h"
 #include "net-sysfs.h"
@@ -5744,6 +5745,7 @@ int netif_receive_skb(struct sk_buff *skb)
 	int ret;
 
 	trace_netif_receive_skb_entry(skb);
+	give_power_bump(BUMP_FOR_NETWORK);
 
 	ret = netif_receive_skb_internal(skb);
 	trace_netif_receive_skb_exit(ret);
@@ -5768,6 +5770,7 @@ void netif_receive_skb_list(struct list_head *head)
 
 	if (list_empty(head))
 		return;
+	give_power_bump(BUMP_FOR_NETWORK);
 	if (trace_netif_receive_skb_list_entry_enabled()) {
 		list_for_each_entry(skb, head, list)
 			trace_netif_receive_skb_list_entry(skb);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 4e84ed21d16fed..c6e54eca42d983 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -588,7 +588,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
 	 * having to remove and re-insert us on the wait queue.
 	 */
 	for (;;) {
-		prepare_to_wait_exclusive(sk_sleep(sk), &wait,
+		prepare_to_wait_exclusive_lifo(sk_sleep(sk), &wait,
 					  TASK_INTERRUPTIBLE);
 		release_sock(sk);
 		if (reqsk_queue_empty(&icsk->icsk_accept_queue))
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4f2205756cfeee..2d20275dd6504d 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4810,8 +4810,8 @@ void __init tcp_init(void)
 	tcp_init_mem();
 	/* Set per-socket limits to no more than 1/128 the pressure threshold */
 	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
-	max_wshare = min(4UL*1024*1024, limit);
-	max_rshare = min(6UL*1024*1024, limit);
+	max_wshare = min(16UL*1024*1024, limit);
+	max_rshare = min(16UL*1024*1024, limit);
 
 	init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE;
 	init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;