diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 458c891a827365..d86eb1ebf59f5b 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -175,6 +175,7 @@ extern unsigned int __read_mostly sysctl_sched_itmt_enabled;
 
 /* Interface to set priority of a cpu */
 void sched_set_itmt_core_prio(int prio, int core_cpu);
+void sched_set_itmt_power_ratio(int power_ratio, int core_cpu);
 
 /* Interface to notify scheduler that system supports ITMT */
 int sched_set_itmt_support(void);
diff --git a/arch/x86/include/asm/vdso/processor.h b/arch/x86/include/asm/vdso/processor.h
index 57b1a7034c640a..e2c45674f98975 100644
--- a/arch/x86/include/asm/vdso/processor.h
+++ b/arch/x86/include/asm/vdso/processor.h
@@ -10,7 +10,7 @@
 /* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
 static __always_inline void rep_nop(void)
 {
-	asm volatile("rep; nop" ::: "memory");
+	asm volatile("lfence" ::: "memory");
 }
 
 static __always_inline void cpu_relax(void)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index f901658d9f7c08..7d931995efdc31 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -143,6 +143,8 @@ obj-$(CONFIG_AMD_MEM_ENCRYPT)		+= sev.o
 
 obj-$(CONFIG_CFI_CLANG)			+= cfi.o
 
+obj-y					+= powerbump.o
+
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c
index fbaf12e43f4160..c8c2d6f1a8aca2 100644
--- a/arch/x86/kernel/cpu/intel_epb.c
+++ b/arch/x86/kernel/cpu/intel_epb.c
@@ -166,6 +166,10 @@ static ssize_t energy_perf_bias_store(struct device *dev,
 	if (ret < 0)
 		return ret;
 
+	/* update the ITMT scheduler logic to use the power policy data */
+	/* scale the val up by 2 so the range is 226 - 256 */
+	sched_set_itmt_power_ratio(256 - val * 2, cpu);
+
 	return count;
 }
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 6a41cee242f6d7..18dc2dd80c890b 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -44,6 +44,8 @@ static struct microcode_ops	*microcode_ops;
 
 static bool dis_ucode_ldr = true;
 
+bool ucode_rollback = false;
+int enable_rollback = 0;
 bool initrd_gone;
 
@@ -80,6 +82,26 @@ static u32 final_levels[] = {
 	0, /* T-101 terminator */
 };
 
+static int __init ucode_setup(char *str)
+{
+	if (!str)
+		return -EINVAL;
+
+	while (*str) {
+		if (!strncmp(str, "rollback", 8)) {
+			enable_rollback = 1;
+			pr_info("Microcode Rollback Enabled\n");
+		}
+		str += strcspn(str, ",");
+		while (*str == ',')
+			str++;
+	}
+	return 0;
+}
+
+__setup("ucode=", ucode_setup);
+
+
 /*
  * Check the current patch level on this CPU.
  *
@@ -513,6 +535,7 @@ static ssize_t reload_store(struct device *dev,
 			    struct device_attribute *attr,
 			    const char *buf, size_t size)
 {
+	struct cpuinfo_x86 *c = &boot_cpu_data;
 	enum ucode_state tmp_ret = UCODE_OK;
 	int bsp = boot_cpu_data.cpu_index;
 	unsigned long val;
@@ -522,7 +545,7 @@ static ssize_t reload_store(struct device *dev,
 	if (ret)
 		return ret;
 
-	if (val != 1)
+	if (!val || val > 2)
 		return size;
 
 	cpus_read_lock();
@@ -530,6 +553,20 @@ static ssize_t reload_store(struct device *dev,
 	ret = check_online_cpus();
 	if (ret)
 		goto put;
+	/*
+	 * Check if the vendor is Intel to permit reloading
+	 * microcode even if the revision is unchanged.
+	 * This is typically used during microcode development,
+	 * where bumping the revision for every build is a pain.
+	 */
+	if ((val == 2) && ((c->x86_vendor != X86_VENDOR_INTEL) ||
+			   !enable_rollback))
+		goto put;
+	else if (val == 2) {
+		mutex_lock(&microcode_mutex);
+		ucode_rollback = true;
+		mutex_unlock(&microcode_mutex);
+	}
 
 	tmp_ret = microcode_ops->request_microcode_fw(bsp, &microcode_pdev->dev, true);
 	if (tmp_ret != UCODE_NEW)
@@ -540,6 +577,7 @@ static ssize_t reload_store(struct device *dev,
 	mutex_unlock(&microcode_mutex);
 
 put:
+	ucode_rollback = false;
 	cpus_read_unlock();
 
 	if (ret == 0)
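The option parser above scans the `ucode=` string comma-separated token by token. For reference, a standalone userspace sketch of the same scan; `parse()` and the sample string are hypothetical illustrations, not part of the patch:

    #include <stdio.h>
    #include <string.h>

    static void parse(char *str)
    {
    	while (*str) {
    		if (!strncmp(str, "rollback", 8))
    			printf("rollback enabled\n");
    		str += strcspn(str, ",");	/* advance to the next ',' or the NUL */
    		while (*str == ',')
    			str++;			/* swallow consecutive commas */
    	}
    }

    int main(void)
    {
    	char opts[] = "foo,rollback,bar";

    	parse(opts);	/* prints "rollback enabled" exactly once */
    	return 0;
    }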
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 1fcbd671f1dffc..11acd1a4ca9166 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -44,6 +44,7 @@ static struct microcode_intel *intel_ucode_patch;
 
 /* last level cache size per core */
 static int llc_size_per_core;
+extern bool ucode_rollback;
 
 /*
  * Returns 1 if update has been found, 0 otherwise.
@@ -80,7 +81,7 @@ static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev)
 {
 	struct microcode_header_intel *mc_hdr = mc;
 
-	if (mc_hdr->rev <= new_rev)
+	if (!ucode_rollback && mc_hdr->rev <= new_rev)
 		return 0;
 
 	return find_matching_signature(mc, csig, cpf);
@@ -120,7 +121,7 @@ static void save_microcode_patch(struct ucode_cpu_info *uci, void *data, unsigned int size)
 		if (find_matching_signature(data, sig, pf)) {
 			prev_found = true;
 
-			if (mc_hdr->rev <= mc_saved_hdr->rev)
+			if (!ucode_rollback && mc_hdr->rev <= mc_saved_hdr->rev)
 				continue;
 
 			p = memdup_patch(data, size);
@@ -649,7 +650,7 @@ static struct microcode_intel *find_patch(struct ucode_cpu_info *uci)
 
 		phdr = (struct microcode_header_intel *)iter->data;
 
-		if (phdr->rev <= uci->cpu_sig.rev)
+		if (!ucode_rollback && phdr->rev <= uci->cpu_sig.rev)
 			continue;
 
 		if (!find_matching_signature(phdr,
@@ -734,10 +735,11 @@ static enum ucode_state apply_microcode_intel(int cpu)
 	 * already.
 	 */
 	rev = intel_get_microcode_revision();
-	if (rev >= mc->hdr.rev) {
+	if (!ucode_rollback && rev >= mc->hdr.rev) {
 		ret = UCODE_OK;
 		goto out;
-	}
+	} else if (ucode_rollback)
+		ret = UCODE_OK;
 
 	/*
 	 * Writeback and invalidate caches before updating microcode to avoid
@@ -756,7 +758,7 @@ static enum ucode_state apply_microcode_intel(int cpu)
 		return UCODE_ERROR;
 	}
 
-	if (bsp && rev != prev_rev) {
+	if (bsp && ((rev != prev_rev) || ucode_rollback)) {
 		pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n",
 			rev,
 			mc->hdr.date & 0xffff,
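With both halves in place, rollback is a two-step opt-in: boot with `ucode=rollback`, then write `2` to the existing microcode reload node. A minimal userspace sketch, assuming this patch is applied (the value `2` only has meaning with it):

    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
    	/* the stock sysfs reload interface; "2" is the rollback value this patch adds */
    	int fd = open("/sys/devices/system/cpu/microcode/reload", O_WRONLY);

    	if (fd < 0)
    		return 1;
    	if (write(fd, "2", 1) != 1) {
    		close(fd);
    		return 1;
    	}
    	close(fd);
    	return 0;
    }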
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
index 9ff480e94511b8..2158b43f7cd9fd 100644
--- a/arch/x86/kernel/itmt.c
+++ b/arch/x86/kernel/itmt.c
@@ -25,6 +25,7 @@ static DEFINE_MUTEX(itmt_update_mutex);
 
 DEFINE_PER_CPU_READ_MOSTLY(int, sched_core_priority);
+DEFINE_PER_CPU_READ_MOSTLY(int, sched_power_ratio);
 
 /* Boolean to track if system has ITMT capabilities */
 static bool __read_mostly sched_itmt_capable;
@@ -169,37 +170,50 @@ void sched_clear_itmt_support(void)
 
 int arch_asym_cpu_priority(int cpu)
 {
-	return per_cpu(sched_core_priority, cpu);
+	int power_ratio = per_cpu(sched_power_ratio, cpu);
+
+	/* a power ratio of 0 (uninitialized) is treated as the default EPB of 6 */
+	if (power_ratio == 0)
+		power_ratio = 256 - 2 * 6;
+	return per_cpu(sched_core_priority, cpu) * power_ratio / 256;
 }
 
 /**
  * sched_set_itmt_core_prio() - Set CPU priority based on ITMT
- * @prio:	Priority of cpu core
- * @core_cpu:	The cpu number associated with the core
+ * @prio:	Priority of @cpu
+ * @cpu:	The CPU number
  *
  * The pstate driver will find out the max boost frequency
  * and call this function to set a priority proportional
- * to the max boost frequency. CPU with higher boost
+ * to the max boost frequency. CPUs with higher boost
  * frequency will receive higher priority.
  *
  * No need to rebuild sched domain after updating
  * the CPU priorities. The sched domains have no
  * dependency on CPU priorities.
  */
-void sched_set_itmt_core_prio(int prio, int core_cpu)
+void sched_set_itmt_core_prio(int prio, int cpu)
+{
+	per_cpu(sched_core_priority, cpu) = prio * 64 - cpu;
+}
+
+/**
+ * sched_set_itmt_power_ratio() - Set CPU priority based on ITMT
+ * @power_ratio:	The power scaling ratio [1..256] for the core
+ * @core_cpu:		The cpu number associated with the core
+ *
+ * Set a scaling to the cpu performance based on long term power
+ * settings (like EPB).
+ *
+ * Note this is for the policy not for the actual dynamic frequency;
+ * the frequency will increase itself as workloads run on a core.
+ */
+
+void sched_set_itmt_power_ratio(int power_ratio, int core_cpu)
 {
-	int cpu, i = 1;
+	int cpu;
 
 	for_each_cpu(cpu, topology_sibling_cpumask(core_cpu)) {
-		int smt_prio;
-
-		/*
-		 * Ensure that the siblings are moved to the end
-		 * of the priority chain and only used when
-		 * all other high priority cpus are out of capacity.
-		 */
-		smt_prio = prio * smp_num_siblings / (i * i);
-		per_cpu(sched_core_priority, cpu) = smt_prio;
-		i++;
+		per_cpu(sched_power_ratio, cpu) = power_ratio;
 	}
 }
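The net effect of arch_asym_cpu_priority() is a fixed-point scaling of the ITMT core priority by the EPB-derived ratio set in intel_epb.c above (EPB 0 gives ratio 256, the default EPB of 6 gives 244, EPB 15 gives 226). A standalone sketch with a hypothetical core priority of 4096:

    #include <stdio.h>

    /* mirrors the arithmetic of arch_asym_cpu_priority(); values are illustrative */
    static int scaled_prio(int core_prio, int epb)
    {
    	int power_ratio = 256 - epb * 2;	/* as stored by sched_set_itmt_power_ratio() */

    	return core_prio * power_ratio / 256;
    }

    int main(void)
    {
    	printf("%d\n", scaled_prio(4096, 0));	/* 4096: full performance bias */
    	printf("%d\n", scaled_prio(4096, 6));	/* 3904: default bias */
    	printf("%d\n", scaled_prio(4096, 15));	/* 3616: powersave bias */
    	return 0;
    }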
diff --git a/arch/x86/kernel/powerbump.c b/arch/x86/kernel/powerbump.c
new file mode 100644
index 00000000000000..c6b3762113bf8c
--- /dev/null
+++ b/arch/x86/kernel/powerbump.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2023 Intel Corporation
+ * Author: Arjan van de Ven
+ *
+ * Kernel power-bump infrastructure
+ */
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/powerbump.h>
+
+static DEFINE_PER_CPU(unsigned long, bump_timeout); /* jiffies at which the lease for the bump times out */
+
+
+
+/*
+ * A note about the use of the current cpu versus preemption.
+ *
+ * Most uses of in_power_bump() are inside local power management code,
+ * and are pinned to that cpu already.
+ *
+ * On the "set" side, interrupt level code is obviously also fully
+ * migration-race free.
+ *
+ * All other cases are exposed to a migration-race.
+ *
+ * The goal of powerbump is statistical rather than deterministic,
+ * e.g. on average the CPU that hits event X will go towards Y more
+ * often than not, and the impact of being wrong is a bit of extra
+ * power potentially for some short durations.
+ * Weighed against the costs in performance and complexity of dealing
+ * with the race, the race condition is acceptable.
+ *
+ * The second known race is where interrupt context might set a bump
+ * time in the middle of process context setting a different but
+ * smaller bump time, with the result that process context will win
+ * incorrectly, and the actual bump time will be less than expected,
+ * but still non-zero. Here also the cost of dealing with the race
+ * outweighs the limited impact.
+ */
+
+
+int in_power_bump(void)
+{
+	int cpu = raw_smp_processor_id();
+
+	if (time_before(jiffies, per_cpu(bump_timeout, cpu)))
+		return 1;
+
+	/* deal with wrap issues by keeping the stored bump value close to current */
+	per_cpu(bump_timeout, cpu) = jiffies;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(in_power_bump);
+
+void give_power_bump(int msecs)
+{
+	unsigned long nextjiffies;
+	int cpu;
+
+	/* we need to round up an extra jiffy */
+	nextjiffies = jiffies + msecs_to_jiffies(msecs) + 1;
+
+	cpu = raw_smp_processor_id();
+	if (time_before(per_cpu(bump_timeout, cpu), nextjiffies))
+		per_cpu(bump_timeout, cpu) = nextjiffies;
+}
+EXPORT_SYMBOL_GPL(give_power_bump);
+
+static __init int powerbump_init(void)
+{
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		per_cpu(bump_timeout, cpu) = jiffies;
+	}
+
+	return 0;
+}
+
+late_initcall(powerbump_init);
\ No newline at end of file
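The API is deliberately tiny: a caller leases a bump and the lease simply expires, there is no release call. A minimal sketch of a hypothetical caller (the helper and its name are illustrative only), leasing a bump just before sleeping on a short event so the idle-governor change further below keeps the CPU in a shallow C-state while the completion is pending:

    #include <linux/completion.h>
    #include <linux/powerbump.h>

    /* hypothetical helper: keep idle shallow for ~3ms around a short wait */
    static void wait_for_fast_event(struct completion *done)
    {
    	give_power_bump(BUMP_FOR_DISK);	/* lease expires on its own */
    	wait_for_completion(done);
    }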
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index cafacb2e58cceb..c2f80184fd3325 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1569,6 +1569,9 @@ unsigned long calibrate_delay_is_known(void)
 	if (!constant_tsc || !mask)
 		return 0;
 
+	if (cpu != 0)
+		return cpu_data(0).loops_per_jiffy;
+
 	sibling = cpumask_any_but(mask, cpu);
 	if (sibling < nr_cpu_ids)
 		return cpu_data(sibling).loops_per_jiffy;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 7b0d4ab894c8bc..1a14f52added00 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -799,9 +799,9 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
 	if (!printk_ratelimit())
 		return;
 
-	printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
+	printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx cpu %i",
 	       loglvl, tsk->comm, task_pid_nr(tsk), address,
-	       (void *)regs->ip, (void *)regs->sp, error_code);
+	       (void *)regs->ip, (void *)regs->sp, error_code, raw_smp_processor_id());
 
 	print_vma_addr(KERN_CONT " in ", regs->ip);
diff --git a/block/bio.c b/block/bio.c
index 57c2f327225bd1..08ba43fe3242b3 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -19,6 +19,7 @@
 #include <linux/sched/sysctl.h>
 #include <linux/blk-crypto.h>
 #include <linux/xarray.h>
+#include <linux/powerbump.h>
 
 #include <trace/events/block.h>
 #include "blk.h"
@@ -1294,6 +1295,7 @@ EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
 
 static void submit_bio_wait_endio(struct bio *bio)
 {
+	give_power_bump(BUMP_FOR_DISK);
 	complete(bio->bi_private);
 }
 
@@ -1319,6 +1321,8 @@ int submit_bio_wait(struct bio *bio)
 	bio->bi_opf |= REQ_SYNC;
 	submit_bio(bio);
 
+	give_power_bump(BUMP_FOR_DISK);
+
 	/* Prevent hang_check timer from firing at us during very long I/O */
 	hang_check = sysctl_hung_task_timeout_secs;
 	if (hang_check)
diff --git a/drivers/Makefile b/drivers/Makefile
index bdf1c66141c9bd..1e1a0832fb48a1 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -59,15 +59,8 @@ obj-y				+= char/
 # iommu/ comes before gpu as gpu are using iommu controllers
 obj-y				+= iommu/
 
-# gpu/ comes after char for AGP vs DRM startup and after iommu
-obj-y				+= gpu/
-
 obj-$(CONFIG_CONNECTOR)		+= connector/
 
-# i810fb and intelfb depend on char/agp/
-obj-$(CONFIG_FB_I810)           += video/fbdev/i810/
-obj-$(CONFIG_FB_INTEL)          += video/fbdev/intelfb/
-
 obj-$(CONFIG_PARPORT)		+= parport/
 obj-y				+= base/ block/ misc/ mfd/ nfc/
 obj-$(CONFIG_LIBNVDIMM)		+= nvdimm/
@@ -79,6 +72,14 @@ obj-y				+= macintosh/
 obj-y				+= scsi/
 obj-y				+= nvme/
 obj-$(CONFIG_ATA)		+= ata/
+
+# gpu/ comes after char for AGP vs DRM startup and after iommu
+obj-y				+= gpu/
+
+# i810fb and intelfb depend on char/agp/
+obj-$(CONFIG_FB_I810)           += video/fbdev/i810/
+obj-$(CONFIG_FB_INTEL)          += video/fbdev/intelfb/
+
 obj-$(CONFIG_TARGET_CORE)	+= target/
 obj-$(CONFIG_MTD)		+= mtd/
 obj-$(CONFIG_SPI)		+= spi/
diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c
index 954386a2b5002b..cef60f30278c53 100644
--- a/drivers/ata/libahci.c
+++ b/drivers/ata/libahci.c
@@ -34,14 +34,14 @@
 #include "libata.h"
 
 static int ahci_skip_host_reset;
-int ahci_ignore_sss;
+int ahci_ignore_sss = 1;
 EXPORT_SYMBOL_GPL(ahci_ignore_sss);
 
 module_param_named(skip_host_reset, ahci_skip_host_reset, int, 0444);
 MODULE_PARM_DESC(skip_host_reset, "skip global host reset (0=don't skip, 1=skip)");
 
 module_param_named(ignore_sss, ahci_ignore_sss, int, 0444);
-MODULE_PARM_DESC(ignore_sss, "Ignore staggered spinup flag (0=don't ignore, 1=ignore)");
+MODULE_PARM_DESC(ignore_sss, "Ignore staggered spinup flag (0=don't ignore, 1=ignore [default])");
 
 static int ahci_set_lpm(struct ata_link *link, enum ata_lpm_policy policy,
 			unsigned hints);
diff --git a/drivers/base/firmware_loader/main.c b/drivers/base/firmware_loader/main.c
index 7c3590fd97c28d..bb4880e10581f5 100644
--- a/drivers/base/firmware_loader/main.c
+++ b/drivers/base/firmware_loader/main.c
@@ -470,6 +470,8 @@ static int fw_decompress_xz(struct device *dev, struct fw_priv *fw_priv,
 static char fw_path_para[256];
 static const char * const fw_path[] = {
 	fw_path_para,
+	"/etc/firmware/" UTS_RELEASE,
+	"/etc/firmware",
 	"/lib/firmware/updates/" UTS_RELEASE,
 	"/lib/firmware/updates",
 	"/lib/firmware/" UTS_RELEASE,
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 6ff73c30769fae..46516074bfd032 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -365,6 +365,13 @@ static void intel_pstate_set_itmt_prio(int cpu)
 	 * update them at any time after it has been called.
 	 */
 	sched_set_itmt_core_prio(cppc_perf.highest_perf, cpu);
+	/*
+	 * On some systems with overclocking enabled, CPPC.highest_perf is
+	 * hardcoded to 0xff and cannot be used to enable ITMT. In that
+	 * case, fall back to MSR_HWP_CAPABILITIES bits [8:0] instead.
+	 */
+	if (cppc_perf.highest_perf == 0xff)
+		cppc_perf.highest_perf = HWP_HIGHEST_PERF(READ_ONCE(all_cpu_data[cpu]->hwp_cap_cached));
 
 	if (max_highest_perf <= min_highest_perf) {
 		if (cppc_perf.highest_perf > max_highest_perf)
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index c4922684f30583..5bc5de2c1c694a 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -18,6 +18,7 @@
 #include <linux/sched/loadavg.h>
 #include <linux/sched/stat.h>
 #include <linux/math64.h>
+#include <linux/powerbump.h>
 
 #define BUCKETS 12
 #define INTERVAL_SHIFT 3
@@ -279,6 +280,9 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 		data->needs_update = 0;
 	}
 
+	if (in_power_bump() && latency_req > BUMP_LATENCY_THRESHOLD)
+		latency_req = BUMP_LATENCY_THRESHOLD;
+
 	/* determine the expected residency time, round up */
 	delta = tick_nohz_get_sleep_length(&delta_tick);
 	if (unlikely(delta < 0)) {
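The clamp works because menu_select() rejects any idle state whose exit latency exceeds latency_req, so an active bump (threshold 2000ns) leaves only C1/C1E-class states eligible. A toy model of that gate, a hypothetical helper with latencies in nanoseconds as in the governor:

    /* toy model of the latency gate in menu_select(): returns the index of the
     * deepest state whose exit latency fits, or -1 if none does */
    static int deepest_allowed(const long long *exit_latency_ns, int n, long long latency_req)
    {
    	int i, idx = -1;

    	for (i = 0; i < n; i++) {
    		if (exit_latency_ns[i] > latency_req)
    			break;		/* states are ordered shallow to deep */
    		idx = i;
    	}
    	return idx;
    }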
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index cfeb24d40d3789..8ac40b7c7b4127 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -578,7 +578,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = {
 		.desc = "MWAIT 0x01",
 		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
 		.exit_latency = 10,
-		.target_residency = 20,
+		.target_residency = 120,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -586,7 +586,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = {
 		.desc = "MWAIT 0x10",
 		.flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 33,
-		.target_residency = 100,
+		.target_residency = 900,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -594,7 +594,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = {
 		.desc = "MWAIT 0x20",
 		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 133,
-		.target_residency = 400,
+		.target_residency = 1000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -602,7 +602,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = {
 		.desc = "MWAIT 0x32",
 		.flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 166,
-		.target_residency = 500,
+		.target_residency = 1500,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -610,7 +610,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = {
 		.desc = "MWAIT 0x40",
 		.flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 300,
-		.target_residency = 900,
+		.target_residency = 2000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -618,7 +618,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = {
 		.desc = "MWAIT 0x50",
 		.flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 600,
-		.target_residency = 1800,
+		.target_residency = 5000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -626,7 +626,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = {
 		.desc = "MWAIT 0x60",
 		.flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 2600,
-		.target_residency = 7700,
+		.target_residency = 9000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -646,7 +646,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = {
 		.desc = "MWAIT 0x01",
 		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
 		.exit_latency = 10,
-		.target_residency = 20,
+		.target_residency = 120,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -654,7 +654,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = {
 		.desc = "MWAIT 0x10",
 		.flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 40,
-		.target_residency = 100,
+		.target_residency = 1000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -662,7 +662,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = {
 		.desc = "MWAIT 0x20",
 		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 133,
-		.target_residency = 400,
+		.target_residency = 1000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -670,7 +670,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = {
 		.desc = "MWAIT 0x32",
 		.flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 166,
-		.target_residency = 500,
+		.target_residency = 2000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -678,7 +678,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = {
 		.desc = "MWAIT 0x40",
 		.flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 300,
-		.target_residency = 900,
+		.target_residency = 4000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -686,7 +686,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = {
 		.desc = "MWAIT 0x50",
 		.flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 600,
-		.target_residency = 1800,
+		.target_residency = 7000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -694,7 +694,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = {
 		.desc = "MWAIT 0x60",
 		.flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 2600,
-		.target_residency = 7700,
+		.target_residency = 9000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -715,7 +715,7 @@ static struct cpuidle_state skl_cstates[] __initdata = {
 		.desc = "MWAIT 0x01",
 		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
 		.exit_latency = 10,
-		.target_residency = 20,
+		.target_residency = 120,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -723,7 +723,7 @@ static struct cpuidle_state skl_cstates[] __initdata = {
 		.desc = "MWAIT 0x10",
 		.flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 70,
-		.target_residency = 100,
+		.target_residency = 1000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -731,7 +731,7 @@ static struct cpuidle_state skl_cstates[] __initdata = {
 		.desc = "MWAIT 0x20",
 		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
 		.exit_latency = 85,
-		.target_residency = 200,
+		.target_residency = 600,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -739,7 +739,7 @@ static struct cpuidle_state skl_cstates[] __initdata = {
 		.desc = "MWAIT 0x33",
 		.flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
 		.exit_latency = 124,
-		.target_residency = 800,
+		.target_residency = 3000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -747,7 +747,7 @@ static struct cpuidle_state skl_cstates[] __initdata = {
 		.desc = "MWAIT 0x40",
 		.flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
 		.exit_latency = 200,
-		.target_residency = 800,
+		.target_residency = 3200,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -755,7 +755,7 @@ static struct cpuidle_state skl_cstates[] __initdata = {
 		.desc = "MWAIT 0x50",
 		.flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
 		.exit_latency = 480,
-		.target_residency = 5000,
+		.target_residency = 9000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -763,7 +763,7 @@ static struct cpuidle_state skl_cstates[] __initdata = {
 		.desc = "MWAIT 0x60",
 		.flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
 		.exit_latency = 890,
-		.target_residency = 5000,
+		.target_residency = 9000,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -784,7 +784,7 @@ static struct cpuidle_state skx_cstates[] __initdata = {
 		.desc = "MWAIT 0x01",
 		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
 		.exit_latency = 10,
-		.target_residency = 20,
+		.target_residency = 300,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -813,7 +813,7 @@ static struct cpuidle_state icx_cstates[] __initdata = {
 		.desc = "MWAIT 0x01",
 		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
 		.exit_latency = 4,
-		.target_residency = 4,
+		.target_residency = 40,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -821,7 +821,7 @@ static struct cpuidle_state icx_cstates[] __initdata = {
 		.desc = "MWAIT 0x20",
 		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 170,
-		.target_residency = 600,
+		.target_residency = 900,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
@@ -942,7 +942,7 @@ static struct cpuidle_state adl_n_cstates[] __initdata = {
 		.desc = "MWAIT 0x01",
 		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
 		.exit_latency = 2,
-		.target_residency = 4,
+		.target_residency = 40,
 		.enter = &intel_idle,
 		.enter_s2idle = intel_idle_s2idle, },
 	{
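These residency bumps bias state selection rather than disable states: a menu-style governor picks the deepest state whose target residency still fits the predicted idle duration, so raising the values shifts selection toward shallower states. A toy model with a hypothetical helper, times in microseconds as in these tables:

    /* pick the deepest state whose target_residency fits the prediction;
     * states are ordered shallow to deep, index 0 always qualifies */
    static int pick_state(const unsigned int *target_residency_us, int n, unsigned int predicted_us)
    {
    	int i, idx = 0;

    	for (i = 1; i < n; i++) {
    		if (target_residency_us[i] <= predicted_us)
    			idx = i;
    	}
    	return idx;
    }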
diff --git a/drivers/input/serio/i8042.c b/drivers/input/serio/i8042.c
index 6dac7c1853a541..fab04cd8a7a095 100644
--- a/drivers/input/serio/i8042.c
+++ b/drivers/input/serio/i8042.c
@@ -621,7 +621,7 @@ static int i8042_enable_kbd_port(void)
 	if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) {
 		i8042_ctr &= ~I8042_CTR_KBDINT;
 		i8042_ctr |= I8042_CTR_KBDDIS;
-		pr_err("Failed to enable KBD port\n");
+		pr_info("Failed to enable KBD port\n");
 		return -EIO;
 	}
 
@@ -640,7 +640,7 @@ static int i8042_enable_aux_port(void)
 	if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) {
 		i8042_ctr &= ~I8042_CTR_AUXINT;
 		i8042_ctr |= I8042_CTR_AUXDIS;
-		pr_err("Failed to enable AUX port\n");
+		pr_info("Failed to enable AUX port\n");
 		return -EIO;
 	}
 
@@ -732,7 +732,7 @@ static int i8042_check_mux(void)
 	i8042_ctr &= ~I8042_CTR_AUXINT;
 
 	if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) {
-		pr_err("Failed to disable AUX port, can't use MUX\n");
+		pr_info("Failed to disable AUX port, can't use MUX\n");
 		return -EIO;
 	}
 
@@ -955,7 +955,7 @@ static int i8042_controller_selftest(void)
 
 	do {
 		if (i8042_command(&param, I8042_CMD_CTL_TEST)) {
-			pr_err("i8042 controller selftest timeout\n");
+			pr_info("i8042 controller selftest timeout\n");
 			return -ENODEV;
 		}
 
@@ -977,7 +977,7 @@ static int i8042_controller_selftest(void)
 	pr_info("giving up on controller selftest, continuing anyway...\n");
 	return 0;
 #else
-	pr_err("i8042 controller selftest failed\n");
+	pr_info("i8042 controller selftest failed\n");
 	return -EIO;
 #endif
 }
diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c
index aa0fc00faecbe8..b93a4d71be29e0 100644
--- a/drivers/net/dummy.c
+++ b/drivers/net/dummy.c
@@ -43,7 +43,7 @@
 
 #define DRV_NAME	"dummy"
 
-static int numdummies = 1;
+static int numdummies = 0;
 
 /* fake multicast ability */
 static void set_multicast_list(struct net_device *dev)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 7e3893d06babdf..8984f328a4f6fe 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -58,7 +58,7 @@ static u8 nvme_max_retries = 5;
 module_param_named(max_retries, nvme_max_retries, byte, 0644);
 MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
 
-static unsigned long default_ps_max_latency_us = 100000;
+static unsigned long default_ps_max_latency_us = 200;
 module_param(default_ps_max_latency_us, ulong, 0644);
 MODULE_PARM_DESC(default_ps_max_latency_us,
 		 "max power saving latency for new devices; use PM QOS to change per device");
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 2127aba3550b5d..cf5c72a88c2bae 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -62,7 +62,7 @@ struct pci_pme_device {
 	struct pci_dev *dev;
 };
 
-#define PME_TIMEOUT 1000 /* How long between PME checks */
+#define PME_TIMEOUT 4000 /* How long between PME checks */
 
 static void pci_dev_d3_sleep(struct pci_dev *dev)
 {
diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c
index 26d00b1853b421..3e239d6548b523 100644
--- a/drivers/powercap/intel_rapl_common.c
+++ b/drivers/powercap/intel_rapl_common.c
@@ -1518,7 +1518,7 @@ static int __init rapl_init(void)
 
 	id = x86_match_cpu(rapl_ids);
 	if (!id) {
-		pr_err("driver does not support CPU family %d model %d\n",
+		pr_info("driver does not support CPU family %d model %d\n",
 			boot_cpu_data.x86, boot_cpu_data.x86_model);
 
 		return -ENODEV;
diff --git a/drivers/thermal/intel/intel_powerclamp.c b/drivers/thermal/intel/intel_powerclamp.c
index b80e25ec12615f..187b4ee6e9f5df 100644
--- a/drivers/thermal/intel/intel_powerclamp.c
+++ b/drivers/thermal/intel/intel_powerclamp.c
@@ -627,6 +627,11 @@ static const struct thermal_cooling_device_ops powerclamp_cooling_ops = {
 	.set_cur_state = powerclamp_set_cur_state,
 };
 
+static const struct x86_cpu_id amd_cpu[] = {
+	{ X86_VENDOR_AMD },
+	{},
+};
+
 static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = {
 	X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_MWAIT, NULL),
 	{}
@@ -636,6 +641,11 @@ MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);
 
 static int __init powerclamp_probe(void)
 {
+	if (x86_match_cpu(amd_cpu)) {
+		pr_info("Intel PowerClamp does not support AMD CPUs\n");
+		return -ENODEV;
+	}
+
 	if (!x86_match_cpu(intel_powerclamp_ids)) {
 		pr_err("CPU does not support MWAIT\n");
 		return -ENODEV;
diff --git a/fs/buffer.c b/fs/buffer.c
index d9c6d1fbb6dde5..139a1b18b24014 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -48,6 +48,7 @@
 #include <linux/sched/mm.h>
 #include <trace/events/block.h>
 #include <linux/fscrypt.h>
+#include <linux/powerbump.h>
 
 #include "internal.h"
 
@@ -119,6 +120,7 @@ EXPORT_SYMBOL(buffer_check_dirty_writeback);
  */
 void __wait_on_buffer(struct buffer_head * bh)
 {
+	give_power_bump(BUMP_FOR_DISK);
 	wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(__wait_on_buffer);
@@ -156,6 +158,7 @@ static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
  */
 void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
 {
+	give_power_bump(BUMP_FOR_DISK);
 	__end_buffer_read_notouch(bh, uptodate);
 	put_bh(bh);
 }
@@ -163,6 +166,7 @@ EXPORT_SYMBOL(end_buffer_read_sync);
 
 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 {
+	give_power_bump(BUMP_FOR_DISK);
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 6a404ac1c178f0..f451099d9343a3 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -27,6 +27,7 @@
 #include <linux/bug.h>
 #include <linux/module.h>
 #include <linux/sched/mm.h>
+#include <linux/powerbump.h>
 
 #include <trace/events/jbd2.h>
 
@@ -1104,6 +1105,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
 	if (buffer_shadow(bh)) {
 		JBUFFER_TRACE(jh, "on shadow: sleep");
 		spin_unlock(&jh->b_state_lock);
+		give_power_bump(BUMP_FOR_DISK);
 		wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE);
 		goto repeat;
 	}
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 0b7242370b5673..16b8fc483b3d35 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -45,7 +45,7 @@
 /*
  * The default maximum commit age, in seconds.
  */
-#define JBD2_DEFAULT_MAX_COMMIT_AGE 5
+#define JBD2_DEFAULT_MAX_COMMIT_AGE 30
 
 #ifdef CONFIG_JBD2_DEBUG
 /*
diff --git a/include/linux/powerbump.h b/include/linux/powerbump.h
new file mode 100644
index 00000000000000..a17ed37744668e
--- /dev/null
+++ b/include/linux/powerbump.h
@@ -0,0 +1,16 @@
+#pragma once
+
+/* in nsecs */
+#define BUMP_LATENCY_THRESHOLD	2000
+
+
+/* bump time constants, in msec */
+#define BUMP_FOR_DISK		3
+#define BUMP_FOR_NETWORK	3
+#define BUMP_FOR_FUTEX		3
+
+
+
+/* API prototypes */
+extern void give_power_bump(int msecs);
+extern int in_power_bump(void);
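For a sense of scale of the BUMP_FOR_* constants: a standalone sketch of the jiffies math in give_power_bump(), assuming HZ=250 and using a round-up approximation of the kernel's msecs_to_jiffies() (the real conversion is HZ-dependent):

    #include <stdio.h>

    #define HZ 250	/* assumed tick rate; one jiffy = 4ms */

    static unsigned long msecs_to_jiffies_approx(unsigned int ms)
    {
    	return (ms * HZ + 999) / 1000;	/* round up, like the kernel helper */
    }

    int main(void)
    {
    	/* BUMP_FOR_DISK is 3ms: one 4ms tick, plus the +1 safety tick */
    	printf("%lu\n", msecs_to_jiffies_approx(3) + 1);	/* prints 2, i.e. a 4..8ms lease */
    	return 0;
    }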
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 7f5a51aae0a73d..8e57dcc0469811 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -165,6 +165,7 @@ static inline bool wq_has_sleeper(struct wait_queue_head *wq_head)
 
 extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
 extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
+extern void add_wait_queue_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
 extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
 extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
 
@@ -1192,6 +1193,7 @@ do {										\
  */
 void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
 bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
+void prepare_to_wait_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
 long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
 void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
 long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout);
diff --git a/include/uapi/linux/if_bonding.h b/include/uapi/linux/if_bonding.h
index d174914a837dbf..bf8e2af101a3c8 100644
--- a/include/uapi/linux/if_bonding.h
+++ b/include/uapi/linux/if_bonding.h
@@ -82,7 +82,7 @@
 #define BOND_STATE_ACTIVE	0   /* link is active */
 #define BOND_STATE_BACKUP	1   /* link is backup */
 
-#define BOND_DEFAULT_MAX_BONDS  1   /* Default maximum number of devices to support */
+#define BOND_DEFAULT_MAX_BONDS  0   /* Default maximum number of devices to support */
 
 #define BOND_DEFAULT_TX_QUEUES 16   /* Default number of tx queues per device */
diff --git a/init/do_mounts.c b/init/do_mounts.c
index 811e94daf0a84a..06fef7f97c028e 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -283,8 +283,18 @@ dev_t name_to_dev_t(const char *name)
 	if (strcmp(name, "/dev/ram") == 0)
 		return Root_RAM0;
 #ifdef CONFIG_BLOCK
-	if (strncmp(name, "PARTUUID=", 9) == 0)
-		return devt_from_partuuid(name + 9);
+	if (strncmp(name, "PARTUUID=", 9) == 0) {
+		dev_t res;
+		int needtowait = 40 << 1;
+		res = devt_from_partuuid(name + 9);
+		while (!res && needtowait) {
+			/* waiting 0.5 sec */
+			msleep(500);
+			res = devt_from_partuuid(name + 9);
+			needtowait--;
+		}
+		return res;
+	}
 	if (strncmp(name, "PARTLABEL=", 10) == 0)
 		return devt_from_partlabel(name + 10);
 	if (strncmp(name, "/dev/", 5) == 0)
@@ -612,7 +622,9 @@ void __init prepare_namespace(void)
 	 * For example, it is not atypical to wait 5 seconds here
 	 * for the touchpad of a laptop to initialize.
 	 */
+	async_synchronize_full();
 	wait_for_device_probe();
+	async_synchronize_full();
 
 	md_run_setup();
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
index ba01b94082033b..e4fc09a98cbcd6 100644
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -3,6 +3,7 @@
 #include <linux/sched/task.h>
 #include <linux/sched/signal.h>
 #include <linux/freezer.h>
+#include <linux/powerbump.h>
 
 #include "futex.h"
 
@@ -336,6 +337,7 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
 	 */
 	set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
 	futex_queue(q, hb);
+	give_power_bump(BUMP_FOR_FUTEX);
 
 	/* Arm the timer */
 	if (timeout)
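The PARTUUID change is a plain poll-with-timeout; the same shape pulled out as a hypothetical helper (not part of the patch) makes the 40-second bound explicit:

    #include <linux/delay.h>

    /* poll a probe function every 500ms for up to 80 tries (40 seconds) */
    static dev_t poll_for_dev(dev_t (*probe)(const char *), const char *arg)
    {
    	int tries = 40 << 1;
    	dev_t res = probe(arg);

    	while (!res && tries--) {
    		msleep(500);
    		res = probe(arg);
    	}
    	return res;
    }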
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 44873594de0316..fe62d59f2bdcf4 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -755,6 +755,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem)
 	struct task_struct *new, *owner;
 	unsigned long flags, new_flags;
 	enum owner_state state;
+	int i = 0;
 
 	lockdep_assert_preemption_disabled();
 
@@ -791,7 +792,8 @@ rwsem_spin_on_owner(struct rw_semaphore *sem)
 			break;
 		}
 
-		cpu_relax();
+		if (i++ > 1000)
+			cpu_relax();
 	}
 
 	return state;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e4a0b8bd941c78..155862be793155 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8900,12 +8900,10 @@ static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds,
 				    struct sched_group *sg)
 {
 #ifdef CONFIG_SCHED_SMT
-	bool local_is_smt, sg_is_smt;
+	bool local_is_smt;
 	int sg_busy_cpus;
 
 	local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY;
-	sg_is_smt = sg->flags & SD_SHARE_CPUCAPACITY;
-
 	sg_busy_cpus = sgs->group_weight - sgs->idle_cpus;
 
 	if (!local_is_smt) {
@@ -8926,25 +8924,16 @@ static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds,
 		return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
 	}
 
-	/* @dst_cpu has SMT siblings. */
-
-	if (sg_is_smt) {
-		int local_busy_cpus = sds->local->group_weight -
-				      sds->local_stat.idle_cpus;
-		int busy_cpus_delta = sg_busy_cpus - local_busy_cpus;
-
-		if (busy_cpus_delta == 1)
-			return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
-
-		return false;
-	}
-
 	/*
-	 * @sg does not have SMT siblings. Ensure that @sds::local does not end
-	 * up with more than one busy SMT sibling and only pull tasks if there
-	 * are not busy CPUs (i.e., no CPU has running tasks).
+	 * @dst_cpu has SMT siblings. When both @dst_cpu and the busiest core
+	 * have one or more busy siblings, moving tasks between them results
+	 * in the same throughput. Only if all the siblings of @dst_cpu are
+	 * idle can throughput increase.
+	 *
+	 * If the difference in the number of busy CPUs is two or more, let
+	 * find_busiest_group() take care of it.
 	 */
-	if (!sds->local_stat.sum_nr_running)
+	if (sg_busy_cpus == 1 && !sds->local_stat.sum_nr_running)
 		return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
 
 	return false;
@@ -10162,9 +10151,14 @@ asym_active_balance(struct lb_env *env)
 	 * ASYM_PACKING needs to force migrate tasks from busy but
 	 * lower priority CPUs in order to pack all tasks in the
 	 * highest priority CPUs.
+	 *
+	 * If the busy CPU has higher priority but its other SMT
+	 * siblings are also busy, a lower-priority CPU in a
+	 * separate core can help.
 	 */
 	return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
-	       sched_asym_prefer(env->dst_cpu, env->src_cpu);
+	       (sched_asym_prefer(env->dst_cpu, env->src_cpu) ||
+		!(env->sd->flags & SD_SHARE_CPUCAPACITY));
 }
 
 static inline bool
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 9860bb9a847cf0..c7f045873a5fbd 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -47,6 +47,17 @@ void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
 }
 EXPORT_SYMBOL_GPL(add_wait_queue_priority);
 
+void add_wait_queue_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
+{
+	unsigned long flags;
+
+	wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
+	spin_lock_irqsave(&wq_head->lock, flags);
+	__add_wait_queue(wq_head, wq_entry);
+	spin_unlock_irqrestore(&wq_head->lock, flags);
+}
+EXPORT_SYMBOL(add_wait_queue_exclusive_lifo);
+
 void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
 {
 	unsigned long flags;
@@ -289,6 +300,19 @@ prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state)
 }
 EXPORT_SYMBOL(prepare_to_wait_exclusive);
 
+void prepare_to_wait_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state)
+{
+	unsigned long flags;
+
+	wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
+	spin_lock_irqsave(&wq_head->lock, flags);
+	if (list_empty(&wq_entry->entry))
+		__add_wait_queue(wq_head, wq_entry);
+	set_current_state(state);
+	spin_unlock_irqrestore(&wq_head->lock, flags);
+}
+EXPORT_SYMBOL(prepare_to_wait_exclusive_lifo);
+
 void init_wait_entry(struct wait_queue_entry *wq_entry, int flags)
 {
 	wq_entry->flags = flags;
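These LIFO variants differ from the stock exclusive helpers only in queueing at the list head (__add_wait_queue) instead of the tail, and wake-up stops at the first exclusive waiter it meets, so the most recently queued (cache-warm) accept()er runs first. A standalone toy model of that wake walk, not kernel code:

    #include <stdbool.h>
    #include <stdio.h>

    struct waiter { const char *name; bool exclusive; };

    /* toy model of the exclusive-wakeup walk: start at the head,
     * stop after the first exclusive waiter */
    static void wake_up_toy(const struct waiter *list, int n)
    {
    	for (int i = 0; i < n; i++) {
    		printf("waking %s\n", list[i].name);
    		if (list[i].exclusive)
    			break;	/* one exclusive wakeup per event */
    	}
    }

    int main(void)
    {
    	/* with LIFO queueing, slot 0 is the most recently queued task */
    	struct waiter lifo[] = { { "newest", true }, { "older", true }, { "oldest", true } };

    	wake_up_toy(lifo, 3);	/* wakes only "newest" */
    	return 0;
    }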
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 8e61f21e7e33e2..be1439d38f260c 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -41,7 +41,7 @@ unsigned long __read_mostly watchdog_enabled;
 int __read_mostly watchdog_user_enabled = 1;
 int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT;
 int __read_mostly soft_watchdog_user_enabled = 1;
-int __read_mostly watchdog_thresh = 10;
+int __read_mostly watchdog_thresh = 40;
 static int __read_mostly nmi_watchdog_available;
 
 struct cpumask watchdog_cpumask __read_mostly;
diff --git a/mm/ksm.c b/mm/ksm.c
index c19fcca9bc03dc..7009cf42be76b1 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2427,9 +2427,14 @@ static int ksm_scan_thread(void *nothing)
 
 		if (ksmd_should_run()) {
 			sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs);
-			wait_event_interruptible_timeout(ksm_iter_wait,
-				sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
-				msecs_to_jiffies(sleep_ms));
+			if (sleep_ms >= 1000)
+				wait_event_interruptible_timeout(ksm_iter_wait,
+					sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
+					round_jiffies_relative(msecs_to_jiffies(sleep_ms)));
+			else
+				wait_event_interruptible_timeout(ksm_iter_wait,
+					sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
+					msecs_to_jiffies(sleep_ms));
 		} else {
 			wait_event_freezable(ksm_thread_wait,
 				ksmd_should_run() || kthread_should_stop());
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 266a1ab054341c..25a5289cb334de 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -611,7 +611,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
 	cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
 
 	x = __this_cpu_add_return(stats_updates, abs(val));
-	if (x > MEMCG_CHARGE_BATCH) {
+	if (x > MEMCG_CHARGE_BATCH * 128) {
 		/*
 		 * If stats_flush_threshold exceeds the threshold
 		 * (>num_online_cpus()), cgroup stats update will be triggered
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6e60657875d328..688ebfafd58ecf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7069,11 +7069,11 @@ static int zone_batchsize(struct zone *zone)
 
 	/*
 	 * The number of pages to batch allocate is either ~0.1%
-	 * of the zone or 1MB, whichever is smaller. The batch
+	 * of the zone or 4MB, whichever is smaller. The batch
 	 * size is striking a balance between allocation latency
 	 * and zone lock contention.
 	 */
-	batch = min(zone_managed_pages(zone) >> 10, SZ_1M / PAGE_SIZE);
+	batch = min(zone_managed_pages(zone) >> 10, 4 * SZ_1M / PAGE_SIZE);
 	batch /= 4;		/* We effectively *= 4 below */
 	if (batch < 1)
 		batch = 1;
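A worked example of the new batch sizing with a hypothetical zone size (the kernel uses min(); it is written out as a cap here for clarity):

    #include <stdio.h>

    #define PAGE_SIZE 4096UL
    #define SZ_1M 0x100000UL

    int main(void)
    {
    	unsigned long managed = 4UL << 20;	/* 4M pages, i.e. a 16GB zone */
    	unsigned long batch;

    	batch = managed >> 10;			/* ~0.1% of the zone: 4096 pages */
    	if (batch > 4 * SZ_1M / PAGE_SIZE)	/* cap at 4MB worth of pages */
    		batch = 4 * SZ_1M / PAGE_SIZE;	/* 1024 pages */
    	batch /= 4;				/* effectively *= 4 later */
    	printf("batch = %lu pages\n", batch);	/* prints 256 */
    	return 0;
    }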
diff --git a/net/core/dev.c b/net/core/dev.c
index 3be256051e99b9..bed00f3a22694d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -150,6 +150,7 @@
 #include <linux/pm_runtime.h>
 #include <linux/prandom.h>
 #include <linux/once_lite.h>
+#include <linux/powerbump.h>
 
 #include "dev.h"
 #include "net-sysfs.h"
@@ -5744,6 +5745,7 @@ int netif_receive_skb(struct sk_buff *skb)
 	int ret;
 
 	trace_netif_receive_skb_entry(skb);
+	give_power_bump(BUMP_FOR_NETWORK);
 
 	ret = netif_receive_skb_internal(skb);
 	trace_netif_receive_skb_exit(ret);
@@ -5768,6 +5770,7 @@ void netif_receive_skb_list(struct list_head *head)
 
 	if (list_empty(head))
 		return;
+	give_power_bump(BUMP_FOR_NETWORK);
 	if (trace_netif_receive_skb_list_entry_enabled()) {
 		list_for_each_entry(skb, head, list)
 			trace_netif_receive_skb_list_entry(skb);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 4e84ed21d16fed..c6e54eca42d983 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -588,7 +588,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
 	 * having to remove and re-insert us on the wait queue.
 	 */
 	for (;;) {
-		prepare_to_wait_exclusive(sk_sleep(sk), &wait,
+		prepare_to_wait_exclusive_lifo(sk_sleep(sk), &wait,
 					  TASK_INTERRUPTIBLE);
 		release_sock(sk);
 		if (reqsk_queue_empty(&icsk->icsk_accept_queue))
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4f2205756cfeee..2d20275dd6504d 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4810,8 +4810,8 @@ void __init tcp_init(void)
 	tcp_init_mem();
 	/* Set per-socket limits to no more than 1/128 the pressure threshold */
 	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
-	max_wshare = min(4UL*1024*1024, limit);
-	max_rshare = min(6UL*1024*1024, limit);
+	max_wshare = min(16UL*1024*1024, limit);
+	max_rshare = min(16UL*1024*1024, limit);
 
 	init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE;
 	init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;