From 031a0300f2c94e81598bcc08a9e6de7f10a18d7b Mon Sep 17 00:00:00 2001
From: "Fabio M. De Francesco" <fmdefrancesco@gmail.com>
Date: Thu, 1 Sep 2022 18:07:04 +0200
Subject: [PATCH 001/707] ecryptfs: Replace kmap() with kmap_local_page()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The use of kmap() is being deprecated in favor of kmap_local_page().

There are two main problems with kmap(): (1) It comes with an overhead as
the mapping space is restricted and protected by a global lock for
synchronization and (2) it also requires global TLB invalidation when the
kmap’s pool wraps and it might block when the mapping space is fully
utilized until a slot becomes available.

With kmap_local_page() the mappings are per thread, CPU local, can take
page faults, and can be called from any context (including interrupts).
It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore,
the tasks can be preempted and, when they are scheduled to run again, the
kernel virtual addresses are restored and still valid.

Since its use in fs/ecryptfs is safe everywhere, it should be preferred.

Therefore, replace kmap() with kmap_local_page() in fs/ecryptfs.

Cc: "Venkataramanan, Anirudh" <anirudh.venkataramanan@intel.com>
Suggested-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Fabio M. De Francesco <fmdefrancesco@gmail.com>
Signed-off-by: Tyler Hicks <code@tyhicks.com>
Link: https://lore.kernel.org/r/20220901160704.25701-1-fmdefrancesco@gmail.com
---
 fs/ecryptfs/crypto.c     | 8 ++++----
 fs/ecryptfs/read_write.c | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index e3f5d7f3c8a0ad..03263ebcccc6bd 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -465,10 +465,10 @@ int ecryptfs_encrypt_page(struct page *page)
 	}
 
 	lower_offset = lower_offset_for_page(crypt_stat, page);
-	enc_extent_virt = kmap(enc_extent_page);
+	enc_extent_virt = kmap_local_page(enc_extent_page);
 	rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt, lower_offset,
 				  PAGE_SIZE);
-	kunmap(enc_extent_page);
+	kunmap_local(enc_extent_virt);
 	if (rc < 0) {
 		ecryptfs_printk(KERN_ERR,
 			"Error attempting to write lower page; rc = [%d]\n",
@@ -514,10 +514,10 @@ int ecryptfs_decrypt_page(struct page *page)
 	BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED));
 
 	lower_offset = lower_offset_for_page(crypt_stat, page);
-	page_virt = kmap(page);
+	page_virt = kmap_local_page(page);
 	rc = ecryptfs_read_lower(page_virt, lower_offset, PAGE_SIZE,
 				 ecryptfs_inode);
-	kunmap(page);
+	kunmap_local(page_virt);
 	if (rc < 0) {
 		ecryptfs_printk(KERN_ERR,
 			"Error attempting to read lower page; rc = [%d]\n",
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 60bdcaddcbe57e..5edf027c835906 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -64,11 +64,11 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
 
 	offset = ((((loff_t)page_for_lower->index) << PAGE_SHIFT)
 		  + offset_in_page);
-	virt = kmap(page_for_lower);
+	virt = kmap_local_page(page_for_lower);
 	rc = ecryptfs_write_lower(ecryptfs_inode, virt, offset, size);
 	if (rc > 0)
 		rc = 0;
-	kunmap(page_for_lower);
+	kunmap_local(virt);
 	return rc;
 }
 
@@ -253,11 +253,11 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
 	int rc;
 
 	offset = ((((loff_t)page_index) << PAGE_SHIFT) + offset_in_page);
-	virt = kmap(page_for_ecryptfs);
+	virt = kmap_local_page(page_for_ecryptfs);
 	rc = ecryptfs_read_lower(virt, offset, size, ecryptfs_inode);
 	if (rc > 0)
 		rc = 0;
-	kunmap(page_for_ecryptfs);
+	kunmap_local(virt);
 	flush_dcache_page(page_for_ecryptfs);
 	return rc;
 }

From c1cc2db216078f9b1e29c991b1b9177c26757162 Mon Sep 17 00:00:00 2001
From: Slark Xiao <slark_xiao@163.com>
Date: Fri, 22 Jul 2022 18:02:12 +0800
Subject: [PATCH 002/707] ecryptfs: keystore: Fix typo 'the the' in comment

Replace 'the the' with 'the' in the comment.

Signed-off-by: Slark Xiao <slark_xiao@163.com>
Signed-off-by: Tyler Hicks <code@tyhicks.com>
Link: https://lore.kernel.org/r/20220722100212.79490-1-slark_xiao@163.com
---
 fs/ecryptfs/keystore.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 3fe41964c0d8d9..2452d6fd7062d7 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -878,7 +878,7 @@ struct ecryptfs_parse_tag_70_packet_silly_stack {
  * @filename: This function kmalloc's the memory for the filename
  * @filename_size: This function sets this to the amount of memory
  *                 kmalloc'd for the filename
- * @packet_size: This function sets this to the the number of octets
+ * @packet_size: This function sets this to the number of octets
  *               in the packet parsed
  * @mount_crypt_stat: The mount-wide cryptographic context
  * @data: The memory location containing the start of the tag 70

From a3d78fe3e1ae8c6a1901635c54a1a799656f72c8 Mon Sep 17 00:00:00 2001
From: Zipeng Zhang <zhangzipeng0@foxmail.com>
Date: Mon, 20 Mar 2023 10:04:28 +0800
Subject: [PATCH 003/707] fs: ecryptfs: comment typo fix

Comment typo fix "vitual" -> "virtual".

Signed-off-by: Zipeng Zhang <zhangzipeng0@foxmail.com>
Signed-off-by: Tyler Hicks <code@tyhicks.com>
---
 fs/ecryptfs/crypto.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 03263ebcccc6bd..c64985bf8c9e34 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1313,7 +1313,7 @@ static int ecryptfs_read_headers_virt(char *page_virt,
 
 /**
  * ecryptfs_read_xattr_region
- * @page_virt: The vitual address into which to read the xattr data
+ * @page_virt: The virtual address into which to read the xattr data
  * @ecryptfs_inode: The eCryptfs inode
  *
  * Attempts to read the crypto metadata from the extended attribute

From 7c1b1906229db88c487e21e1ecb622db64a1830d Mon Sep 17 00:00:00 2001
From: Han Xu <han.xu@nxp.com>
Date: Wed, 8 Nov 2023 09:07:01 -0600
Subject: [PATCH 004/707] mtd: spinand: gigadevice: Fix the get ecc status
 issue

Some GigaDevice ecc_get_status functions use on-stack buffer for
spi_mem_op causes spi_mem_check_op failing, fix the issue by using
spinand scratchbuf.

Fixes: c40c7a990a46 ("mtd: spinand: Add support for GigaDevice GD5F1GQ4UExxG")
Signed-off-by: Han Xu <han.xu@nxp.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20231108150701.593912-1-han.xu@nxp.com
---
 drivers/mtd/nand/spi/gigadevice.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/mtd/nand/spi/gigadevice.c b/drivers/mtd/nand/spi/gigadevice.c
index 987710e09441ad..6023cba748bb85 100644
--- a/drivers/mtd/nand/spi/gigadevice.c
+++ b/drivers/mtd/nand/spi/gigadevice.c
@@ -186,7 +186,7 @@ static int gd5fxgq4uexxg_ecc_get_status(struct spinand_device *spinand,
 {
 	u8 status2;
 	struct spi_mem_op op = SPINAND_GET_FEATURE_OP(GD5FXGQXXEXXG_REG_STATUS2,
-						      &status2);
+						      spinand->scratchbuf);
 	int ret;
 
 	switch (status & STATUS_ECC_MASK) {
@@ -207,6 +207,7 @@ static int gd5fxgq4uexxg_ecc_get_status(struct spinand_device *spinand,
 		 * report the maximum of 4 in this case
 		 */
 		/* bits sorted this way (3...0): ECCS1,ECCS0,ECCSE1,ECCSE0 */
+		status2 = *(spinand->scratchbuf);
 		return ((status & STATUS_ECC_MASK) >> 2) |
 			((status2 & STATUS_ECC_MASK) >> 4);
 
@@ -228,7 +229,7 @@ static int gd5fxgq5xexxg_ecc_get_status(struct spinand_device *spinand,
 {
 	u8 status2;
 	struct spi_mem_op op = SPINAND_GET_FEATURE_OP(GD5FXGQXXEXXG_REG_STATUS2,
-						      &status2);
+						      spinand->scratchbuf);
 	int ret;
 
 	switch (status & STATUS_ECC_MASK) {
@@ -248,6 +249,7 @@ static int gd5fxgq5xexxg_ecc_get_status(struct spinand_device *spinand,
 		 * 1 ... 4 bits are flipped (and corrected)
 		 */
 		/* bits sorted this way (1...0): ECCSE1, ECCSE0 */
+		status2 = *(spinand->scratchbuf);
 		return ((status2 & STATUS_ECC_MASK) >> 4) + 1;
 
 	case STATUS_ECC_UNCOR_ERROR:

From 564eac2860bdbe6ac651e6909ac07ecd93d778f3 Mon Sep 17 00:00:00 2001
From: Peter Martincic <pmartincic@microsoft.com>
Date: Mon, 27 Nov 2023 13:35:24 -0800
Subject: [PATCH 005/707] hv_utils: Allow implicit ICTIMESYNCFLAG_SYNC

Hyper-V hosts can omit the _SYNC flag to due a bug on resume from modern
suspend. In such a case, the guest may fail to update its time-of-day to
account for the period when it was suspended, and could proceed with a
significantly wrong time-of-day. In such a case when the guest is
significantly behind, fix it by treating a _SAMPLE the same as if _SYNC
was received so that the guest time-of-day is updated.

This is hidden behind param hv_utils.timesync_implicit.

Signed-off-by: Peter Martincic <pmartincic@microsoft.com>
Acked-by: Boqun Feng <boqun.feng@gmail.com>
Link: https://lore.kernel.org/r/20231127213524.52783-1-pmartincic@linux.microsoft.com
Signed-off-by: Wei Liu <wei.liu@kernel.org>
Message-ID: <20231127213524.52783-1-pmartincic@linux.microsoft.com>
---
 drivers/hv/hv_util.c | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c
index 42aec2c5606af7..9c97c4065fe736 100644
--- a/drivers/hv/hv_util.c
+++ b/drivers/hv/hv_util.c
@@ -296,6 +296,11 @@ static struct {
 	spinlock_t			lock;
 } host_ts;
 
+static bool timesync_implicit;
+
+module_param(timesync_implicit, bool, 0644);
+MODULE_PARM_DESC(timesync_implicit, "If set treat SAMPLE as SYNC when clock is behind");
+
 static inline u64 reftime_to_ns(u64 reftime)
 {
 	return (reftime - WLTIMEDELTA) * 100;
@@ -344,6 +349,29 @@ static void hv_set_host_time(struct work_struct *work)
 		do_settimeofday64(&ts);
 }
 
+/*
+ * Due to a bug on Hyper-V hosts, the sync flag may not always be sent on resume.
+ * Force a sync if the guest is behind.
+ */
+static inline bool hv_implicit_sync(u64 host_time)
+{
+	struct timespec64 new_ts;
+	struct timespec64 threshold_ts;
+
+	new_ts = ns_to_timespec64(reftime_to_ns(host_time));
+	ktime_get_real_ts64(&threshold_ts);
+
+	threshold_ts.tv_sec += 5;
+
+	/*
+	 * If guest behind the host by 5 or more seconds.
+	 */
+	if (timespec64_compare(&new_ts, &threshold_ts) >= 0)
+		return true;
+
+	return false;
+}
+
 /*
  * Synchronize time with host after reboot, restore, etc.
  *
@@ -384,7 +412,8 @@ static inline void adj_guesttime(u64 hosttime, u64 reftime, u8 adj_flags)
 	spin_unlock_irqrestore(&host_ts.lock, flags);
 
 	/* Schedule work to do do_settimeofday64() */
-	if (adj_flags & ICTIMESYNCFLAG_SYNC)
+	if ((adj_flags & ICTIMESYNCFLAG_SYNC) ||
+	    (timesync_implicit && hv_implicit_sync(host_ts.host_time)))
 		schedule_work(&adj_time_work);
 }
 

From 96c4f072b2ed4beaed7b001c9eb1a4d997ff3a22 Mon Sep 17 00:00:00 2001
From: Zev Weiss <zev@bewilderbeest.net>
Date: Mon, 20 Nov 2023 04:19:56 -0800
Subject: [PATCH 006/707] dt-bindings: arm: aspeed: document ASRock SPC621D8HM3

Document ASRock SPC621D8HM3 board compatible.

Signed-off-by: Zev Weiss <zev@bewilderbeest.net>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20231120121954.19926-5-zev@bewilderbeest.net
Signed-off-by: Joel Stanley <joel@jms.id.au>
---
 Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml b/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml
index 749ee54a3ff83a..f8f66821cb5faa 100644
--- a/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml
+++ b/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml
@@ -36,6 +36,7 @@ properties:
               - aspeed,ast2500-evb
               - asrock,e3c246d4i-bmc
               - asrock,romed8hm3-bmc
+              - asrock,spc621d8hm3-bmc
               - bytedance,g220a-bmc
               - facebook,cmm-bmc
               - facebook,minipack-bmc

From 2e09eb0615f012fb0d967e864d18b121b8ed2ae4 Mon Sep 17 00:00:00 2001
From: Zev Weiss <zev@bewilderbeest.net>
Date: Mon, 20 Nov 2023 04:19:57 -0800
Subject: [PATCH 007/707] ARM: dts: aspeed: Add ASRock SPC621D8HM3 BMC

This is a Xeon board broadly similar (aside from CPU vendor) to the
already-support romed8hm3 (half-width, single-socket, ast2500).  It
doesn't require anything terribly special for OpenBMC support, so this
device-tree should provide everything necessary for basic
functionality with it.

Signed-off-by: Zev Weiss <zev@bewilderbeest.net>
Link: https://lore.kernel.org/r/20231120121954.19926-6-zev@bewilderbeest.net
Signed-off-by: Joel Stanley <joel@jms.id.au>
---
 arch/arm/boot/dts/aspeed/Makefile             |   1 +
 .../aspeed/aspeed-bmc-asrock-spc621d8hm3.dts  | 324 ++++++++++++++++++
 2 files changed, 325 insertions(+)
 create mode 100644 arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-spc621d8hm3.dts

diff --git a/arch/arm/boot/dts/aspeed/Makefile b/arch/arm/boot/dts/aspeed/Makefile
index d3ac20e316d01e..2df0a2e88df712 100644
--- a/arch/arm/boot/dts/aspeed/Makefile
+++ b/arch/arm/boot/dts/aspeed/Makefile
@@ -10,6 +10,7 @@ dtb-$(CONFIG_ARCH_ASPEED) += \
 	aspeed-bmc-arm-stardragon4800-rep2.dtb \
 	aspeed-bmc-asrock-e3c246d4i.dtb \
 	aspeed-bmc-asrock-romed8hm3.dtb \
+	aspeed-bmc-asrock-spc621d8hm3.dtb \
 	aspeed-bmc-bytedance-g220a.dtb \
 	aspeed-bmc-delta-ahe50dc.dtb \
 	aspeed-bmc-facebook-bletchley.dtb \
diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-spc621d8hm3.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-spc621d8hm3.dts
new file mode 100644
index 00000000000000..555485871e7a7d
--- /dev/null
+++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-spc621d8hm3.dts
@@ -0,0 +1,324 @@
+// SPDX-License-Identifier: GPL-2.0+
+/dts-v1/;
+
+#include "aspeed-g5.dtsi"
+#include <dt-bindings/gpio/aspeed-gpio.h>
+#include <dt-bindings/i2c/i2c.h>
+#include <dt-bindings/interrupt-controller/irq.h>
+#include <dt-bindings/leds/common.h>
+
+/{
+	model = "ASRock SPC621D8HM3 BMC";
+	compatible = "asrock,spc621d8hm3-bmc", "aspeed,ast2500";
+
+	aliases {
+		serial4 = &uart5;
+
+		i2c20 = &i2c1mux0ch0;
+		i2c21 = &i2c1mux0ch1;
+	};
+
+	chosen {
+		stdout-path = &uart5;
+	};
+
+	memory@80000000 {
+		reg = <0x80000000 0x20000000>;
+	};
+
+	leds {
+		compatible = "gpio-leds";
+
+		/* BMC heartbeat */
+		led-0 {
+			gpios = <&gpio ASPEED_GPIO(H, 6) GPIO_ACTIVE_LOW>;
+			function = LED_FUNCTION_HEARTBEAT;
+			color = <LED_COLOR_ID_GREEN>;
+			linux,default-trigger = "timer";
+		};
+
+		/* system fault */
+		led-1 {
+			gpios = <&gpio ASPEED_GPIO(Z, 2) GPIO_ACTIVE_LOW>;
+			function = LED_FUNCTION_FAULT;
+			color = <LED_COLOR_ID_RED>;
+			panic-indicator;
+		};
+	};
+
+	iio-hwmon {
+		compatible = "iio-hwmon";
+		io-channels = <&adc 0>, <&adc 1>, <&adc 2>, <&adc 3>,
+			<&adc 4>, <&adc 5>, <&adc 6>, <&adc 7>,
+			<&adc 8>, <&adc 9>, <&adc 10>, <&adc 11>,
+			<&adc 12>, <&adc 13>, <&adc 14>, <&adc 15>;
+	};
+};
+
+&fmc {
+	status = "okay";
+	flash@0 {
+		status = "okay";
+		m25p,fast-read;
+		label = "bmc";
+		spi-max-frequency = <50000000>; /* 50 MHz */
+#include "openbmc-flash-layout-64.dtsi"
+	};
+};
+
+&uart5 {
+	status = "okay";
+};
+
+&vuart {
+	status = "okay";
+	aspeed,lpc-io-reg = <0x2f8>;
+	aspeed,lpc-interrupts = <3 IRQ_TYPE_LEVEL_HIGH>;
+};
+
+&mac0 {
+	status = "okay";
+
+	pinctrl-names = "default";
+	pinctrl-0 = <&pinctrl_rgmii1_default &pinctrl_mdio1_default>;
+
+	nvmem-cells = <&eth0_macaddress>;
+	nvmem-cell-names = "mac-address";
+};
+
+&i2c0 {
+	status = "okay";
+};
+
+&i2c1 {
+	status = "okay";
+
+	/* hardware monitor/thermal sensor */
+	temperature-sensor@29 {
+		compatible = "nuvoton,nct7802";
+		reg = <0x29>;
+	};
+
+	/* motherboard temp sensor (TMP1, near BMC) */
+	temperature-sensor@4c {
+		compatible = "nuvoton,w83773g";
+		reg = <0x4c>;
+	};
+
+	/* motherboard FRU eeprom */
+	eeprom@50 {
+		compatible = "st,24c128", "atmel,24c128";
+		reg = <0x50>;
+		pagesize = <16>;
+		#address-cells = <1>;
+		#size-cells = <1>;
+
+		eth0_macaddress: macaddress@3f80 {
+			reg = <0x3f80 6>;
+		};
+	};
+
+	/* M.2 slot smbus mux */
+	i2c-mux@71 {
+		compatible = "nxp,pca9545";
+		reg = <0x71>;
+		#address-cells = <1>;
+		#size-cells = <0>;
+
+		i2c1mux0ch0: i2c@0 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			reg = <0>;
+		};
+
+		i2c1mux0ch1: i2c@1 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			reg = <1>;
+		};
+	};
+};
+
+&i2c2 {
+	status = "okay";
+};
+
+&i2c3 {
+	status = "okay";
+};
+
+&i2c4 {
+	status = "okay";
+};
+
+&i2c5 {
+	status = "okay";
+};
+
+&i2c6 {
+	status = "okay";
+};
+
+&i2c7 {
+	status = "okay";
+};
+
+&i2c8 {
+	status = "okay";
+};
+
+&i2c9 {
+	status = "okay";
+};
+
+&i2c10 {
+	status = "okay";
+};
+
+&i2c11 {
+	status = "okay";
+};
+
+&i2c12 {
+	status = "okay";
+};
+
+&i2c13 {
+	status = "okay";
+};
+
+&video {
+	status = "okay";
+};
+
+&vhub {
+	status = "okay";
+};
+
+&lpc_ctrl {
+	status = "okay";
+};
+
+&lpc_snoop {
+	status = "okay";
+	snoop-ports = <0x80>;
+};
+
+&kcs3 {
+	status = "okay";
+	aspeed,lpc-io-reg = <0xca2>;
+};
+
+&peci0 {
+	status = "okay";
+};
+
+&pwm_tacho {
+	status = "okay";
+	pinctrl-names = "default";
+	pinctrl-0 = <&pinctrl_pwm0_default
+		&pinctrl_pwm2_default
+		&pinctrl_pwm3_default
+		&pinctrl_pwm4_default>;
+
+	fan@0 {
+		reg = <0x00>;
+		aspeed,fan-tach-ch = /bits/ 8 <0x00>;
+	};
+
+	fan@2 {
+		reg = <0x02>;
+		aspeed,fan-tach-ch = /bits/ 8 <0x02>;
+	};
+
+	fan@3 {
+		reg = <0x03>;
+		aspeed,fan-tach-ch = /bits/ 8 <0x03>;
+	};
+
+	fan@4 {
+		reg = <0x04>;
+		aspeed,fan-tach-ch = /bits/ 8 <0x04>;
+	};
+};
+
+&gpio {
+	status = "okay";
+	gpio-line-names =
+		/*  A */ "LOCATORLED_STATUS_N", "LOCATORBTN_N",
+			"BMC_READY_N", "FM_SPD_DDRCPU_LVLSHFT_EN",
+			"", "", "", "",
+		/*  B */ "NODE_ID_1", "NODE_ID_2", "PSU_FAN_FAIL_N", "",
+			"", "", "", "GPIO_RST",
+		/*  C */ "", "", "", "", "", "", "", "",
+		/*  D */ "FP_PWR_BTN_MUX_N", "FM_BMC_PWRBTN_OUT_N",
+			"FP_RST_BTN_N", "RST_BMC_RSTBTN_OUT_N",
+			"NMI_BTN_N", "BMC_NMI",
+			"", "",
+		/*  E */ "", "", "", "FM_ME_RCVR_N", "", "", "", "",
+		/*  F */ "BMC_SMB_SEL_N", "FM_CPU2_DISABLE_COD_N",
+			"FM_REMOTE_DEBUG_BMC_EN", "FM_CPU_ERR0_LVT3_EN",
+			"FM_CPU_ERR1_LVT3_EN", "FM_CPU_ERR2_LVT3_EN",
+			"FM_MEM_THERM_EVENT_CPU1_LVT3_N", "FM_MEM_THERM_EVENT_CPU2_LVT3_N",
+		/*  G */ "HWM_BAT_EN", "", "BMC_PHYRST_N", "FM_BIOS_SPI_BMC_CTRL",
+			"BMC_ALERT1_N", "BMC_ALERT2_N", "BMC_ALERT3_N", "IRQ_SML0_ALERT_N",
+		/*  H */ "BMC_SMB_PRESENT_1_N", "FM_PCH_CORE_VID_0", "FM_PCH_CORE_VID_1", "",
+			"FM_MFG_MODE", "BMC_RTCRST", "BMC_HB_LED_N", "BMC_CASEOPEN",
+		/*  I */ "IRQ_PVDDQ_ABCD_CPU1_VRHOT_LVC3_N", "IRQ_PVDDQ_ABCD_CPU2_VRHOT_LVC3_N",
+			"IRQ_PVDDQ_EFGH_CPU1_VRHOT_LVC3_N", "IRQ_PVDDQ_EFGH_CPU2_VRHOT_LVC3_N",
+			"", "", "", "",
+		/*  J */ "", "", "", "", "", "", "", "",
+		/*  K */ "", "", "", "", "", "", "", "",
+		/*  L */ "", "", "", "", "", "", "", "",
+		/*  M */ "FM_PVCCIN_CPU1_PWR_IN_ALERT_N", "FM_PVCCIN_CPU2_PWR_IN_ALERT_N",
+			"IRQ_PVCCIN_CPU1_VRHOT_LVC3_N", "IRQ_PVCCIN_CPU2_VRHOT_LVC3_N",
+			"FM_CPU1_PROCHOT_BMC_LVC3_N", "",
+			"FM_CPU1_MEMHOT_OUT_N", "FM_CPU2_MEMHOT_OUT_N",
+		/*  N */ "", "", "", "", "", "", "", "",
+		/*  O */ "", "", "", "", "", "", "", "",
+		/*  P */ "", "", "", "", "", "", "", "",
+		/*  Q */ "", "", "", "", "", "", "RST_GLB_RST_WARN_N", "PCIE_WAKE_N",
+		/*  R */ "", "", "FM_BMC_SUSACK_N", "FM_BMC_EUP_LOT6_N",
+			"", "FM_BMC_PCH_SCI_LPC_N", "", "",
+		/*  S */ "FM_DBP_PRESENT_N", "FM_CPU2_SKTOCC_LCT3_N",
+			"FM_CPU1_FIVR_FAULT_LVT3", "FM_CPU2_FIVR_FAULT_LVT3",
+			 "", "", "", "",
+		/*  T */ "", "", "", "", "", "", "", "",
+		/*  U */ "", "", "", "", "", "", "", "",
+		/*  V */ "", "", "", "", "", "", "", "",
+		/*  W */ "", "", "", "", "", "", "", "",
+		/*  X */ "", "", "", "", "", "", "", "",
+		/*  Y */ "FM_SLPS3_N", "FM_SLPS4_N", "", "FM_BMC_ONCTL_N_PLD",
+			"", "", "", "",
+		/*  Z */ "FM_CPU_MSMI_CATERR_LVT3_N", "", "SYSTEM_FAULT_LED_N", "BMC_THROTTLE_N",
+			"", "", "", "",
+		/* AA */ "FM_CPU1_THERMTRIP_LATCH_LVT3_N", "FM_CPU2_THERMTRIP_LATCH_LVT3_N",
+			"FM_BIOS_POST_COMPLT_N", "DBP_BMC_SYSPWROK",
+			"", "IRQ_SML0_ALERT_MUX_N",
+			"IRQ_SMI_ACTIVE_N", "IRQ_NMI_EVENT_N",
+		/* AB */ "FM_PCH_BMC_THERMTRIP_N", "PWRGD_SYS_PWROK",
+			"ME_OVERRIDE", "IRQ_BMC_PCH_SMI_LPC_N",
+			"", "", "", "",
+		/* AC */ "", "", "", "", "", "", "", "";
+};
+
+&adc {
+	status = "okay";
+	pinctrl-names = "default";
+	pinctrl-0 = <&pinctrl_adc0_default /* 3VSB */
+		&pinctrl_adc1_default	   /* 5VSB */
+		&pinctrl_adc2_default	   /* CPU1 */
+		&pinctrl_adc3_default	   /* NC */
+		&pinctrl_adc4_default	   /* VCCMABCD */
+		&pinctrl_adc5_default	   /* VCCMEFGH */
+		&pinctrl_adc6_default	   /* NC */
+		&pinctrl_adc7_default	   /* NC */
+		&pinctrl_adc8_default	   /* PVNN_PCH */
+		&pinctrl_adc9_default	   /* 1P05PCH */
+		&pinctrl_adc10_default	   /* 1P8PCH */
+		&pinctrl_adc11_default	   /* BAT */
+		&pinctrl_adc12_default	   /* 3V */
+		&pinctrl_adc13_default	   /* 5V */
+		&pinctrl_adc14_default	   /* 12V */
+		&pinctrl_adc15_default>;   /* GND */
+};

From 01bb8d5bf1ab1bd847a277f546e5f9af2c6933e1 Mon Sep 17 00:00:00 2001
From: Zev Weiss <zev@bewilderbeest.net>
Date: Mon, 20 Nov 2023 04:19:03 -0800
Subject: [PATCH 008/707] dt-bindings: arm: aspeed: document ASRock E3C256D4I

Document ASRock E3C256D4I board compatible.

Signed-off-by: Zev Weiss <zev@bewilderbeest.net>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20231120121901.19817-5-zev@bewilderbeest.net
Signed-off-by: Joel Stanley <joel@jms.id.au>
---
 Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml b/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml
index f8f66821cb5faa..6f7543463d894c 100644
--- a/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml
+++ b/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml
@@ -35,6 +35,7 @@ properties:
               - ampere,mtjade-bmc
               - aspeed,ast2500-evb
               - asrock,e3c246d4i-bmc
+              - asrock,e3c256d4i-bmc
               - asrock,romed8hm3-bmc
               - asrock,spc621d8hm3-bmc
               - bytedance,g220a-bmc

From f957714c0f5353b151639654a62680d27cf53e44 Mon Sep 17 00:00:00 2001
From: Zev Weiss <zev@bewilderbeest.net>
Date: Mon, 20 Nov 2023 04:19:04 -0800
Subject: [PATCH 009/707] ARM: dts: aspeed: Add ASRock E3C256D4I BMC

Like the E3C246D4I, this is a reasonably affordable off-the-shelf
mini-ITX AST2500/Xeon motherboard with good potential as an OpenBMC
development platform.  Booting the host requires a modicum of eSPI
support that's not yet in the mainline kernel, but most other basic
BMC functionality is available with this device-tree.

Signed-off-by: Zev Weiss <zev@bewilderbeest.net>
Link: https://lore.kernel.org/r/20231120121901.19817-6-zev@bewilderbeest.net
Signed-off-by: Joel Stanley <joel@jms.id.au>
---
 arch/arm/boot/dts/aspeed/Makefile             |   1 +
 .../aspeed/aspeed-bmc-asrock-e3c256d4i.dts    | 322 ++++++++++++++++++
 2 files changed, 323 insertions(+)
 create mode 100644 arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-e3c256d4i.dts

diff --git a/arch/arm/boot/dts/aspeed/Makefile b/arch/arm/boot/dts/aspeed/Makefile
index 2df0a2e88df712..3e3e6b96cb799d 100644
--- a/arch/arm/boot/dts/aspeed/Makefile
+++ b/arch/arm/boot/dts/aspeed/Makefile
@@ -9,6 +9,7 @@ dtb-$(CONFIG_ARCH_ASPEED) += \
 	aspeed-bmc-ampere-mtmitchell.dtb \
 	aspeed-bmc-arm-stardragon4800-rep2.dtb \
 	aspeed-bmc-asrock-e3c246d4i.dtb \
+	aspeed-bmc-asrock-e3c256d4i.dtb \
 	aspeed-bmc-asrock-romed8hm3.dtb \
 	aspeed-bmc-asrock-spc621d8hm3.dtb \
 	aspeed-bmc-bytedance-g220a.dtb \
diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-e3c256d4i.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-e3c256d4i.dts
new file mode 100644
index 00000000000000..263fcc8106ffaa
--- /dev/null
+++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-e3c256d4i.dts
@@ -0,0 +1,322 @@
+// SPDX-License-Identifier: GPL-2.0+
+/dts-v1/;
+
+#include "aspeed-g5.dtsi"
+#include <dt-bindings/gpio/aspeed-gpio.h>
+#include <dt-bindings/i2c/i2c.h>
+#include <dt-bindings/interrupt-controller/irq.h>
+#include <dt-bindings/leds/common.h>
+#include <dt-bindings/watchdog/aspeed-wdt.h>
+
+/{
+	model = "ASRock E3C256D4I BMC";
+	compatible = "asrock,e3c256d4i-bmc", "aspeed,ast2500";
+
+	aliases {
+		serial4 = &uart5;
+
+		i2c20 = &i2c2mux0ch0;
+		i2c21 = &i2c2mux0ch1;
+		i2c22 = &i2c2mux0ch2;
+		i2c23 = &i2c2mux0ch3;
+	};
+
+	chosen {
+		stdout-path = &uart5;
+	};
+
+	memory@80000000 {
+		reg = <0x80000000 0x20000000>;
+	};
+
+	leds {
+		compatible = "gpio-leds";
+
+		/* BMC heartbeat */
+		led-0 {
+			gpios = <&gpio ASPEED_GPIO(H, 6) GPIO_ACTIVE_LOW>;
+			function = LED_FUNCTION_HEARTBEAT;
+			color = <LED_COLOR_ID_GREEN>;
+			linux,default-trigger = "timer";
+		};
+
+		/* system fault */
+		led-1 {
+			gpios = <&gpio ASPEED_GPIO(Z, 2) GPIO_ACTIVE_LOW>;
+			function = LED_FUNCTION_FAULT;
+			color = <LED_COLOR_ID_RED>;
+			panic-indicator;
+		};
+	};
+
+	iio-hwmon {
+		compatible = "iio-hwmon";
+		io-channels = <&adc 0>, <&adc 1>, <&adc 2>, <&adc 3>,
+			<&adc 4>, <&adc 5>, <&adc 6>, <&adc 7>,
+			<&adc 8>, <&adc 9>, <&adc 10>, <&adc 11>,
+			<&adc 12>, <&adc 13>, <&adc 14>, <&adc 15>;
+	};
+};
+
+&fmc {
+	status = "okay";
+	flash@0 {
+		status = "okay";
+		m25p,fast-read;
+		label = "bmc";
+		spi-max-frequency = <100000000>; /* 100 MHz */
+#include "openbmc-flash-layout-64.dtsi"
+	};
+};
+
+&uart1 {
+	status = "okay";
+};
+
+&uart2 {
+	status = "okay";
+};
+
+&uart3 {
+	status = "okay";
+};
+
+&uart4 {
+	status = "okay";
+};
+
+&uart5 {
+	status = "okay";
+};
+
+&uart_routing {
+	status = "okay";
+};
+
+&mac0 {
+	status = "okay";
+
+	pinctrl-names = "default";
+	pinctrl-0 = <&pinctrl_rgmii1_default &pinctrl_mdio1_default>;
+
+	nvmem-cells = <&eth0_macaddress>;
+	nvmem-cell-names = "mac-address";
+};
+
+&i2c0 {
+	status = "okay";
+};
+
+&i2c1 {
+	status = "okay";
+};
+
+&i2c2 {
+	status = "okay";
+
+	i2c-mux@70 {
+		compatible = "nxp,pca9545";
+		reg = <0x70>;
+		#address-cells = <1>;
+		#size-cells = <0>;
+
+		i2c2mux0ch0: i2c@0 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			reg = <0>;
+		};
+
+		i2c2mux0ch1: i2c@1 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			reg = <1>;
+		};
+
+		i2c2mux0ch2: i2c@2 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			reg = <2>;
+		};
+
+		i2c2mux0ch3: i2c@3 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			reg = <3>;
+		};
+	};
+};
+
+&i2c3 {
+	status = "okay";
+};
+
+&i2c4 {
+	status = "okay";
+};
+
+&i2c5 {
+	status = "okay";
+};
+
+&i2c6 {
+	status = "okay";
+};
+
+&i2c7 {
+	status = "okay";
+};
+
+&i2c9 {
+	status = "okay";
+};
+
+&i2c10 {
+	status = "okay";
+};
+
+&i2c11 {
+	status = "okay";
+
+	vrm@60 {
+		compatible = "renesas,isl69269", "isl69269";
+		reg = <0x60>;
+	};
+};
+
+&i2c12 {
+	status = "okay";
+
+	/* FRU eeprom */
+	eeprom@57 {
+		compatible = "st,24c128", "atmel,24c128";
+		reg = <0x57>;
+		pagesize = <16>;
+		#address-cells = <1>;
+		#size-cells = <1>;
+
+		eth0_macaddress: macaddress@3f80 {
+			reg = <0x3f80 6>;
+		};
+	};
+};
+
+&video {
+	status = "okay";
+};
+
+&vhub {
+	status = "okay";
+};
+
+&lpc_ctrl {
+	status = "okay";
+};
+
+&lpc_snoop {
+	status = "okay";
+	snoop-ports = <0x80>;
+};
+
+&kcs3 {
+	status = "okay";
+	aspeed,lpc-io-reg = <0xca2>;
+};
+
+&peci0 {
+	status = "okay";
+};
+
+&wdt1 {
+	aspeed,reset-mask = <(AST2500_WDT_RESET_DEFAULT & ~AST2500_WDT_RESET_LPC)>;
+};
+
+&wdt2 {
+	aspeed,reset-mask = <(AST2500_WDT_RESET_DEFAULT & ~AST2500_WDT_RESET_LPC)>;
+};
+
+&pwm_tacho {
+	status = "okay";
+	pinctrl-names = "default";
+	pinctrl-0 = <&pinctrl_pwm0_default /* CPU */
+		&pinctrl_pwm2_default      /* rear */
+		&pinctrl_pwm4_default>;    /* front */
+
+	/* CPU */
+	fan@0 {
+		reg = <0x00>;
+		aspeed,fan-tach-ch = /bits/ 8 <0x00>;
+	};
+
+	/* rear */
+	fan@2 {
+		reg = <0x02>;
+		aspeed,fan-tach-ch = /bits/ 8 <0x02>;
+	};
+
+	/* front */
+	fan@4 {
+		reg = <0x04>;
+		aspeed,fan-tach-ch = /bits/ 8 <0x04>;
+	};
+};
+
+&gpio {
+	status = "okay";
+	gpio-line-names =
+		/*  A */ "", "", "NMI_BTN_N", "BMC_NMI", "", "", "", "",
+		/*  B */ "", "", "", "", "", "", "", "",
+		/*  C */ "", "", "", "", "", "", "", "",
+		/*  D */ "BMC_PSIN", "BMC_PSOUT", "BMC_RESETCON", "RESETCON",
+			"", "", "", "",
+		/*  E */ "", "", "", "", "", "", "", "",
+		/*  F */ "LOCATORLED_STATUS_N", "LOCATORBTN", "", "",
+			"", "", "BMC_PCH_SCI_LPC", "BMC_NCSI_MUX_CTL",
+		/*  G */ "HWM_BAT_EN", "CHASSIS_ID0", "CHASSIS_ID1", "CHASSIS_ID2",
+			"", "", "", "",
+		/*  H */ "FM_ME_RCVR_N", "O_PWROK", "", "D4_DIMM_EVENT_3V_N",
+			"MFG_MODE_N", "BMC_RTCRST", "BMC_HB_LED_N", "BMC_CASEOPEN",
+		/*  I */ "", "", "", "", "", "", "", "",
+		/*  J */ "BMC_READY", "BMC_PCH_BIOS_CS_N", "BMC_SMI", "", "", "", "", "",
+		/*  K */ "", "", "", "", "", "", "", "",
+		/*  L */ "", "", "", "", "", "", "", "",
+		/*  M */ "", "", "", "", "", "", "", "",
+		/*  N */ "", "", "", "", "", "", "", "",
+		/*  O */ "", "", "", "", "", "", "", "",
+		/*  P */ "", "", "", "", "", "", "", "",
+		/*  Q */ "", "", "", "", "", "", "", "",
+		/*  R */ "", "", "", "", "", "", "", "",
+		/*  S */ "PCHHOT_BMC_N", "", "RSMRST", "", "", "", "", "",
+		/*  T */ "", "", "", "", "", "", "", "",
+		/*  U */ "", "", "", "", "", "", "", "",
+		/*  V */ "", "", "", "", "", "", "", "",
+		/*  W */ "", "", "", "", "", "", "", "",
+		/*  X */ "", "", "", "", "", "", "", "",
+		/*  Y */ "SLP_S3", "SLP_S5", "", "", "", "", "", "",
+		/*  Z */ "CPU_CATERR_BMC_N", "", "SYSTEM_FAULT_LED_N", "BMC_THROTTLE_N",
+			"", "", "", "",
+		/* AA */ "CPU1_THERMTRIP_LATCH_N", "", "CPU1_PROCHOT_N", "",
+			"", "", "IRQ_SMI_ACTIVE_N", "FM_BIOS_POST_CMPLT_N",
+		/* AB */ "", "", "ME_OVERRIDE", "BMC_DMI_MODIFY", "", "", "", "",
+		/* AC */ "", "", "", "", "", "", "", "";
+};
+
+&adc {
+	status = "okay";
+	pinctrl-names = "default";
+	pinctrl-0 = <&pinctrl_adc0_default /* 3VSB */
+		&pinctrl_adc1_default	   /* 5VSB */
+		&pinctrl_adc2_default	   /* CPU1 */
+		&pinctrl_adc3_default	   /* VCCSA */
+		&pinctrl_adc4_default	   /* VCCM */
+		&pinctrl_adc5_default	   /* V10M */
+		&pinctrl_adc6_default	   /* VCCIO */
+		&pinctrl_adc7_default	   /* VCCGT */
+		&pinctrl_adc8_default	   /* VPPM */
+		&pinctrl_adc9_default	   /* BAT */
+		&pinctrl_adc10_default	   /* 3V */
+		&pinctrl_adc11_default	   /* 5V */
+		&pinctrl_adc12_default	   /* 12V */
+		&pinctrl_adc13_default	   /* GND */
+		&pinctrl_adc14_default	   /* GND */
+		&pinctrl_adc15_default>;   /* GND */
+};

From eadd52a6233d4e50391eb68a7a77c24a8c262313 Mon Sep 17 00:00:00 2001
From: Renze Nicolai <renze@rnplus.nl>
Date: Sat, 2 Dec 2023 01:38:44 +0100
Subject: [PATCH 010/707] dt-bindings: arm: aspeed: add Asrock X570D4U board

Document Asrock X570D4U compatible.

Signed-off-by: Renze Nicolai <renze@rnplus.nl>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Reviewed-by: Joel Stanley <joel@jms.id.au>
Link: https://lore.kernel.org/r/20231202003908.3635695-2-renze@rnplus.nl
Signed-off-by: Joel Stanley <joel@jms.id.au>
---
 Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml b/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml
index 6f7543463d894c..85e2c00a238477 100644
--- a/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml
+++ b/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml
@@ -38,6 +38,7 @@ properties:
               - asrock,e3c256d4i-bmc
               - asrock,romed8hm3-bmc
               - asrock,spc621d8hm3-bmc
+              - asrock,x570d4u-bmc
               - bytedance,g220a-bmc
               - facebook,cmm-bmc
               - facebook,minipack-bmc

From ecab6c95f79bb6143090d0d48ee26501f28e0a59 Mon Sep 17 00:00:00 2001
From: Renze Nicolai <renze@rnplus.nl>
Date: Sat, 2 Dec 2023 01:38:45 +0100
Subject: [PATCH 011/707] ARM: dts: aspeed: asrock: Add ASRock X570D4U BMC

This is a relatively low-cost AST2500-based Amd Ryzen 5000 Series
micro-ATX board that we hope can provide a decent platform for OpenBMC
development.

This initial device-tree provides the necessary configuration for
basic BMC functionality such as serial console, KVM support
and POST code snooping.

Signed-off-by: Renze Nicolai <renze@rnplus.nl>
Reviewed-by: Joel Stanley <joel@jms.id.au>
Link: https://lore.kernel.org/r/20231202003908.3635695-3-renze@rnplus.nl
Signed-off-by: Joel Stanley <joel@jms.id.au>
---
 arch/arm/boot/dts/aspeed/Makefile             |   1 +
 .../dts/aspeed/aspeed-bmc-asrock-x570d4u.dts  | 377 ++++++++++++++++++
 2 files changed, 378 insertions(+)
 create mode 100644 arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-x570d4u.dts

diff --git a/arch/arm/boot/dts/aspeed/Makefile b/arch/arm/boot/dts/aspeed/Makefile
index 3e3e6b96cb799d..fb9cc95f1b60f3 100644
--- a/arch/arm/boot/dts/aspeed/Makefile
+++ b/arch/arm/boot/dts/aspeed/Makefile
@@ -12,6 +12,7 @@ dtb-$(CONFIG_ARCH_ASPEED) += \
 	aspeed-bmc-asrock-e3c256d4i.dtb \
 	aspeed-bmc-asrock-romed8hm3.dtb \
 	aspeed-bmc-asrock-spc621d8hm3.dtb \
+	aspeed-bmc-asrock-x570d4u.dtb \
 	aspeed-bmc-bytedance-g220a.dtb \
 	aspeed-bmc-delta-ahe50dc.dtb \
 	aspeed-bmc-facebook-bletchley.dtb \
diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-x570d4u.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-x570d4u.dts
new file mode 100644
index 00000000000000..3c975bc41ae7de
--- /dev/null
+++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-x570d4u.dts
@@ -0,0 +1,377 @@
+// SPDX-License-Identifier: GPL-2.0+
+/dts-v1/;
+#include "aspeed-g5.dtsi"
+#include <dt-bindings/gpio/aspeed-gpio.h>
+#include <dt-bindings/leds/common.h>
+
+/ {
+	model = "Asrock Rack X570D4U BMC";
+	compatible = "asrock,x570d4u-bmc", "aspeed,ast2500";
+
+	aliases {
+		i2c40 = &i2c4mux0ch0;
+		i2c41 = &i2c4mux0ch1;
+		i2c42 = &i2c4mux0ch2;
+		i2c43 = &i2c4mux0ch3;
+	};
+
+	chosen {
+		stdout-path = &uart5;
+	};
+
+	memory@80000000 {
+		reg = <0x80000000 0x20000000>;
+	};
+
+	reserved-memory {
+		#address-cells = <1>;
+		#size-cells = <1>;
+		ranges;
+
+		pci_memory: region@9a000000 {
+			no-map;
+			reg = <0x9a000000 0x00010000>; /* 64K */
+		};
+
+		video_engine_memory: jpegbuffer {
+			size = <0x02800000>;	/* 40M */
+			alignment = <0x01000000>;
+			compatible = "shared-dma-pool";
+			reusable;
+		};
+
+		gfx_memory: framebuffer {
+			size = <0x01000000>;
+			alignment = <0x01000000>;
+			compatible = "shared-dma-pool";
+			reusable;
+		};
+	};
+
+	leds {
+		compatible = "gpio-leds";
+
+		led-0 {
+			/* led-heartbeat-n */
+			gpios = <&gpio ASPEED_GPIO(H, 6) GPIO_ACTIVE_LOW>;
+			color = <LED_COLOR_ID_GREEN>;
+			function = LED_FUNCTION_HEARTBEAT;
+			linux,default-trigger = "timer";
+		};
+
+		led-1 {
+			/* led-fault-n */
+			gpios = <&gpio ASPEED_GPIO(Z, 2) GPIO_ACTIVE_LOW>;
+			color = <LED_COLOR_ID_AMBER>;
+			function = LED_FUNCTION_FAULT;
+			panic-indicator;
+		};
+	};
+
+	iio-hwmon {
+		compatible = "iio-hwmon";
+		io-channels = <&adc 0>, <&adc 1>, <&adc 2>, <&adc 3>, <&adc 4>,
+			<&adc 5>, <&adc 6>, <&adc 7>, <&adc 8>, <&adc 9>,
+			<&adc 10>, <&adc 11>, <&adc 12>;
+	};
+};
+
+&gpio {
+	status = "okay";
+	gpio-line-names =
+	/*A0-A3*/       "status-locatorled-n",                    "",                      "button-nmi-n",          "",
+	/*A4-A7*/       "",                                       "",                      "",                      "",
+	/*B0-B3*/       "input-bios-post-cmplt-n",                "",                      "",                      "",
+	/*B4-B7*/       "",                                       "",                      "",                      "",
+	/*C0-C3*/       "",                                       "",                      "",                      "",
+	/*C4-C7*/       "",                                       "",                      "control-locatorbutton", "",
+	/*D0-D3*/       "button-power",                           "control-power",         "button-reset",          "control-reset",
+	/*D4-D7*/       "",                                       "",                      "",                      "",
+	/*E0-E3*/       "",                                       "",                      "",                      "",
+	/*E4-E7*/       "",                                       "",                      "",                      "",
+	/*F0-F3*/       "",                                       "",                      "",                      "",
+	/*F4-F7*/       "",                                       "",                      "",                      "",
+	/*G0-G3*/       "output-rtc-battery-voltage-read-enable", "input-id0",             "input-id1",             "input-id2",
+	/*G4-G7*/       "input-alert1-n",                         "input-alert2-n",        "input-alert3-n",        "",
+	/*H0-H3*/       "",                                       "",                      "",                      "",
+	/*H4-H7*/       "input-mfg",                              "",                      "led-heartbeat-n",       "input-caseopen",
+	/*I0-I3*/       "",                                       "",                      "",                      "",
+	/*I4-I7*/       "",                                       "",                      "",                      "",
+	/*J0-J3*/       "output-bmc-ready",                       "",                      "",                      "",
+	/*J4-J7*/       "",                                       "",                      "",                      "",
+	/*K0-K3*/       "",                                       "",                      "",                      "",
+	/*K4-K7*/       "",                                       "",                      "",                      "",
+	/*L0-L3*/       "",                                       "",                      "",                      "",
+	/*L4-L7*/       "",                                       "",                      "",                      "",
+	/*M0-M3*/       "",                                       "",                      "",                      "",
+	/*M4-M7*/       "",                                       "",                      "",                      "",
+	/*N0-N3*/       "",                                       "",                      "",                      "",
+	/*N4-N7*/       "",                                       "",                      "",                      "",
+	/*O0-O3*/       "",                                       "",                      "",                      "",
+	/*O4-O7*/       "",                                       "",                      "",                      "",
+	/*P0-P3*/       "",                                       "",                      "",                      "",
+	/*P4-P7*/       "",                                       "",                      "",                      "",
+	/*Q0-Q3*/       "",                                       "",                      "",                      "",
+	/*Q4-Q7*/       "",                                       "",                      "",                      "",
+	/*R0-R3*/       "",                                       "",                      "",                      "",
+	/*R4-R7*/       "",                                       "",                      "",                      "",
+	/*S0-S3*/       "input-bmc-pchhot-n",                     "",                      "",                      "",
+	/*S4-S7*/       "",                                       "",                      "",                      "",
+	/*T0-T3*/       "",                                       "",                      "",                      "",
+	/*T4-T7*/       "",                                       "",                      "",                      "",
+	/*U0-U3*/       "",                                       "",                      "",                      "",
+	/*U4-U7*/       "",                                       "",                      "",                      "",
+	/*V0-V3*/       "",                                       "",                      "",                      "",
+	/*V4-V7*/       "",                                       "",                      "",                      "",
+	/*W0-W3*/       "",                                       "",                      "",                      "",
+	/*W4-W7*/       "",                                       "",                      "",                      "",
+	/*X0-X3*/       "",                                       "",                      "",                      "",
+	/*X4-X7*/       "",                                       "",                      "",                      "",
+	/*Y0-Y3*/       "",                                       "",                      "",                      "",
+	/*Y4-Y7*/       "",                                       "",                      "",                      "",
+	/*Z0-Z3*/       "",                                       "",                      "led-fault-n",           "output-bmc-throttle-n",
+	/*Z4-Z7*/       "",                                       "",                      "",                      "",
+	/*AA0-AA3*/     "input-cpu1-thermtrip-latch-n",           "",                      "input-cpu1-prochot-n",  "",
+	/*AA4-AC7*/     "",                                       "",                      "",                      "",
+	/*AB0-AB3*/     "",                                       "",                      "",                      "",
+	/*AB4-AC7*/     "",                                       "",                      "",                      "",
+	/*AC0-AC3*/     "",                                       "",                      "",                      "",
+	/*AC4-AC7*/     "",                                       "",                      "",                      "";
+};
+
+&fmc {
+	status = "okay";
+	flash@0 {
+		status = "okay";
+		label = "bmc";
+		m25p,fast-read;
+		spi-max-frequency = <10000000>;
+#include "openbmc-flash-layout-64.dtsi"
+	};
+};
+
+&uart5 {
+	status = "okay";
+};
+
+&vuart {
+	status = "okay";
+};
+
+&mac0 {
+	status = "okay";
+	pinctrl-names = "default";
+	pinctrl-0 = <&pinctrl_rgmii1_default &pinctrl_mdio1_default>;
+
+	nvmem-cells = <&eth0_macaddress>;
+	nvmem-cell-names = "mac-address";
+};
+
+&mac1 {
+	status = "okay";
+	pinctrl-names = "default";
+	pinctrl-0 = <&pinctrl_rmii2_default &pinctrl_mdio2_default>;
+	use-ncsi;
+
+	nvmem-cells = <&eth1_macaddress>;
+	nvmem-cell-names = "mac-address";
+};
+
+&i2c0 {
+	/* SMBus on auxiliary panel header (AUX_PANEL1) */
+	status = "okay";
+};
+
+&i2c1 {
+	status = "okay";
+
+	w83773g@4c {
+		compatible = "nuvoton,w83773g";
+		reg = <0x4c>;
+	};
+};
+
+&i2c2 {
+	/* PSU SMBus (PSU_SMB1) */
+	status = "okay";
+};
+
+&i2c3 {
+	status = "okay";
+};
+
+&i2c4 {
+	status = "okay";
+
+	i2c-mux@70 {
+		compatible = "nxp,pca9545";
+		reg = <0x70>;
+		#address-cells = <1>;
+		#size-cells = <0>;
+
+		i2c4mux0ch0: i2c@0 {
+			/* SMBus on PCI express 16x slot */
+			#address-cells = <1>;
+			#size-cells = <0>;
+			reg = <0>;
+		};
+
+		i2c4mux0ch1: i2c@1 {
+			/* SMBus on PCI express 8x slot */
+			#address-cells = <1>;
+			#size-cells = <0>;
+			reg = <1>;
+		};
+
+		i2c4mux0ch2: i2c@2 {
+			/* Unknown */
+			#address-cells = <1>;
+			#size-cells = <0>;
+			reg = <2>;
+		};
+
+		i2c4mux0ch3: i2c@3 {
+			/* SMBus on PCI express 1x slot */
+			#address-cells = <1>;
+			#size-cells = <0>;
+			reg = <3>;
+		};
+	};
+};
+
+&i2c5 {
+	status = "okay";
+};
+
+&i2c7 {
+	/* FRU and SPD EEPROM SMBus */
+	status = "okay";
+
+	eeprom@57 {
+		compatible = "st,24c128", "atmel,24c128";
+		reg = <0x57>;
+		pagesize = <16>;
+		#address-cells = <1>;
+		#size-cells = <1>;
+
+		eth0_macaddress: macaddress@3f80 {
+			reg = <0x3f80 6>;
+		};
+
+		eth1_macaddress: macaddress@3f88 {
+			reg = <0x3f88 6>;
+		};
+	};
+};
+
+&gfx {
+	status = "okay";
+};
+
+&pinctrl {
+	aspeed,external-nodes = <&gfx &lhc>;
+};
+
+&vhub {
+	status = "okay";
+};
+
+&ehci1 {
+	status = "okay";
+};
+
+&uhci {
+	status = "okay";
+};
+
+&kcs3 {
+	aspeed,lpc-io-reg = <0xca2>;
+	status = "okay";
+};
+
+&lpc_ctrl {
+	status = "okay";
+};
+
+&lpc_snoop {
+	status = "okay";
+	snoop-ports = <0x80>;
+};
+
+&p2a {
+	status = "okay";
+	memory-region = <&pci_memory>;
+};
+
+&video {
+	status = "okay";
+	memory-region = <&video_engine_memory>;
+};
+
+&pwm_tacho {
+	status = "okay";
+	pinctrl-names = "default";
+	pinctrl-0 = <&pinctrl_pwm0_default
+				&pinctrl_pwm1_default
+				&pinctrl_pwm2_default
+				&pinctrl_pwm3_default
+				&pinctrl_pwm4_default
+				&pinctrl_pwm5_default>;
+
+	fan@0 {
+		/* FAN1 (4-pin) */
+		reg = <0x00>;
+		aspeed,fan-tach-ch = /bits/ 8 <0x00>;
+	};
+
+	fan@1 {
+		/* FAN2 (4-pin) */
+		reg = <0x01>;
+		aspeed,fan-tach-ch = /bits/ 8 <0x01>;
+	};
+
+	fan@2 {
+		/* FAN3 (4-pin) */
+		reg = <0x02>;
+		aspeed,fan-tach-ch = /bits/ 8 <0x02>;
+	};
+
+	fan@3 {
+		/* FAN4 (6-pin) */
+		reg = <0x03>;
+		aspeed,fan-tach-ch = /bits/ 8 <0x04 0x0b>;
+	};
+
+	fan@4 {
+		/* FAN6 (6-pin) */
+		reg = <0x04>;
+		aspeed,fan-tach-ch = /bits/ 8 <0x06 0x0d>;
+	};
+
+	fan@5 {
+		/* FAN5 (6-pin) */
+		reg = <0x05>;
+		aspeed,fan-tach-ch = /bits/ 8 <0x05 0x0c>;
+	};
+};
+
+&adc {
+	status = "okay";
+	pinctrl-names = "default";
+	pinctrl-0 = <&pinctrl_adc0_default
+				&pinctrl_adc1_default
+				&pinctrl_adc2_default
+				&pinctrl_adc3_default
+				&pinctrl_adc4_default
+				&pinctrl_adc5_default
+				&pinctrl_adc6_default
+				&pinctrl_adc7_default
+				&pinctrl_adc8_default
+				&pinctrl_adc9_default
+				&pinctrl_adc10_default
+				&pinctrl_adc11_default
+				&pinctrl_adc12_default
+				&pinctrl_adc13_default
+				&pinctrl_adc14_default
+				&pinctrl_adc15_default>;
+};

From 763f0b3f1402cbb9d1ec7a0a37bcc6ebb465b119 Mon Sep 17 00:00:00 2001
From: Peter Yin <peteryin.openbmc@gmail.com>
Date: Tue, 12 Dec 2023 00:26:54 +0800
Subject: [PATCH 012/707] dt-bindings: arm: aspeed: add Meta Harma board

Document the new compatibles used on Meta Harma.

Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Peter Yin <peteryin.openbmc@gmail.com>
Link: https://lore.kernel.org/r/20231211162656.2564267-2-peteryin.openbmc@gmail.com
Signed-off-by: Joel Stanley <joel@jms.id.au>
---
 Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml b/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml
index 85e2c00a238477..7dfcdc2d571eb0 100644
--- a/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml
+++ b/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml
@@ -82,6 +82,7 @@ properties:
               - facebook,elbert-bmc
               - facebook,fuji-bmc
               - facebook,greatlakes-bmc
+              - facebook,harma-bmc
               - facebook,minerva-cmc
               - facebook,yosemite4-bmc
               - ibm,everest-bmc

From e17770a3388e05aa59d6a8d4fbcd2a4130222db7 Mon Sep 17 00:00:00 2001
From: Peter Yin <peteryin.openbmc@gmail.com>
Date: Tue, 12 Dec 2023 00:26:55 +0800
Subject: [PATCH 013/707] ARM: dts: aspeed: Harma: Add Meta Harma (AST2600) BMC

Add linux device tree entry related to
the Meta(Facebook) computer-node system use an AT2600 BMC.
This node is named "Harma".

Signed-off-by: Peter Yin <peteryin.openbmc@gmail.com>
Reviewed-by: Joel Stanley <joel@jms.id.au>
Link: https://lore.kernel.org/r/20231211162656.2564267-3-peteryin.openbmc@gmail.com
Signed-off-by: Joel Stanley <joel@jms.id.au>
---
 arch/arm/boot/dts/aspeed/Makefile             |   1 +
 .../dts/aspeed/aspeed-bmc-facebook-harma.dts  | 585 ++++++++++++++++++
 2 files changed, 586 insertions(+)
 create mode 100644 arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-harma.dts

diff --git a/arch/arm/boot/dts/aspeed/Makefile b/arch/arm/boot/dts/aspeed/Makefile
index fb9cc95f1b60f3..6ecc21d04a6299 100644
--- a/arch/arm/boot/dts/aspeed/Makefile
+++ b/arch/arm/boot/dts/aspeed/Makefile
@@ -22,6 +22,7 @@ dtb-$(CONFIG_ARCH_ASPEED) += \
 	aspeed-bmc-facebook-fuji.dtb \
 	aspeed-bmc-facebook-galaxy100.dtb \
 	aspeed-bmc-facebook-greatlakes.dtb \
+	aspeed-bmc-facebook-harma.dtb \
 	aspeed-bmc-facebook-minerva-cmc.dtb \
 	aspeed-bmc-facebook-minipack.dtb \
 	aspeed-bmc-facebook-tiogapass.dtb \
diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-harma.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-harma.dts
new file mode 100644
index 00000000000000..7db3f9eb00161a
--- /dev/null
+++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-harma.dts
@@ -0,0 +1,585 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+// Copyright 2023 Facebook Inc.
+
+/dts-v1/;
+#include "aspeed-g6.dtsi"
+#include <dt-bindings/gpio/aspeed-gpio.h>
+#include <dt-bindings/i2c/i2c.h>
+
+/ {
+	model = "Facebook Harma";
+	compatible = "facebook,harma-bmc", "aspeed,ast2600";
+
+	aliases {
+		serial0 = &uart1;
+		serial1 = &uart6;
+		serial2 = &uart2;
+		serial4 = &uart5;
+
+		i2c20 = &imux20;
+		i2c21 = &imux21;
+		i2c22 = &imux22;
+		i2c23 = &imux23;
+		i2c24 = &imux24;
+		i2c25 = &imux25;
+		i2c26 = &imux26;
+		i2c27 = &imux27;
+		i2c28 = &imux28;
+		i2c29 = &imux29;
+		i2c30 = &imux30;
+		i2c31 = &imux31;
+	};
+
+	chosen {
+		stdout-path = &uart5;
+	};
+
+	memory@80000000 {
+		device_type = "memory";
+		reg = <0x80000000 0x80000000>;
+	};
+
+	iio-hwmon {
+		compatible = "iio-hwmon";
+		io-channels = <&adc0 0>, <&adc0 1>, <&adc0 2>, <&adc0 3>,
+			      <&adc0 4>, <&adc0 5>, <&adc0 6>, <&adc0 7>,
+			      <&adc1 2>;
+	};
+
+	leds {
+		compatible = "gpio-leds";
+
+		led-0 {
+			label = "bmc_heartbeat_amber";
+			gpios = <&gpio0 ASPEED_GPIO(P, 7) GPIO_ACTIVE_LOW>;
+			linux,default-trigger = "heartbeat";
+		};
+
+		led-1 {
+			label = "fp_id_amber";
+			default-state = "off";
+			gpios = <&gpio0 13 GPIO_ACTIVE_HIGH>;
+		};
+
+		led-2 {
+			label = "power_blue";
+			default-state = "off";
+			gpios = <&gpio0 124 GPIO_ACTIVE_HIGH>;
+		};
+	};
+};
+
+// HOST BIOS Debug
+&uart1 {
+	status = "okay";
+};
+
+// SOL Host Console
+&uart2 {
+	status = "okay";
+	pinctrl-0 = <>;
+};
+
+// SOL BMC Console
+&uart4 {
+	status = "okay";
+	pinctrl-0 = <>;
+};
+
+// BMC Debug Console
+&uart5 {
+	status = "okay";
+};
+
+// MTIA
+&uart6 {
+	status = "okay";
+};
+
+&uart_routing {
+	status = "okay";
+};
+
+&vuart1 {
+	status = "okay";
+};
+
+&wdt1 {
+	status = "okay";
+	pinctrl-names = "default";
+	pinctrl-0 = <&pinctrl_wdtrst1_default>;
+	aspeed,reset-type = "soc";
+	aspeed,external-signal;
+	aspeed,ext-push-pull;
+	aspeed,ext-active-high;
+	aspeed,ext-pulse-duration = <256>;
+};
+
+&mac3 {
+	status = "okay";
+	pinctrl-names = "default";
+	pinctrl-0 = <&pinctrl_rmii4_default>;
+	use-ncsi;
+	mlx,multi-host;
+};
+
+&rtc {
+	status = "okay";
+};
+
+&fmc {
+	status = "okay";
+
+	flash@0 {
+		status = "okay";
+		m25p,fast-read;
+		label = "bmc";
+		spi-max-frequency = <50000000>;
+#include "openbmc-flash-layout-128.dtsi"
+	};
+
+	flash@1 {
+		status = "okay";
+		m25p,fast-read;
+		label = "alt-bmc";
+		spi-max-frequency = <50000000>;
+	};
+};
+
+// BIOS Flash
+&spi2 {
+	status = "okay";
+	pinctrl-names = "default";
+	pinctrl-0 = <&pinctrl_spi2_default>;
+
+	flash@0 {
+		status = "okay";
+		m25p,fast-read;
+		label = "pnor";
+		spi-max-frequency = <12000000>;
+		spi-tx-bus-width = <2>;
+		spi-rx-bus-width = <2>;
+	};
+};
+
+&kcs2 {
+	status = "okay";
+	aspeed,lpc-io-reg = <0xca8>;
+};
+
+&kcs3 {
+	status = "okay";
+	aspeed,lpc-io-reg = <0xca2>;
+};
+
+&i2c0 {
+	status = "okay";
+
+	max31790@30{
+		compatible = "max31790";
+		reg = <0x30>;
+		#address-cells = <1>;
+		#size-cells = <0>;
+	};
+};
+
+&i2c1 {
+	status = "okay";
+
+	tmp75@4b {
+		compatible = "ti,tmp75";
+		reg = <0x4b>;
+	};
+};
+
+&i2c2 {
+	status = "okay";
+
+	max31790@30{
+		compatible = "max31790";
+		reg = <0x30>;
+		#address-cells = <1>;
+		#size-cells = <0>;
+	};
+};
+
+&i2c3 {
+	status = "okay";
+
+	i2c-mux@70 {
+		compatible = "nxp,pca9543";
+		reg = <0x70>;
+		#address-cells = <1>;
+		#size-cells = <0>;
+
+		imux20: i2c@0 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			reg = <0>;
+			//Retimer Flash
+			eeprom@50 {
+				compatible = "atmel,24c2048";
+				reg = <0x50>;
+				pagesize = <128>;
+			};
+		};
+		imux21: i2c@1 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			reg = <1>;
+		};
+	};
+};
+
+&i2c4 {
+	status = "okay";
+	// PDB FRU
+	eeprom@52 {
+		compatible = "atmel,24c64";
+		reg = <0x52>;
+	};
+
+	delta_brick@69 {
+		compatible = "pmbus";
+		reg = <0x69>;
+	};
+};
+
+&i2c5 {
+	status = "okay";
+};
+
+&i2c6 {
+	status = "okay";
+
+	i2c-mux@70 {
+		compatible = "nxp,pca9543";
+		reg = <0x70>;
+		#address-cells = <1>;
+		#size-cells = <0>;
+
+		imux22: i2c@0 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			reg = <0>;
+		};
+		imux23: i2c@1 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			reg = <1>;
+		};
+	};
+};
+
+&i2c7 {
+	status = "okay";
+};
+
+&i2c8 {
+	status = "okay";
+};
+
+&i2c9 {
+	status = "okay";
+
+	gpio@30 {
+		compatible = "nxp,pca9555";
+		reg = <0x30>;
+		gpio-controller;
+		#gpio-cells = <2>;
+	};
+	gpio@31 {
+		compatible = "nxp,pca9555";
+		reg = <0x31>;
+		gpio-controller;
+		#gpio-cells = <2>;
+	};
+
+	i2c-mux@71 {
+		compatible = "nxp,pca9546";
+		reg = <0x71>;
+		#address-cells = <1>;
+		#size-cells = <0>;
+
+		imux24: i2c@0 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			reg = <0>;
+		};
+		imux25: i2c@1 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			reg = <1>;
+		};
+		imux26: i2c@2 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			reg = <2>;
+		};
+		imux27: i2c@3 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			reg = <3>;
+		};
+	};
+	// PTTV FRU
+	eeprom@52 {
+		compatible = "atmel,24c64";
+		reg = <0x52>;
+	};
+};
+
+&i2c11 {
+	status = "okay";
+};
+
+&i2c12 {
+	status = "okay";
+};
+
+&i2c13 {
+	status = "okay";
+
+	i2c-mux@70 {
+		compatible = "nxp,pca9545";
+		reg = <0x70>;
+		#address-cells = <1>;
+		#size-cells = <0>;
+
+		imux28: i2c@0 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			reg = <0>;
+		};
+		imux29: i2c@1 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			reg = <1>;
+			//MB FRU
+			eeprom@54 {
+				compatible = "atmel,24c64";
+				reg = <0x54>;
+			};
+		};
+		imux30: i2c@2 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			reg = <2>;
+		};
+		imux31: i2c@3 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			reg = <3>;
+		};
+	};
+};
+
+// To Debug card
+&i2c14 {
+	status = "okay";
+	multi-master;
+
+	ipmb@10 {
+		compatible = "ipmb-dev";
+		reg = <(0x10 | I2C_OWN_SLAVE_ADDRESS)>;
+		i2c-protocol;
+	};
+};
+
+&i2c15 {
+	status = "okay";
+
+	// SCM FRU
+	eeprom@50 {
+		compatible = "atmel,24c64";
+		reg = <0x50>;
+	};
+
+	// BSM FRU
+	eeprom@56 {
+		compatible = "atmel,24c64";
+		reg = <0x56>;
+	};
+};
+
+&adc0 {
+	aspeed,int-vref-microvolt = <2500000>;
+	status = "okay";
+	pinctrl-0 = <&pinctrl_adc0_default &pinctrl_adc1_default
+		&pinctrl_adc2_default &pinctrl_adc3_default
+		&pinctrl_adc4_default &pinctrl_adc5_default
+		&pinctrl_adc6_default &pinctrl_adc7_default>;
+};
+
+&adc1 {
+	aspeed,int-vref-microvolt = <2500000>;
+	status = "okay";
+	pinctrl-names = "default";
+	pinctrl-0 = <&pinctrl_adc10_default>;
+};
+
+&ehci0 {
+	status = "okay";
+};
+
+&gpio0 {
+	pinctrl-names = "default";
+	gpio-line-names =
+	/*A0-A7*/	"","","","","","","","",
+	/*B0-B7*/	"","","","",
+			"bmc-spi-mux-select-0","led-identify","","",
+	/*C0-C7*/	"","","","","","","","",
+	/*D0-D7*/	"","","sol-uart-select","","","","","",
+	/*E0-E7*/	"","","","","","","","",
+	/*F0-F7*/	"","","","","","","","",
+	/*G0-G7*/	"","","","","","","","",
+	/*H0-H7*/	"","","","","","","","",
+	/*I0-I7*/	"","","","","","","","",
+	/*J0-J7*/	"","","","","","","","",
+	/*K0-K7*/	"","","","","","","","",
+	/*L0-L7*/	"","","","","","","","",
+	/*M0-M7*/	"","","","","","","","",
+	/*N0-N7*/	"led-postcode-0","led-postcode-1",
+			"led-postcode-2","led-postcode-3",
+			"led-postcode-4","led-postcode-5",
+			"led-postcode-6","led-postcode-7",
+	/*O0-O7*/	"","","","","","","","",
+	/*P0-P7*/	"power-button","power-host-control",
+			"reset-button","","led-power","","","",
+	/*Q0-Q7*/	"","","","","","","","",
+	/*R0-R7*/	"","","","","","","","",
+	/*S0-S7*/	"","","","","","","","",
+	/*T0-T7*/	"","","","","","","","",
+	/*U0-U7*/	"","","","","","","led-identify-gate","",
+	/*V0-V7*/	"","","","",
+			"rtc-battery-voltage-read-enable","","","",
+	/*W0-W7*/	"","","","","","","","",
+	/*X0-X7*/	"","","","","","","","",
+	/*Y0-Y7*/	"","","","","","","","",
+	/*Z0-Z7*/	"","","","","","","","";
+};
+
+&sgpiom0 {
+	status = "okay";
+	max-ngpios = <128>;
+	ngpios = <128>;
+	bus-frequency = <2000000>;
+	gpio-line-names =
+	/*in - out - in - out */
+	/*A0-A3 line 0-7*/
+	"presence-scm-cable","power-config-disable-e1s-0",
+	"","",
+	"","power-config-disable-e1s-1",
+	"","",
+	/*A4-A7 line 8-15*/
+	"","power-config-asic-module-enable",
+	"","power-config-asic-power-good",
+	"","power-config-pdb-power-good",
+	"presence-cpu","smi-control-n",
+	/*B0-B3 line 16-23*/
+	"","nmi-control-n",
+	"","nmi-control-sync-flood-n",
+	"","",
+	"","",
+	/*B4-B7 line 24-31*/
+	"","FM_CPU_SP5R1",
+	"reset-cause-rsmrst","FM_CPU_SP5R2",
+	"","FM_CPU_SP5R3",
+	"","FM_CPU_SP5R4",
+	/*C0-C3 line 32-39*/
+	"","FM_CPU0_SA0",
+	"","FM_CPU0_SA1",
+	"","rt-cpu0-p0-enable",
+	"","rt-cpu0-p1-enable",
+	/*C4-C7 line 40-47*/
+	"","smb-rt-rom-p0-select",
+	"","smb-rt-rom-p1-select",
+	"","i3c-cpu-mux0-oe-n",
+	"","i3c-cpu-mux0-select",
+	/*D0-D3 line 48-55*/
+	"","i3c-cpu-mux1-oe-n",
+	"","i3c-cpu-mux1-select",
+	"","reset-control-bmc",
+	"","reset-control-cpu0-p0-mux",
+	/*D4-D7 line 56-63*/
+	"","reset-control-cpu0-p1-mux",
+	"","reset-control-e1s-mux",
+	"power-host-good","reset-control-mb-mux",
+	"","reset-control-smb-e1s",
+	/*E0-E3 line 64-71*/
+	"","reset-control-smb-e1s",
+	"host-ready-n","reset-control-srst",
+	"presence-e1s-0","reset-control-usb-hub",
+	"","reset-control",
+	/*E4-E7 line 72-79*/
+	"presence-e1s-1","reset-control-cpu-kbrst",
+	"","reset-control-platrst",
+	"","bmc-jtag-mux-select-0",
+	"","bmc-jtag-mux-select-1",
+	/*F0-F3 line 80-87*/
+	"","bmc-jtag-select",
+	"","bmc-ready-n",
+	"","bmc-ready-sgpio",
+	"","rt-cpu0-p0-force-enable",
+	/*F4-F7 line 88-95*/
+	"presence-asic-modules-0","rt-cpu0-p1-force-enable",
+	"presence-asic-modules-1","bios-debug-msg-disable",
+	"","uart-control-buffer-select",
+	"","ac-control-n",
+	/*G0-G3 line 96-103*/
+	"FM_CPU_CORETYPE2","",
+	"FM_CPU_CORETYPE1","",
+	"FM_CPU_CORETYPE0","",
+	"FM_BOARD_REV_ID5","",
+	/*G4-G7 line 104-111*/
+	"FM_BOARD_REV_ID4","",
+	"FM_BOARD_REV_ID3","",
+	"FM_BOARD_REV_ID2","",
+	"FM_BOARD_REV_ID1","",
+	/*H0-H3 line 112-119*/
+	"FM_BOARD_REV_ID0","",
+	"","","","","","",
+	/*H4-H7 line 120-127*/
+	"","",
+	"reset-control-pcie-expansion-3","",
+	"reset-control-pcie-expansion-2","",
+	"reset-control-pcie-expansion-1","",
+	/*I0-I3 line 128-135*/
+	"reset-control-pcie-expansion-0","",
+	"FM_EXP_SLOT_ID1","",
+	"FM_EXP_SLOT_ID0","",
+	"","",
+	/*I4-I7 line 136-143*/
+	"","","","","","","","",
+	/*J0-J3 line 144-151*/
+	"","","","","","","","",
+	/*J4-J7 line 152-159*/
+	"SLOT_ID_BCB_0","",
+	"SLOT_ID_BCB_1","",
+	"SLOT_ID_BCB_2","",
+	"SLOT_ID_BCB_3","",
+	/*K0-K3 line 160-167*/
+	"","","","","","","","",
+	/*K4-K7 line 168-175*/
+	"","","","","","","","",
+	/*L0-L3 line 176-183*/
+	"","","","","","","","",
+	/*L4-L7 line 184-191*/
+	"","","","","","","","",
+	/*M0-M3 line 192-199*/
+	"","","","","","","","",
+	/*M4-M7 line 200-207*/
+	"","","","","","","","",
+	/*N0-N3 line 208-215*/
+	"","","","","","","","",
+	/*N4-N7 line 216-223*/
+	"","","","","","","","",
+	/*O0-O3 line 224-231*/
+	"","","","","","","","",
+	/*O4-O7 line 232-239*/
+	"","","","","","","","",
+	/*P0-P3 line 240-247*/
+	"","","","","","","","",
+	/*P4-P7 line 248-255*/
+	"","","","","","","","";
+};

From 965a8ea59ec55483fb9311036ef928d16770a63a Mon Sep 17 00:00:00 2001
From: Yang Chen <yangchen.openbmc@gmail.com>
Date: Tue, 12 Dec 2023 15:51:50 +0800
Subject: [PATCH 014/707] ARM: dts: aspeed: minerva: Revise the name of DTS

The project Minerva which is the platform used by Meta has two boards: the
Chassis Management Module (Minerva) and the Motherboard (Harma), so change
the DTS name to minerva here for CMM use.

Signed-off-by: Yang Chen <yangchen.openbmc@gmail.com>
Reviewed-by: Joel Stanley <joel@jms.id.au>
Link: https://lore.kernel.org/r/20231212075200.983536-2-yangchen.openbmc@gmail.com
Signed-off-by: Joel Stanley <joel@jms.id.au>
---
 arch/arm/boot/dts/aspeed/Makefile                               | 2 +-
 ...facebook-minerva-cmc.dts => aspeed-bmc-facebook-minerva.dts} | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
 rename arch/arm/boot/dts/aspeed/{aspeed-bmc-facebook-minerva-cmc.dts => aspeed-bmc-facebook-minerva.dts} (99%)

diff --git a/arch/arm/boot/dts/aspeed/Makefile b/arch/arm/boot/dts/aspeed/Makefile
index 6ecc21d04a6299..75fff585675a0b 100644
--- a/arch/arm/boot/dts/aspeed/Makefile
+++ b/arch/arm/boot/dts/aspeed/Makefile
@@ -23,7 +23,7 @@ dtb-$(CONFIG_ARCH_ASPEED) += \
 	aspeed-bmc-facebook-galaxy100.dtb \
 	aspeed-bmc-facebook-greatlakes.dtb \
 	aspeed-bmc-facebook-harma.dtb \
-	aspeed-bmc-facebook-minerva-cmc.dtb \
+	aspeed-bmc-facebook-minerva.dtb \
 	aspeed-bmc-facebook-minipack.dtb \
 	aspeed-bmc-facebook-tiogapass.dtb \
 	aspeed-bmc-facebook-wedge40.dtb \
diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva-cmc.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts
similarity index 99%
rename from arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva-cmc.dts
rename to arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts
index f04ef906352080..c755fb3258a485 100644
--- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva-cmc.dts
+++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts
@@ -7,7 +7,7 @@
 #include <dt-bindings/i2c/i2c.h>
 
 / {
-	model = "Facebook Minerva CMC";
+	model = "Facebook Minerva CMM";
 	compatible = "facebook,minerva-cmc", "aspeed,ast2600";
 
 	aliases {

From bbdcf72f21fd0e1cc118ad304a867fa863ce2a21 Mon Sep 17 00:00:00 2001
From: Yang Chen <yangchen.openbmc@gmail.com>
Date: Tue, 12 Dec 2023 15:51:51 +0800
Subject: [PATCH 015/707] ARM: dts: aspeed: minerva: Modify mac3 setting

Remove the unuse setting and fix the link to 100 M

Signed-off-by: Yang Chen <yangchen.openbmc@gmail.com>
Reviewed-by: Joel Stanley <joel@jms.id.au>
Link: https://lore.kernel.org/r/20231212075200.983536-3-yangchen.openbmc@gmail.com
Signed-off-by: Joel Stanley <joel@jms.id.au>
---
 arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts
index c755fb3258a485..9979dba1ef0e4a 100644
--- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts
+++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts
@@ -48,10 +48,13 @@
 
 &mac3 {
 	status = "okay";
+	phy-mode = "rmii";
 	pinctrl-names = "default";
 	pinctrl-0 = <&pinctrl_rmii4_default>;
-	use-ncsi;
-	mlx,multi-host;
+	fixed-link {
+		speed = <100>;
+		full-duplex;
+	};
 };
 
 &fmc {

From f15468aa4cdf3fe6568555d0f3e6df477f8b616d Mon Sep 17 00:00:00 2001
From: Yang Chen <yangchen.openbmc@gmail.com>
Date: Tue, 12 Dec 2023 15:51:52 +0800
Subject: [PATCH 016/707] ARM: dts: aspeed: minerva: Change sgpio use

Correct the sgpio use from sgpiom1 to sgpiom0

Signed-off-by: Yang Chen <yangchen.openbmc@gmail.com>
Reviewed-by: Joel Stanley <joel@jms.id.au>
Link: https://lore.kernel.org/r/20231212075200.983536-4-yangchen.openbmc@gmail.com
Signed-off-by: Joel Stanley <joel@jms.id.au>
---
 arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts
index 9979dba1ef0e4a..ad77057f921cd1 100644
--- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts
+++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts
@@ -78,7 +78,7 @@
 	status = "okay";
 };
 
-&sgpiom1 {
+&sgpiom0 {
 	status = "okay";
 	ngpios = <128>;
 	bus-frequency = <2000000>;

From aca2d2f36bf73b6f129d7626b60ddc1f58e91f6c Mon Sep 17 00:00:00 2001
From: Yang Chen <yangchen.openbmc@gmail.com>
Date: Tue, 12 Dec 2023 15:51:53 +0800
Subject: [PATCH 017/707] ARM: dts: aspeed: minerva: Enable power monitor
 device

Enable power monitor device ina230 and ltc2945 on the i2c bus 0

Signed-off-by: Yang Chen <yangchen.openbmc@gmail.com>
Reviewed-by: Joel Stanley <joel@jms.id.au>
Link: https://lore.kernel.org/r/20231212075200.983536-5-yangchen.openbmc@gmail.com
Signed-off-by: Joel Stanley <joel@jms.id.au>
---
 .../aspeed/aspeed-bmc-facebook-minerva.dts    | 22 +++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts
index ad77057f921cd1..ee9691647e4a50 100644
--- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts
+++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts
@@ -86,6 +86,28 @@
 
 &i2c0 {
 	status = "okay";
+
+	power-monitor@40 {
+		compatible = "ti,ina230";
+		reg = <0x40>;
+		shunt-resistor = <1000>;
+	};
+
+	power-monitor@41 {
+		compatible = "ti,ina230";
+		reg = <0x41>;
+		shunt-resistor = <1000>;
+	};
+
+	power-monitor@67 {
+		compatible = "adi,ltc2945";
+		reg = <0x67>;
+	};
+
+	power-monitor@68 {
+		compatible = "adi,ltc2945";
+		reg = <0x68>;
+	};
 };
 
 &i2c1 {

From 10f776c80b1a5d2834430c850457a1d10f12a9ce Mon Sep 17 00:00:00 2001
From: Yang Chen <yangchen.openbmc@gmail.com>
Date: Tue, 12 Dec 2023 15:51:54 +0800
Subject: [PATCH 018/707] ARM: dts: aspeed: minerva: Add temperature sensor

Add one temperature sensor on i2c bus 1

Signed-off-by: Yang Chen <yangchen.openbmc@gmail.com>
Reviewed-by: Joel Stanley <joel@jms.id.au>
Link: https://lore.kernel.org/r/20231212075200.983536-6-yangchen.openbmc@gmail.com
Signed-off-by: Joel Stanley <joel@jms.id.au>
---
 arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts
index ee9691647e4a50..783d4d5a8f3d7e 100644
--- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts
+++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts
@@ -115,7 +115,12 @@
 
 	temperature-sensor@4b {
 		compatible = "ti,tmp75";
-		reg = <0x4B>;
+		reg = <0x4b>;
+	};
+
+	temperature-sensor@48 {
+		compatible = "ti,tmp75";
+		reg = <0x48>;
 	};
 
 	eeprom@51 {

From 96b198848ecd07165f58d93e195ca63244c8dbd4 Mon Sep 17 00:00:00 2001
From: Yang Chen <yangchen.openbmc@gmail.com>
Date: Tue, 12 Dec 2023 15:51:55 +0800
Subject: [PATCH 019/707] ARM: dts: aspeed: minerva: correct the address of
 eeprom

Correct the address from 0x51 to 0x54 of eeprom on the i2c bus 1

Signed-off-by: Yang Chen <yangchen.openbmc@gmail.com>
Reviewed-by: Joel Stanley <joel@jms.id.au>
Link: https://lore.kernel.org/r/20231212075200.983536-7-yangchen.openbmc@gmail.com
Signed-off-by: Joel Stanley <joel@jms.id.au>
---
 arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts
index 783d4d5a8f3d7e..f2a48033ac5ce7 100644
--- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts
+++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts
@@ -123,9 +123,9 @@
 		reg = <0x48>;
 	};
 
-	eeprom@51 {
+	eeprom@54 {
 		compatible = "atmel,24c128";
-		reg = <0x51>;
+		reg = <0x54>;
 	};
 };
 

From 0a40f5979a40e6fe43eaede7d3244dfde86653ee Mon Sep 17 00:00:00 2001
From: Yang Chen <yangchen.openbmc@gmail.com>
Date: Tue, 12 Dec 2023 15:51:56 +0800
Subject: [PATCH 020/707] ARM: dts: aspeed: minerva: add bus labels and aliases

Add bus labels and aliases for the fan control board.

Signed-off-by: Yang Chen <yangchen.openbmc@gmail.com>
Reviewed-by: Joel Stanley <joel@jms.id.au>
Link: https://lore.kernel.org/r/20231212075200.983536-8-yangchen.openbmc@gmail.com
Signed-off-by: Joel Stanley <joel@jms.id.au>
---
 .../aspeed/aspeed-bmc-facebook-minerva.dts    | 22 ++++++++++++++-----
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts
index f2a48033ac5ce7..f4cb5ef72310f9 100644
--- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts
+++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts
@@ -12,6 +12,16 @@
 
 	aliases {
 		serial5 = &uart5;
+		/*
+		 * PCA9548 (2-0077) provides 8 channels connecting to
+		 * 6 pcs of FCB (Fan Controller Board).
+		 */
+		i2c16 = &imux16;
+		i2c17 = &imux17;
+		i2c18 = &imux18;
+		i2c19 = &imux19;
+		i2c20 = &imux20;
+		i2c21 = &imux21;
 	};
 
 	chosen {
@@ -139,7 +149,7 @@
 		#size-cells = <0>;
 		i2c-mux-idle-disconnect;
 
-		i2c@0 {
+		imux16: i2c@0 {
 			#address-cells = <1>;
 			#size-cells = <0>;
 			reg = <0>;
@@ -150,7 +160,7 @@
 			};
 		};
 
-		i2c@1 {
+		imux17: i2c@1 {
 			#address-cells = <1>;
 			#size-cells = <0>;
 			reg = <1>;
@@ -161,7 +171,7 @@
 			};
 		};
 
-		i2c@2 {
+		imux18: i2c@2 {
 			#address-cells = <1>;
 			#size-cells = <0>;
 			reg = <2>;
@@ -172,7 +182,7 @@
 			};
 		};
 
-		i2c@3 {
+		imux19: i2c@3 {
 			#address-cells = <1>;
 			#size-cells = <0>;
 			reg = <3>;
@@ -183,7 +193,7 @@
 			};
 		};
 
-		i2c@4 {
+		imux20: i2c@4 {
 			#address-cells = <1>;
 			#size-cells = <0>;
 			reg = <4>;
@@ -194,7 +204,7 @@
 			};
 		};
 
-		i2c@5 {
+		imux21: i2c@5 {
 			#address-cells = <1>;
 			#size-cells = <0>;
 			reg = <5>;

From 165a1f2db3dd5cf7e7c6b65b0c0750412be75d5d Mon Sep 17 00:00:00 2001
From: Yang Chen <yangchen.openbmc@gmail.com>
Date: Tue, 12 Dec 2023 15:51:57 +0800
Subject: [PATCH 021/707] ARM: dts: aspeed: minerva: add fan rpm controller

Add fan rpm controller max31790 on all bus of FCB.

Signed-off-by: Yang Chen <yangchen.openbmc@gmail.com>
Reviewed-by: Joel Stanley <joel@jms.id.au>
Link: https://lore.kernel.org/r/20231212075200.983536-9-yangchen.openbmc@gmail.com
Signed-off-by: Joel Stanley <joel@jms.id.au>
---
 .../aspeed/aspeed-bmc-facebook-minerva.dts    | 42 +++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts
index f4cb5ef72310f9..c7445c819baf8e 100644
--- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts
+++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts
@@ -158,6 +158,13 @@
 				compatible = "atmel,24c128";
 				reg = <0x50>;
 			};
+
+			pwm@5e{
+				compatible = "max31790";
+				reg = <0x5e>;
+				#address-cells = <1>;
+				#size-cells = <0>;
+			};
 		};
 
 		imux17: i2c@1 {
@@ -169,6 +176,13 @@
 				compatible = "atmel,24c128";
 				reg = <0x50>;
 			};
+
+			pwm@5e{
+				compatible = "max31790";
+				reg = <0x5e>;
+				#address-cells = <1>;
+				#size-cells = <0>;
+			};
 		};
 
 		imux18: i2c@2 {
@@ -180,6 +194,13 @@
 				compatible = "atmel,24c128";
 				reg = <0x50>;
 			};
+
+			pwm@5e{
+				compatible = "max31790";
+				reg = <0x5e>;
+				#address-cells = <1>;
+				#size-cells = <0>;
+			};
 		};
 
 		imux19: i2c@3 {
@@ -191,6 +212,13 @@
 				compatible = "atmel,24c128";
 				reg = <0x50>;
 			};
+
+			pwm@5e{
+				compatible = "max31790";
+				reg = <0x5e>;
+				#address-cells = <1>;
+				#size-cells = <0>;
+			};
 		};
 
 		imux20: i2c@4 {
@@ -202,6 +230,13 @@
 				compatible = "atmel,24c128";
 				reg = <0x50>;
 			};
+
+			pwm@5e{
+				compatible = "max31790";
+				reg = <0x5e>;
+				#address-cells = <1>;
+				#size-cells = <0>;
+			};
 		};
 
 		imux21: i2c@5 {
@@ -213,6 +248,13 @@
 				compatible = "atmel,24c128";
 				reg = <0x50>;
 			};
+
+			pwm@5e{
+				compatible = "max31790";
+				reg = <0x5e>;
+				#address-cells = <1>;
+				#size-cells = <0>;
+			};
 		};
 	};
 };

From f5dac195b500520257bc608f22937b928c2d0145 Mon Sep 17 00:00:00 2001
From: Yang Chen <yangchen.openbmc@gmail.com>
Date: Tue, 12 Dec 2023 15:51:58 +0800
Subject: [PATCH 022/707] ARM: dts: aspeed: minerva: Add led-fan-fault gpio

Add led-fan-fault gpio pin on the PCA9555 on the i2c bus 0.

Signed-off-by: Yang Chen <yangchen.openbmc@gmail.com>
Reviewed-by: Joel Stanley <joel@jms.id.au>
Link: https://lore.kernel.org/r/20231212075200.983536-10-yangchen.openbmc@gmail.com
Signed-off-by: Joel Stanley <joel@jms.id.au>
---
 .../dts/aspeed/aspeed-bmc-facebook-minerva.dts  | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts
index c7445c819baf8e..090fe2f6b1d897 100644
--- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts
+++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts
@@ -39,6 +39,16 @@
 			<&adc0 4>, <&adc0 5>, <&adc0 6>, <&adc0 7>,
 			<&adc1 2>;
 	};
+
+	leds {
+		compatible = "gpio-leds";
+
+		led-fan-fault {
+			label = "led-fan-fault";
+			gpios = <&leds_gpio 9 GPIO_ACTIVE_HIGH>;
+			default-state = "off";
+		};
+	};
 };
 
 &uart6 {
@@ -118,6 +128,13 @@
 		compatible = "adi,ltc2945";
 		reg = <0x68>;
 	};
+
+	leds_gpio: gpio@19 {
+		compatible = "nxp,pca9555";
+		reg = <0x19>;
+		gpio-controller;
+		#gpio-cells = <2>;
+	};
 };
 
 &i2c1 {

From 7a7ed4a02a945c3fa8689c67711ed1aa34aa2415 Mon Sep 17 00:00:00 2001
From: Yang Chen <yangchen.openbmc@gmail.com>
Date: Tue, 12 Dec 2023 15:51:59 +0800
Subject: [PATCH 023/707] ARM: dts: aspeed: minerva: add gpio line name

Add the GPIO line name that the project's function can use by the
meaningful name.

Signed-off-by: Yang Chen <yangchen.openbmc@gmail.com>
Reviewed-by: Joel Stanley <joel@jms.id.au>
Link: https://lore.kernel.org/r/20231212075200.983536-11-yangchen.openbmc@gmail.com
Signed-off-by: Joel Stanley <joel@jms.id.au>
---
 .../aspeed/aspeed-bmc-facebook-minerva.dts    | 30 +++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts
index 090fe2f6b1d897..31197183cc59e5 100644
--- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts
+++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts
@@ -362,3 +362,33 @@
 &uhci {
 	status = "okay";
 };
+
+&gpio0 {
+	gpio-line-names =
+	/*A0-A7*/	"","","","","","","","",
+	/*B0-B7*/	"","","","","","","","",
+	/*C0-C7*/	"","","","","BLADE_UART_SEL2","","","",
+	/*D0-D7*/	"","","","","","","","",
+	/*E0-E7*/	"","","","","","","","",
+	/*F0-F7*/	"","","","","","","","",
+	/*G0-G7*/	"","","","","","","","",
+	/*H0-H7*/	"","","","","","","","",
+	/*I0-I7*/	"","","","","","","","",
+	/*J0-J7*/	"","","","","","","","",
+	/*K0-K7*/	"","","","","","","","",
+	/*L0-L7*/	"","","","","BLADE_UART_SEL0","","","",
+	/*M0-M7*/	"","","","","","BLADE_UART_SEL1","","",
+	/*N0-N7*/	"","","","","","","","",
+	/*O0-O7*/	"","","","","","","","",
+	/*P0-P7*/	"","","","","","","","",
+	/*Q0-Q7*/	"","","","","","","","",
+	/*R0-R7*/	"","","","","","","","",
+	/*S0-S7*/	"","","","","","","","",
+	/*T0-T7*/	"","","","","","","","",
+	/*U0-U7*/	"","","","","","","","",
+	/*V0-V7*/	"","","","","BAT_DETECT","","","",
+	/*W0-W7*/	"","","","","","","","",
+	/*X0-X7*/	"","","BLADE_UART_SEL3","","","","","",
+	/*Y0-Y7*/	"","","","","","","","",
+	/*Z0-Z7*/	"","","","","","","","";
+};

From e60f7a99d3789b5d0b24d3c0571b013309e56815 Mon Sep 17 00:00:00 2001
From: Yang Chen <yangchen.openbmc@gmail.com>
Date: Tue, 12 Dec 2023 15:52:00 +0800
Subject: [PATCH 024/707] ARM: dts: aspeed: minerva: add sgpio line name

Add the SGPIO line name that the project's function can use by the
meaningful name.

Signed-off-by: Yang Chen <yangchen.openbmc@gmail.com>
Reviewed-by: Joel Stanley <joel@jms.id.au>
Link: https://lore.kernel.org/r/20231212075200.983536-12-yangchen.openbmc@gmail.com
Signed-off-by: Joel Stanley <joel@jms.id.au>
---
 .../aspeed/aspeed-bmc-facebook-minerva.dts    | 149 ++++++++++++++++++
 1 file changed, 149 insertions(+)

diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts
index 31197183cc59e5..942e53d5c71409 100644
--- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts
+++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts
@@ -392,3 +392,152 @@
 	/*Y0-Y7*/	"","","","","","","","",
 	/*Z0-Z7*/	"","","","","","","","";
 };
+
+&sgpiom0 {
+	gpio-line-names =
+	/*"input pin","output pin"*/
+	/*A0 - A7*/
+	"PRSNT_MTIA_BLADE0_N","PWREN_MTIA_BLADE0_EN",
+	"PRSNT_MTIA_BLADE1_N","PWREN_MTIA_BLADE1_EN",
+	"PRSNT_MTIA_BLADE2_N","PWREN_MTIA_BLADE2_EN",
+	"PRSNT_MTIA_BLADE3_N","PWREN_MTIA_BLADE3_EN",
+	"PRSNT_MTIA_BLADE4_N","PWREN_MTIA_BLADE4_EN",
+	"PRSNT_MTIA_BLADE5_N","PWREN_MTIA_BLADE5_EN",
+	"PRSNT_MTIA_BLADE6_N","PWREN_MTIA_BLADE6_EN",
+	"PRSNT_MTIA_BLADE7_N","PWREN_MTIA_BLADE7_EN",
+	/*B0 - B7*/
+	"PRSNT_MTIA_BLADE8_N","PWREN_MTIA_BLADE8_EN",
+	"PRSNT_MTIA_BLADE9_N","PWREN_MTIA_BLADE9_EN",
+	"PRSNT_MTIA_BLADE10_N","PWREN_MTIA_BLADE10_EN",
+	"PRSNT_MTIA_BLADE11_N","PWREN_MTIA_BLADE11_EN",
+	"PRSNT_MTIA_BLADE12_N","PWREN_MTIA_BLADE12_EN",
+	"PRSNT_MTIA_BLADE13_N","PWREN_MTIA_BLADE13_EN",
+	"PRSNT_MTIA_BLADE14_N","PWREN_MTIA_BLADE14_EN",
+	"PRSNT_MTIA_BLADE15_N","PWREN_MTIA_BLADE15_EN",
+	/*C0 - C7*/
+	"PRSNT_NW_BLADE0_N","PWREN_NW_BLADE0_EN",
+	"PRSNT_NW_BLADE1_N","PWREN_NW_BLADE1_EN",
+	"PRSNT_NW_BLADE2_N","PWREN_NW_BLADE2_EN",
+	"PRSNT_NW_BLADE3_N","PWREN_NW_BLADE3_EN",
+	"PRSNT_NW_BLADE4_N","PWREN_NW_BLADE4_EN",
+	"PRSNT_NW_BLADE5_N","PWREN_NW_BLADE5_EN",
+	"PRSNT_FCB_TOP_0_N","PWREN_MTIA_BLADE0_HSC_EN",
+	"PRSNT_FCB_TOP_1_N","PWREN_MTIA_BLADE1_HSC_EN",
+	/*D0 - D7*/
+	"PRSNT_FCB_MIDDLE_0_N","PWREN_MTIA_BLADE2_HSC_EN",
+	"PRSNT_FCB_MIDDLE_1_N","PWREN_MTIA_BLADE3_HSC_EN",
+	"PRSNT_FCB_BOTTOM_0_N","PWREN_MTIA_BLADE4_HSC_EN",
+	"PRSNT_FCB_BOTTOM_1_N","PWREN_MTIA_BLADE5_HSC_EN",
+	"PWRGD_MTIA_BLADE0_PWROK_L_BUF","PWREN_MTIA_BLADE6_HSC_EN",
+	"PWRGD_MTIA_BLADE1_PWROK_L_BUF","PWREN_MTIA_BLADE7_HSC_EN",
+	"PWRGD_MTIA_BLADE2_PWROK_L_BUF","PWREN_MTIA_BLADE8_HSC_EN",
+	"PWRGD_MTIA_BLADE3_PWROK_L_BUF","PWREN_MTIA_BLADE9_HSC_EN",
+	/*E0 - E7*/
+	"PWRGD_MTIA_BLADE4_PWROK_L_BUF","PWREN_MTIA_BLADE10_HSC_EN",
+	"PWRGD_MTIA_BLADE5_PWROK_L_BUF","PWREN_MTIA_BLADE11_HSC_EN",
+	"PWRGD_MTIA_BLADE6_PWROK_L_BUF","PWREN_MTIA_BLADE12_HSC_EN",
+	"PWRGD_MTIA_BLADE7_PWROK_L_BUF","PWREN_MTIA_BLADE13_HSC_EN",
+	"PWRGD_MTIA_BLADE8_PWROK_L_BUF","PWREN_MTIA_BLADE14_HSC_EN",
+	"PWRGD_MTIA_BLADE9_PWROK_L_BUF","PWREN_MTIA_BLADE15_HSC_EN",
+	"PWRGD_MTIA_BLADE10_PWROK_L_BUF","PWREN_NW_BLADE0_HSC_EN",
+	"PWRGD_MTIA_BLADE11_PWROK_L_BUF","PWREN_NW_BLADE1_HSC_EN",
+	/*F0 - F7*/
+	"PWRGD_MTIA_BLADE12_PWROK_L_BUF","PWREN_NW_BLADE2_HSC_EN",
+	"PWRGD_MTIA_BLADE13_PWROK_L_BUF","PWREN_NW_BLADE3_HSC_EN",
+	"PWRGD_MTIA_BLADE14_PWROK_L_BUF","PWREN_NW_BLADE4_HSC_EN",
+	"PWRGD_MTIA_BLADE15_PWROK_L_BUF","PWREN_NW_BLADE5_HSC_EN",
+	"PWRGD_NW_BLADE0_PWROK_L_BUF","PWREN_FCB_TOP_L_EN",
+	"PWRGD_NW_BLADE1_PWROK_L_BUF","PWREN_FCB_TOP_R_EN",
+	"PWRGD_NW_BLADE2_PWROK_L_BUF","PWREN_FCB_MIDDLE_L_EN",
+	"PWRGD_NW_BLADE3_PWROK_L_BUF","PWREN_FCB_MIDDLE_R_EN",
+	/*G0 - G7*/
+	"PWRGD_NW_BLADE4_PWROK_L_BUF","PWREN_FCB_BOTTOM_L_EN",
+	"PWRGD_NW_BLADE5_PWROK_L_BUF","PWREN_FCB_BOTTOM_R_EN",
+	"PWRGD_FCB_TOP_0_PWROK_L_BUF","FM_CMM_AC_CYCLE_N",
+	"PWRGD_FCB_TOP_1_PWROK_L_BUF","MGMT_SFP_TX_DIS",
+	"PWRGD_FCB_MIDDLE_0_PWROK_L_BUF","",
+	"PWRGD_FCB_MIDDLE_1_PWROK_L_BUF","RST_I2CRST_MTIA_BLADE0_1_N",
+	"PWRGD_FCB_BOTTOM_0_PWROK_L_BUF","RST_I2CRST_MTIA_BLADE2_3_N",
+	"PWRGD_FCB_BOTTOM_1_PWROK_L_BUF","RST_I2CRST_MTIA_BLADE4_5_N",
+	/*H0 - H7*/
+	"LEAK_DETECT_MTIA_BLADE0_N_BUF","RST_I2CRST_MTIA_BLADE6_7_N",
+	"LEAK_DETECT_MTIA_BLADE1_N_BUF","RST_I2CRST_MTIA_BLADE8_9_N",
+	"LEAK_DETECT_MTIA_BLADE2_N_BUF","RST_I2CRST_MTIA_BLADE10_11_N",
+	"LEAK_DETECT_MTIA_BLADE3_N_BUF","RST_I2CRST_MTIA_BLADE12_13_N",
+	"LEAK_DETECT_MTIA_BLADE4_N_BUF","RST_I2CRST_MTIA_BLADE14_15_N",
+	"LEAK_DETECT_MTIA_BLADE5_N_BUF","RST_I2CRST_NW_BLADE0_1_2_N",
+	"LEAK_DETECT_MTIA_BLADE6_N_BUF","RST_I2CRST_NW_BLADE3_4_5_N",
+	"LEAK_DETECT_MTIA_BLADE7_N_BUF","RST_I2CRST_FCB_N",
+	/*I0 - I7*/
+	"LEAK_DETECT_MTIA_BLADE8_N_BUF","RST_I2CRST_FCB_B_L_N",
+	"LEAK_DETECT_MTIA_BLADE9_N_BUF","RST_I2CRST_FCB_B_R_N",
+	"LEAK_DETECT_MTIA_BLADE10_N_BUF","RST_I2CRST_FCB_M_L_N",
+	"LEAK_DETECT_MTIA_BLADE11_N_BUF","RST_I2CRST_FCB_M_R_N",
+	"LEAK_DETECT_MTIA_BLADE12_N_BUF","RST_I2CRST_FCB_T_L_N",
+	"LEAK_DETECT_MTIA_BLADE13_N_BUF","RST_I2CRST_FCB_T_R_N",
+	"LEAK_DETECT_MTIA_BLADE14_N_BUF","BMC_READY",
+	"LEAK_DETECT_MTIA_BLADE15_N_BUF","wFM_88E6393X_BIN_UPDATE_EN_N",
+	/*J0 - J7*/
+	"LEAK_DETECT_NW_BLADE0_N_BUF","WATER_VALVE_CLOSED_N",
+	"LEAK_DETECT_NW_BLADE1_N_BUF","",
+	"LEAK_DETECT_NW_BLADE2_N_BUF","",
+	"LEAK_DETECT_NW_BLADE3_N_BUF","",
+	"LEAK_DETECT_NW_BLADE4_N_BUF","",
+	"LEAK_DETECT_NW_BLADE5_N_BUF","",
+	"MTIA_BLADE0_STATUS_LED","",
+	"MTIA_BLADE1_STATUS_LED","",
+	/*K0 - K7*/
+	"MTIA_BLADE2_STATUS_LED","",
+	"MTIA_BLADE3_STATUS_LED","",
+	"MTIA_BLADE4_STATUS_LED","",
+	"MTIA_BLADE5_STATUS_LED","",
+	"MTIA_BLADE6_STATUS_LED","",
+	"MTIA_BLADE7_STATUS_LED","",
+	"MTIA_BLADE8_STATUS_LED","",
+	"MTIA_BLADE9_STATUS_LED","",
+	/*L0 - L7*/
+	"MTIA_BLADE10_STATUS_LED","",
+	"MTIA_BLADE11_STATUS_LED","",
+	"MTIA_BLADE12_STATUS_LED","",
+	"MTIA_BLADE13_STATUS_LED","",
+	"MTIA_BLADE14_STATUS_LED","",
+	"MTIA_BLADE15_STATUS_LED","",
+	"NW_BLADE0_STATUS_LED","",
+	"NW_BLADE1_STATUS_LED","",
+	/*M0 - M7*/
+	"NW_BLADE2_STATUS_LED","",
+	"NW_BLADE3_STATUS_LED","",
+	"NW_BLADE4_STATUS_LED","",
+	"NW_BLADE5_STATUS_LED","",
+	"RPU_READY","",
+	"IT_GEAR_RPU_LINK_N","",
+	"IT_GEAR_LEAK","",
+	"WATER_VALVE_CLOSED_N","",
+	/*N0 - N7*/
+	"VALVE_STS0","",
+	"VALVE_STS1","",
+	"VALVE_STS2","",
+	"VALVE_STS3","",
+	"CR_TOGGLE_BOOT_BUF_N","",
+	"CMM_LC_RDY_LED_N","",
+	"CMM_LC_UNRDY_LED_N","",
+	"CMM_CABLE_CARTRIDGE_PRSNT_BOT_N","",
+	/*O0 - O7*/
+	"CMM_CABLE_CARTRIDGE_PRSNT_TOP_N","",
+	"BOT_BCB_CABLE_PRSNT_N","",
+	"TOP_BCB_CABLE_PRSNT_N","",
+	"CHASSIS0_LEAK_Q_N","",
+	"CHASSIS1_LEAK_Q_N","",
+	"LEAK0_DETECT","",
+	"LEAK1_DETECT","",
+	"MGMT_SFP_PRSNT_N","",
+	/*P0 - P7*/
+	"MGMT_SFP_TX_FAULT","",
+	"MGMT_SFP_RX_LOS","",
+	"","",
+	"","",
+	"","",
+	"","",
+	"","",
+	"","";
+};

From 4c855a0c7d3100a47640e5c58f39b4e5b4ce30d6 Mon Sep 17 00:00:00 2001
From: David Heidelberg <david@ixit.cz>
Date: Sat, 9 Dec 2023 18:15:41 +0100
Subject: [PATCH 025/707] ARM: tegra: Use gpio-fan matrix instead of array on
 Ouya

No functional changes.

Adjust to comply with dt-schema requirements and make possible to
validate values.

Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: David Heidelberg <david@ixit.cz>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 arch/arm/boot/dts/nvidia/tegra30-ouya.dts | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm/boot/dts/nvidia/tegra30-ouya.dts b/arch/arm/boot/dts/nvidia/tegra30-ouya.dts
index 7e3de26ca960d2..c284dd0a55ab31 100644
--- a/arch/arm/boot/dts/nvidia/tegra30-ouya.dts
+++ b/arch/arm/boot/dts/nvidia/tegra30-ouya.dts
@@ -4611,8 +4611,8 @@
 	fan: fan {
 		compatible = "gpio-fan";
 		gpios = <&gpio TEGRA_GPIO(J, 2) GPIO_ACTIVE_HIGH>;
-		gpio-fan,speed-map = <0    0
-				      4500 1>;
+		gpio-fan,speed-map = <0    0>,
+				     <4500 1>;
 		#cooling-cells = <2>;
 	};
 

From c4ae5addc4c53c504b1bdbd77ce72601ef51da8e Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sun, 12 Nov 2023 08:04:14 +0100
Subject: [PATCH 026/707] soc/tegra: pmc: Remove some old and deprecated
 functions and constants

These TEGRA_IO_RAIL_... functions and constants have been deprecated in
commit 21b499105178 ("soc/tegra: pmc: Add I/O pad voltage support") in
2016-11.

There seems to be no users since kernel 4.16.

Remove them now.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/soc/tegra/pmc.c | 24 ------------------------
 include/soc/tegra/pmc.h | 18 ------------------
 2 files changed, 42 deletions(-)

diff --git a/drivers/soc/tegra/pmc.c b/drivers/soc/tegra/pmc.c
index f432aa022ace06..6dfcc7f50ecea9 100644
--- a/drivers/soc/tegra/pmc.c
+++ b/drivers/soc/tegra/pmc.c
@@ -1777,30 +1777,6 @@ static int tegra_io_pad_get_voltage(struct tegra_pmc *pmc, enum tegra_io_pad id)
 	return TEGRA_IO_PAD_VOLTAGE_3V3;
 }
 
-/**
- * tegra_io_rail_power_on() - enable power to I/O rail
- * @id: Tegra I/O pad ID for which to enable power
- *
- * See also: tegra_io_pad_power_enable()
- */
-int tegra_io_rail_power_on(unsigned int id)
-{
-	return tegra_io_pad_power_enable(id);
-}
-EXPORT_SYMBOL(tegra_io_rail_power_on);
-
-/**
- * tegra_io_rail_power_off() - disable power to I/O rail
- * @id: Tegra I/O pad ID for which to disable power
- *
- * See also: tegra_io_pad_power_disable()
- */
-int tegra_io_rail_power_off(unsigned int id)
-{
-	return tegra_io_pad_power_disable(id);
-}
-EXPORT_SYMBOL(tegra_io_rail_power_off);
-
 #ifdef CONFIG_PM_SLEEP
 enum tegra_suspend_mode tegra_pmc_get_suspend_mode(void)
 {
diff --git a/include/soc/tegra/pmc.h b/include/soc/tegra/pmc.h
index aadb845d281dd7..c545875d0ff18e 100644
--- a/include/soc/tegra/pmc.h
+++ b/include/soc/tegra/pmc.h
@@ -148,10 +148,6 @@ enum tegra_io_pad {
 	TEGRA_IO_PAD_AO_HV,
 };
 
-/* deprecated, use TEGRA_IO_PAD_{HDMI,LVDS} instead */
-#define TEGRA_IO_RAIL_HDMI	TEGRA_IO_PAD_HDMI
-#define TEGRA_IO_RAIL_LVDS	TEGRA_IO_PAD_LVDS
-
 #ifdef CONFIG_SOC_TEGRA_PMC
 int tegra_powergate_power_on(unsigned int id);
 int tegra_powergate_power_off(unsigned int id);
@@ -164,10 +160,6 @@ int tegra_powergate_sequence_power_up(unsigned int id, struct clk *clk,
 int tegra_io_pad_power_enable(enum tegra_io_pad id);
 int tegra_io_pad_power_disable(enum tegra_io_pad id);
 
-/* deprecated, use tegra_io_pad_power_{enable,disable}() instead */
-int tegra_io_rail_power_on(unsigned int id);
-int tegra_io_rail_power_off(unsigned int id);
-
 void tegra_pmc_set_suspend_mode(enum tegra_suspend_mode mode);
 void tegra_pmc_enter_suspend_mode(enum tegra_suspend_mode mode);
 
@@ -211,16 +203,6 @@ static inline int tegra_io_pad_get_voltage(enum tegra_io_pad id)
 	return -ENOSYS;
 }
 
-static inline int tegra_io_rail_power_on(unsigned int id)
-{
-	return -ENOSYS;
-}
-
-static inline int tegra_io_rail_power_off(unsigned int id)
-{
-	return -ENOSYS;
-}
-
 static inline void tegra_pmc_set_suspend_mode(enum tegra_suspend_mode mode)
 {
 }

From 29939185ad8939050e5d7c85939af9c2255aeea6 Mon Sep 17 00:00:00 2001
From: Kartik <kkartik@nvidia.com>
Date: Tue, 17 Oct 2023 10:53:15 +0530
Subject: [PATCH 027/707] mm/util: Introduce kmemdup_array()

Introduce kmemdup_array() API to duplicate `n` number of elements
from a given array. This internally uses kmemdup to allocate and duplicate
the `src` array.

Signed-off-by: Kartik <kkartik@nvidia.com>
Acked-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 include/linux/string.h |  1 +
 mm/util.c              | 17 +++++++++++++++++
 2 files changed, 18 insertions(+)

diff --git a/include/linux/string.h b/include/linux/string.h
index ce137830a0b99c..433c207a01daa6 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -220,6 +220,7 @@ extern char *kstrndup(const char *s, size_t len, gfp_t gfp);
 extern void *kmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2);
 extern void *kvmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2);
 extern char *kmemdup_nul(const char *s, size_t len, gfp_t gfp);
+extern void *kmemdup_array(const void *src, size_t element_size, size_t count, gfp_t gfp);
 
 extern char **argv_split(gfp_t gfp, const char *str, int *argcp);
 extern void argv_free(char **argv);
diff --git a/mm/util.c b/mm/util.c
index aa01f6ea5a75b7..8d6d783520b946 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -135,6 +135,23 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp)
 }
 EXPORT_SYMBOL(kmemdup);
 
+/**
+ * kmemdup_array - duplicate a given array.
+ *
+ * @src: array to duplicate.
+ * @element_size: size of each element of array.
+ * @count: number of elements to duplicate from array.
+ * @gfp: GFP mask to use.
+ *
+ * Return: duplicated array of @src or %NULL in case of error,
+ * result is physically contiguous. Use kfree() to free.
+ */
+void *kmemdup_array(const void *src, size_t element_size, size_t count, gfp_t gfp)
+{
+	return kmemdup(src, size_mul(element_size, count), gfp);
+}
+EXPORT_SYMBOL(kmemdup_array);
+
 /**
  * kvmemdup - duplicate region of memory
  *

From c89a004b57ce1dbb0a906c8e8c6fe8f4bfae3ab1 Mon Sep 17 00:00:00 2001
From: Kartik <kkartik@nvidia.com>
Date: Tue, 17 Oct 2023 10:53:16 +0530
Subject: [PATCH 028/707] soc/tegra: fuse: Use dev_err_probe for probe failures

Currently, in tegra_fuse_probe() if clock/reset get fails, then the
driver prints an error if the error is not caused by -EPROBE_DEFER.
This can be improved by using dev_err_probe() instead.

So, return dev_err_probe() if clock/reset get fails.

Signed-off-by: Kartik <kkartik@nvidia.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/soc/tegra/fuse/fuse-tegra.c | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/drivers/soc/tegra/fuse/fuse-tegra.c b/drivers/soc/tegra/fuse/fuse-tegra.c
index a2c28f493a75e5..98805885158e9d 100644
--- a/drivers/soc/tegra/fuse/fuse-tegra.c
+++ b/drivers/soc/tegra/fuse/fuse-tegra.c
@@ -131,13 +131,8 @@ static int tegra_fuse_probe(struct platform_device *pdev)
 	fuse->phys = res->start;
 
 	fuse->clk = devm_clk_get(&pdev->dev, "fuse");
-	if (IS_ERR(fuse->clk)) {
-		if (PTR_ERR(fuse->clk) != -EPROBE_DEFER)
-			dev_err(&pdev->dev, "failed to get FUSE clock: %ld",
-				PTR_ERR(fuse->clk));
-
-		return PTR_ERR(fuse->clk);
-	}
+	if (IS_ERR(fuse->clk))
+		return dev_err_probe(&pdev->dev, PTR_ERR(fuse->clk), "failed to get FUSE clock\n");
 
 	platform_set_drvdata(pdev, fuse);
 	fuse->dev = &pdev->dev;
@@ -179,12 +174,8 @@ static int tegra_fuse_probe(struct platform_device *pdev)
 	}
 
 	fuse->rst = devm_reset_control_get_optional(&pdev->dev, "fuse");
-	if (IS_ERR(fuse->rst)) {
-		err = PTR_ERR(fuse->rst);
-		dev_err(&pdev->dev, "failed to get FUSE reset: %pe\n",
-			fuse->rst);
-		return err;
-	}
+	if (IS_ERR(fuse->rst))
+		return dev_err_probe(&pdev->dev, PTR_ERR(fuse->rst), "failed to get FUSE reset\n");
 
 	/*
 	 * FUSE clock is enabled at a boot time, hence this resume/suspend

From 650324faffbe3f5342de1df0979258059dd881ed Mon Sep 17 00:00:00 2001
From: Kartik <kkartik@nvidia.com>
Date: Tue, 17 Oct 2023 10:53:17 +0530
Subject: [PATCH 029/707] soc/tegra: fuse: Refactor resource mapping
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To prepare for adding ACPI support to the tegra-apbmisc driver,
relocate the code responsible for mapping memory resources from
the function ‘tegra_init_apbmisc’ to the function
‘tegra_init_apbmisc_resources.’ This adjustment will allow the
code to be shared between ‘tegra_init_apbmisc’ and the upcoming
‘tegra_acpi_init_apbmisc’ function.

Signed-off-by: Kartik <kkartik@nvidia.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/soc/tegra/fuse/tegra-apbmisc.c | 37 +++++++++++++++-----------
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/drivers/soc/tegra/fuse/tegra-apbmisc.c b/drivers/soc/tegra/fuse/tegra-apbmisc.c
index da970f3dbf3562..06c1b3a2c7eccc 100644
--- a/drivers/soc/tegra/fuse/tegra-apbmisc.c
+++ b/drivers/soc/tegra/fuse/tegra-apbmisc.c
@@ -160,9 +160,28 @@ void __init tegra_init_revision(void)
 	tegra_sku_info.platform = tegra_get_platform();
 }
 
-void __init tegra_init_apbmisc(void)
+static void tegra_init_apbmisc_resources(struct resource *apbmisc,
+					 struct resource *straps)
 {
 	void __iomem *strapping_base;
+
+	apbmisc_base = ioremap(apbmisc->start, resource_size(apbmisc));
+	if (apbmisc_base)
+		chipid = readl_relaxed(apbmisc_base + 4);
+	else
+		pr_err("failed to map APBMISC registers\n");
+
+	strapping_base = ioremap(straps->start, resource_size(straps));
+	if (strapping_base) {
+		strapping = readl_relaxed(strapping_base);
+		iounmap(strapping_base);
+	} else {
+		pr_err("failed to map strapping options registers\n");
+	}
+}
+
+void __init tegra_init_apbmisc(void)
+{
 	struct resource apbmisc, straps;
 	struct device_node *np;
 
@@ -219,21 +238,7 @@ void __init tegra_init_apbmisc(void)
 		}
 	}
 
-	apbmisc_base = ioremap(apbmisc.start, resource_size(&apbmisc));
-	if (!apbmisc_base) {
-		pr_err("failed to map APBMISC registers\n");
-	} else {
-		chipid = readl_relaxed(apbmisc_base + 4);
-	}
-
-	strapping_base = ioremap(straps.start, resource_size(&straps));
-	if (!strapping_base) {
-		pr_err("failed to map strapping options registers\n");
-	} else {
-		strapping = readl_relaxed(strapping_base);
-		iounmap(strapping_base);
-	}
-
+	tegra_init_apbmisc_resources(&apbmisc, &straps);
 	long_ram_code = of_property_read_bool(np, "nvidia,long-ram-code");
 
 put:

From ca7b63e8b99c647788375b5e8668424c5096cb50 Mon Sep 17 00:00:00 2001
From: Kartik <kkartik@nvidia.com>
Date: Tue, 17 Oct 2023 10:53:18 +0530
Subject: [PATCH 030/707] soc/tegra: fuse: Add tegra_acpi_init_apbmisc()

In preparation to ACPI support in Tegra fuse driver add function
tegra_acpi_init_apbmisc() to initialize tegra-apbmisc driver.
Also, document the reason of calling tegra_init_apbmisc() at early init.

Note that function tegra_acpi_init_apbmisc() is not placed in the __init
section, because it will be called during probe.

Signed-off-by: Kartik <kkartik@nvidia.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/soc/tegra/fuse/fuse.h          |  1 +
 drivers/soc/tegra/fuse/tegra-apbmisc.c | 72 ++++++++++++++++++++++++++
 2 files changed, 73 insertions(+)

diff --git a/drivers/soc/tegra/fuse/fuse.h b/drivers/soc/tegra/fuse/fuse.h
index 90f23be738947a..a41e9f85281aa8 100644
--- a/drivers/soc/tegra/fuse/fuse.h
+++ b/drivers/soc/tegra/fuse/fuse.h
@@ -69,6 +69,7 @@ struct tegra_fuse {
 
 void tegra_init_revision(void);
 void tegra_init_apbmisc(void);
+void tegra_acpi_init_apbmisc(void);
 
 u32 __init tegra_fuse_read_spare(unsigned int spare);
 u32 __init tegra_fuse_read_early(unsigned int offset);
diff --git a/drivers/soc/tegra/fuse/tegra-apbmisc.c b/drivers/soc/tegra/fuse/tegra-apbmisc.c
index 06c1b3a2c7eccc..6457f80821bb95 100644
--- a/drivers/soc/tegra/fuse/tegra-apbmisc.c
+++ b/drivers/soc/tegra/fuse/tegra-apbmisc.c
@@ -3,9 +3,11 @@
  * Copyright (c) 2014-2023, NVIDIA CORPORATION.  All rights reserved.
  */
 
+#include <linux/acpi.h>
 #include <linux/export.h>
 #include <linux/io.h>
 #include <linux/kernel.h>
+#include <linux/mod_devicetable.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
 
@@ -180,6 +182,12 @@ static void tegra_init_apbmisc_resources(struct resource *apbmisc,
 	}
 }
 
+/**
+ * tegra_init_apbmisc - Initializes Tegra APBMISC and Strapping registers.
+ *
+ * This is called during early init as some of the old 32-bit ARM code needs
+ * information from the APBMISC registers very early during boot.
+ */
 void __init tegra_init_apbmisc(void)
 {
 	struct resource apbmisc, straps;
@@ -244,3 +252,67 @@ void __init tegra_init_apbmisc(void)
 put:
 	of_node_put(np);
 }
+
+#ifdef CONFIG_ACPI
+static const struct acpi_device_id apbmisc_acpi_match[] = {
+	{ "NVDA2010" },
+	{ /* sentinel */ }
+};
+
+void tegra_acpi_init_apbmisc(void)
+{
+	struct resource *resources[2] = { NULL };
+	struct resource_entry *rentry;
+	struct acpi_device *adev = NULL;
+	struct list_head resource_list;
+	int rcount = 0;
+	int ret;
+
+	adev = acpi_dev_get_first_match_dev(apbmisc_acpi_match[0].id, NULL, -1);
+	if (!adev)
+		return;
+
+	INIT_LIST_HEAD(&resource_list);
+
+	ret = acpi_dev_get_memory_resources(adev, &resource_list);
+	if (ret < 0) {
+		pr_err("failed to get APBMISC memory resources");
+		goto out_put_acpi_dev;
+	}
+
+	/*
+	 * Get required memory resources.
+	 *
+	 * resources[0]: apbmisc.
+	 * resources[1]: straps.
+	 */
+	resource_list_for_each_entry(rentry, &resource_list) {
+		if (rcount >= ARRAY_SIZE(resources))
+			break;
+
+		resources[rcount++] = rentry->res;
+	}
+
+	if (!resources[0]) {
+		pr_err("failed to get APBMISC registers\n");
+		goto out_free_resource_list;
+	}
+
+	if (!resources[1]) {
+		pr_err("failed to get strapping options registers\n");
+		goto out_free_resource_list;
+	}
+
+	tegra_init_apbmisc_resources(resources[0], resources[1]);
+
+out_free_resource_list:
+	acpi_dev_free_resource_list(&resource_list);
+
+out_put_acpi_dev:
+	acpi_dev_put(adev);
+}
+#else
+void tegra_acpi_init_apbmisc(void)
+{
+}
+#endif

From 8a2ac683cbf4d11a12f6bb67b2afc9109078568a Mon Sep 17 00:00:00 2001
From: Kartik <kkartik@nvidia.com>
Date: Tue, 17 Oct 2023 10:53:19 +0530
Subject: [PATCH 031/707] soc/tegra: fuse: Add function to add lookups

Add helper function tegra_fuse_add_lookups() to register Tegra fuse
nvmem lookups. So, this can be shared between tegra_fuse_init() and
ACPI probe, which is to be introduced later.

Use kmemdup_array to duplicate fuse->soc->lookups.

Signed-off-by: Kartik <kkartik@nvidia.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/soc/tegra/fuse/fuse-tegra.c | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/drivers/soc/tegra/fuse/fuse-tegra.c b/drivers/soc/tegra/fuse/fuse-tegra.c
index 98805885158e9d..4ebb5597a77b7e 100644
--- a/drivers/soc/tegra/fuse/fuse-tegra.c
+++ b/drivers/soc/tegra/fuse/fuse-tegra.c
@@ -113,6 +113,18 @@ static void tegra_fuse_restore(void *base)
 	fuse->clk = NULL;
 }
 
+static int tegra_fuse_add_lookups(struct tegra_fuse *fuse)
+{
+	fuse->lookups = kmemdup_array(fuse->soc->lookups, sizeof(*fuse->lookups),
+				      fuse->soc->num_lookups, GFP_KERNEL);
+	if (!fuse->lookups)
+		return -ENOMEM;
+
+	nvmem_add_cell_lookups(fuse->lookups, fuse->soc->num_lookups);
+
+	return 0;
+}
+
 static int tegra_fuse_probe(struct platform_device *pdev)
 {
 	void __iomem *base = fuse->base;
@@ -398,6 +410,7 @@ static int __init tegra_init_fuse(void)
 	const struct of_device_id *match;
 	struct device_node *np;
 	struct resource regs;
+	int err;
 
 	tegra_init_apbmisc();
 
@@ -495,15 +508,11 @@ static int __init tegra_init_fuse(void)
 	pr_debug("Tegra CPU Speedo ID %d, SoC Speedo ID %d\n",
 		 tegra_sku_info.cpu_speedo_id, tegra_sku_info.soc_speedo_id);
 
-	if (fuse->soc->lookups) {
-		size_t size = sizeof(*fuse->lookups) * fuse->soc->num_lookups;
-
-		fuse->lookups = kmemdup(fuse->soc->lookups, size, GFP_KERNEL);
-		if (fuse->lookups)
-			nvmem_add_cell_lookups(fuse->lookups, fuse->soc->num_lookups);
-	}
+	err = tegra_fuse_add_lookups(fuse);
+	if (err)
+		pr_err("failed to add FUSE lookups\n");
 
-	return 0;
+	return err;
 }
 early_initcall(tegra_init_fuse);
 

From bf66ea5d1fe29acc681a3422f5d1967b7486acac Mon Sep 17 00:00:00 2001
From: Kartik <kkartik@nvidia.com>
Date: Tue, 17 Oct 2023 10:53:20 +0530
Subject: [PATCH 032/707] soc/tegra: fuse: Add function to print SKU info

Add helper function tegra_fuse_print_sku_info() to print Tegra SKU
information. So, it can be shared between tegra_fuse_init() and
ACPI probe which is to be introduced later.

Signed-off-by: Kartik <kkartik@nvidia.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/soc/tegra/fuse/fuse-tegra.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/drivers/soc/tegra/fuse/fuse-tegra.c b/drivers/soc/tegra/fuse/fuse-tegra.c
index 4ebb5597a77b7e..7a93c6512f7b53 100644
--- a/drivers/soc/tegra/fuse/fuse-tegra.c
+++ b/drivers/soc/tegra/fuse/fuse-tegra.c
@@ -113,6 +113,16 @@ static void tegra_fuse_restore(void *base)
 	fuse->clk = NULL;
 }
 
+static void tegra_fuse_print_sku_info(struct tegra_sku_info *tegra_sku_info)
+{
+	pr_info("Tegra Revision: %s SKU: %d CPU Process: %d SoC Process: %d\n",
+		tegra_revision_name[tegra_sku_info->revision],
+		tegra_sku_info->sku_id, tegra_sku_info->cpu_process_id,
+		tegra_sku_info->soc_process_id);
+	pr_debug("Tegra CPU Speedo ID %d, SoC Speedo ID %d\n",
+		tegra_sku_info->cpu_speedo_id, tegra_sku_info->soc_speedo_id);
+}
+
 static int tegra_fuse_add_lookups(struct tegra_fuse *fuse)
 {
 	fuse->lookups = kmemdup_array(fuse->soc->lookups, sizeof(*fuse->lookups),
@@ -501,12 +511,7 @@ static int __init tegra_init_fuse(void)
 
 	fuse->soc->init(fuse);
 
-	pr_info("Tegra Revision: %s SKU: %d CPU Process: %d SoC Process: %d\n",
-		tegra_revision_name[tegra_sku_info.revision],
-		tegra_sku_info.sku_id, tegra_sku_info.cpu_process_id,
-		tegra_sku_info.soc_process_id);
-	pr_debug("Tegra CPU Speedo ID %d, SoC Speedo ID %d\n",
-		 tegra_sku_info.cpu_speedo_id, tegra_sku_info.soc_speedo_id);
+	tegra_fuse_print_sku_info(&tegra_sku_info);
 
 	err = tegra_fuse_add_lookups(fuse);
 	if (err)

From c5b2d43e67bb8de33e0e8216d7703fdcd62227bf Mon Sep 17 00:00:00 2001
From: Kartik <kkartik@nvidia.com>
Date: Tue, 17 Oct 2023 10:53:21 +0530
Subject: [PATCH 033/707] soc/tegra: fuse: Add ACPI support for Tegra194 and
 Tegra234

Add ACPI support for Tegra194 & Tegra243 SoC's. This requires
following modifications to the probe when ACPI boot is used:
 - Initialize soc data.
 - Add nvmem lookups.
 - Register soc device.
 - use devm_clk_get_optional() instead of devm_clk_get() to get
   fuse->clk, as fuse clocks are not required when using ACPI boot.

Also, drop '__init' keyword for tegra_soc_device_register() as this is also
used by tegra_fuse_probe() and use dev_err_probe() wherever applicable.

Signed-off-by: Kartik <kkartik@nvidia.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/soc/tegra/fuse/fuse-tegra.c | 52 +++++++++++++++++++++++++++--
 1 file changed, 49 insertions(+), 3 deletions(-)

diff --git a/drivers/soc/tegra/fuse/fuse-tegra.c b/drivers/soc/tegra/fuse/fuse-tegra.c
index 7a93c6512f7b53..1c758f121f916c 100644
--- a/drivers/soc/tegra/fuse/fuse-tegra.c
+++ b/drivers/soc/tegra/fuse/fuse-tegra.c
@@ -3,11 +3,13 @@
  * Copyright (c) 2013-2023, NVIDIA CORPORATION.  All rights reserved.
  */
 
+#include <linux/acpi.h>
 #include <linux/clk.h>
 #include <linux/device.h>
 #include <linux/kobject.h>
 #include <linux/init.h>
 #include <linux/io.h>
+#include <linux/mod_devicetable.h>
 #include <linux/nvmem-consumer.h>
 #include <linux/nvmem-provider.h>
 #include <linux/of.h>
@@ -152,7 +154,38 @@ static int tegra_fuse_probe(struct platform_device *pdev)
 		return PTR_ERR(fuse->base);
 	fuse->phys = res->start;
 
-	fuse->clk = devm_clk_get(&pdev->dev, "fuse");
+	/* Initialize the soc data and lookups if using ACPI boot. */
+	if (is_acpi_node(dev_fwnode(&pdev->dev)) && !fuse->soc) {
+		u8 chip;
+
+		tegra_acpi_init_apbmisc();
+
+		chip = tegra_get_chip_id();
+		switch (chip) {
+#if defined(CONFIG_ARCH_TEGRA_194_SOC)
+		case TEGRA194:
+			fuse->soc = &tegra194_fuse_soc;
+			break;
+#endif
+#if defined(CONFIG_ARCH_TEGRA_234_SOC)
+		case TEGRA234:
+			fuse->soc = &tegra234_fuse_soc;
+			break;
+#endif
+		default:
+			return dev_err_probe(&pdev->dev, -EINVAL, "Unsupported SoC: %02x\n", chip);
+		}
+
+		fuse->soc->init(fuse);
+		tegra_fuse_print_sku_info(&tegra_sku_info);
+		tegra_soc_device_register();
+
+		err = tegra_fuse_add_lookups(fuse);
+		if (err)
+			return dev_err_probe(&pdev->dev, err, "failed to add FUSE lookups\n");
+	}
+
+	fuse->clk = devm_clk_get_optional(&pdev->dev, "fuse");
 	if (IS_ERR(fuse->clk))
 		return dev_err_probe(&pdev->dev, PTR_ERR(fuse->clk), "failed to get FUSE clock\n");
 
@@ -275,10 +308,17 @@ static const struct dev_pm_ops tegra_fuse_pm = {
 	SET_SYSTEM_SLEEP_PM_OPS(tegra_fuse_suspend, tegra_fuse_resume)
 };
 
+static const struct acpi_device_id tegra_fuse_acpi_match[] = {
+	{ "NVDA200F" },
+	{ /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(acpi, tegra_fuse_acpi_match);
+
 static struct platform_driver tegra_fuse_driver = {
 	.driver = {
 		.name = "tegra-fuse",
 		.of_match_table = tegra_fuse_match,
+		.acpi_match_table = tegra_fuse_acpi_match,
 		.pm = &tegra_fuse_pm,
 		.suppress_bind_attrs = true,
 	},
@@ -300,7 +340,13 @@ u32 __init tegra_fuse_read_early(unsigned int offset)
 
 int tegra_fuse_readl(unsigned long offset, u32 *value)
 {
-	if (!fuse->read || !fuse->clk)
+	/*
+	 * Wait for fuse->clk to be initialized if device-tree boot is used.
+	 */
+	if (is_of_node(dev_fwnode(fuse->dev)) && !fuse->clk)
+		return -EPROBE_DEFER;
+
+	if (!fuse->read)
 		return -EPROBE_DEFER;
 
 	if (IS_ERR(fuse->clk))
@@ -383,7 +429,7 @@ const struct attribute_group tegra194_soc_attr_group = {
 };
 #endif
 
-struct device * __init tegra_soc_device_register(void)
+struct device *tegra_soc_device_register(void)
 {
 	struct soc_device_attribute *attr;
 	struct soc_device *dev;

From dee509eb9cd593b7bcb1c1f1f5f2d7e75e389290 Mon Sep 17 00:00:00 2001
From: Kartik <kkartik@nvidia.com>
Date: Tue, 17 Oct 2023 10:53:22 +0530
Subject: [PATCH 034/707] soc/tegra: fuse: Add support for Tegra241

Add support for Tegra241 which use ACPI boot.

Signed-off-by: Kartik <kkartik@nvidia.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/soc/tegra/Kconfig              |  5 +++++
 drivers/soc/tegra/fuse/fuse-tegra.c    |  5 +++++
 drivers/soc/tegra/fuse/fuse-tegra30.c  | 20 ++++++++++++++++++++
 drivers/soc/tegra/fuse/fuse.h          |  4 ++++
 drivers/soc/tegra/fuse/tegra-apbmisc.c |  1 +
 include/soc/tegra/fuse.h               |  1 +
 6 files changed, 36 insertions(+)

diff --git a/drivers/soc/tegra/Kconfig b/drivers/soc/tegra/Kconfig
index f16beeabaa92bb..33512558af9f7f 100644
--- a/drivers/soc/tegra/Kconfig
+++ b/drivers/soc/tegra/Kconfig
@@ -133,6 +133,11 @@ config ARCH_TEGRA_234_SOC
 	help
 	  Enable support for the NVIDIA Tegra234 SoC.
 
+config ARCH_TEGRA_241_SOC
+	bool "NVIDIA Tegra241 SoC"
+	help
+	  Enable support for the NVIDIA Tegra241 SoC.
+
 endif
 endif
 
diff --git a/drivers/soc/tegra/fuse/fuse-tegra.c b/drivers/soc/tegra/fuse/fuse-tegra.c
index 1c758f121f916c..233b8e7bb41bf9 100644
--- a/drivers/soc/tegra/fuse/fuse-tegra.c
+++ b/drivers/soc/tegra/fuse/fuse-tegra.c
@@ -171,6 +171,11 @@ static int tegra_fuse_probe(struct platform_device *pdev)
 		case TEGRA234:
 			fuse->soc = &tegra234_fuse_soc;
 			break;
+#endif
+#if defined(CONFIG_ARCH_TEGRA_241_SOC)
+		case TEGRA241:
+			fuse->soc = &tegra241_fuse_soc;
+			break;
 #endif
 		default:
 			return dev_err_probe(&pdev->dev, -EINVAL, "Unsupported SoC: %02x\n", chip);
diff --git a/drivers/soc/tegra/fuse/fuse-tegra30.c b/drivers/soc/tegra/fuse/fuse-tegra30.c
index e94d46372a6396..2070d36c510dcb 100644
--- a/drivers/soc/tegra/fuse/fuse-tegra30.c
+++ b/drivers/soc/tegra/fuse/fuse-tegra30.c
@@ -678,3 +678,23 @@ const struct tegra_fuse_soc tegra234_fuse_soc = {
 	.clk_suspend_on = false,
 };
 #endif
+
+#if defined(CONFIG_ARCH_TEGRA_241_SOC)
+static const struct tegra_fuse_info tegra241_fuse_info = {
+	.read = tegra30_fuse_read,
+	.size = 0x16008,
+	.spare = 0xcf0,
+};
+
+static const struct nvmem_keepout tegra241_fuse_keepouts[] = {
+	{ .start = 0xc, .end = 0x1600c }
+};
+
+const struct tegra_fuse_soc tegra241_fuse_soc = {
+	.init = tegra30_fuse_init,
+	.info = &tegra241_fuse_info,
+	.keepouts = tegra241_fuse_keepouts,
+	.num_keepouts = ARRAY_SIZE(tegra241_fuse_keepouts),
+	.soc_attr_group = &tegra194_soc_attr_group,
+};
+#endif
diff --git a/drivers/soc/tegra/fuse/fuse.h b/drivers/soc/tegra/fuse/fuse.h
index a41e9f85281aa8..f3b705327c20f8 100644
--- a/drivers/soc/tegra/fuse/fuse.h
+++ b/drivers/soc/tegra/fuse/fuse.h
@@ -136,4 +136,8 @@ extern const struct tegra_fuse_soc tegra194_fuse_soc;
 extern const struct tegra_fuse_soc tegra234_fuse_soc;
 #endif
 
+#ifdef CONFIG_ARCH_TEGRA_241_SOC
+extern const struct tegra_fuse_soc tegra241_fuse_soc;
+#endif
+
 #endif
diff --git a/drivers/soc/tegra/fuse/tegra-apbmisc.c b/drivers/soc/tegra/fuse/tegra-apbmisc.c
index 6457f80821bb95..e2ca5d55fd3125 100644
--- a/drivers/soc/tegra/fuse/tegra-apbmisc.c
+++ b/drivers/soc/tegra/fuse/tegra-apbmisc.c
@@ -64,6 +64,7 @@ bool tegra_is_silicon(void)
 	switch (tegra_get_chip_id()) {
 	case TEGRA194:
 	case TEGRA234:
+	case TEGRA241:
 	case TEGRA264:
 		if (tegra_get_platform() == 0)
 			return true;
diff --git a/include/soc/tegra/fuse.h b/include/soc/tegra/fuse.h
index 3a513be502437f..8f421b9f7585ca 100644
--- a/include/soc/tegra/fuse.h
+++ b/include/soc/tegra/fuse.h
@@ -17,6 +17,7 @@
 #define TEGRA186	0x18
 #define TEGRA194	0x19
 #define TEGRA234	0x23
+#define TEGRA241	0x24
 #define TEGRA264	0x26
 
 #define TEGRA_FUSE_SKU_CALIB_0	0xf0

From 58d4cb6b73be15a9119b24d5d8f581eaa0cc9ff0 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 15 Dec 2023 10:01:44 -0500
Subject: [PATCH 035/707] btrfs: do not allow non subvolume root targets for
 snapshot

Our btrfs subvolume snapshot <source> <destination> utility enforces
that <source> is the root of the subvolume, however this isn't enforced
in the kernel.  Update the kernel to also enforce this limitation to
avoid problems with other users of this ioctl that don't have the
appropriate checks in place.

Reported-by: Martin Michaelis <code@mgjm.de>
CC: stable@vger.kernel.org # 4.14+
Reviewed-by: Neal Gompa <neal@gompa.dev>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ioctl.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4e50b62db2a8fe..a1743904202b78 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1290,6 +1290,15 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
 			 * are limited to own subvolumes only
 			 */
 			ret = -EPERM;
+		} else if (btrfs_ino(BTRFS_I(src_inode)) != BTRFS_FIRST_FREE_OBJECTID) {
+			/*
+			 * Snapshots must be made with the src_inode referring
+			 * to the subvolume inode, otherwise the permission
+			 * checking above is useless because we may have
+			 * permission on a lower directory but not the subvol
+			 * itself.
+			 */
+			ret = -EINVAL;
 		} else {
 			ret = btrfs_mksnapshot(&file->f_path, idmap,
 					       name, namelen,

From f7b487648986ecc0510996c5c638c61f5f811ccc Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 11 Dec 2023 18:27:15 -0800
Subject: [PATCH 036/707] lib/find: add atomic find_bit() primitives

Add helpers around test_and_{set,clear}_bit() that allow to search for
clear or set bits and flip them atomically.

The target patterns may look like this:

	for (idx = 0; idx < nbits; idx++)
		if (test_and_clear_bit(idx, bitmap))
			do_something(idx);

Or like this:

	do {
		bit = find_first_bit(bitmap, nbits);
		if (bit >= nbits)
			return nbits;
	} while (!test_and_clear_bit(bit, bitmap));
	return bit;

In both cases, the opencoded loop may be converted to a single function
or iterator call. Correspondingly:

	for_each_test_and_clear_bit(idx, bitmap, nbits)
		do_something(idx);

Or:
	return find_and_clear_bit(bitmap, nbits);

Obviously, the less routine code people have to write themself, the
less probability to make a mistake.

Those are not only handy helpers but also resolve a non-trivial
issue of using non-atomic find_bit() together with atomic
test_and_{set,clear)_bit().

The trick is that find_bit() implies that the bitmap is a regular
non-volatile piece of memory, and compiler is allowed to use such
optimization techniques like re-fetching memory instead of caching it.

For example, find_first_bit() is implemented like this:

      for (idx = 0; idx * BITS_PER_LONG < sz; idx++) {
              val = addr[idx];
              if (val) {
                      sz = min(idx * BITS_PER_LONG + __ffs(val), sz);
                      break;
              }
      }

On register-memory architectures, like x86, compiler may decide to
access memory twice - first time to compare against 0, and second time
to fetch its value to pass it to __ffs().

When running find_first_bit() on volatile memory, the memory may get
changed in-between, and for instance, it may lead to passing 0 to
__ffs(), which is undefined. This is a potentially dangerous call.

find_and_clear_bit() as a wrapper around test_and_clear_bit()
naturally treats underlying bitmap as a volatile memory and prevents
compiler from such optimizations.

Now that KCSAN is catching exactly this type of situations and warns on
undercover memory modifications. We can use it to reveal improper usage
of find_bit(), and convert it to atomic find_and_*_bit() as appropriate.

In some cases concurrent operations with plain find_bit() are acceptable.
For example:

 - two threads running find_*_bit(): safe wrt ffs(0) and returns correct
   value, because underlying bitmap is unchanged;
 - find_next_bit() in parallel with set or clear_bit(), when modifying
   a bit prior to the start bit to search: safe and correct;
 - find_first_bit() in parallel with set_bit(): safe, but may return wrong
   bit number;
 - find_first_zero_bit() in parallel with clear_bit(): same as above.

In last 2 cases find_bit() may not return a correct bit number, but
it may be OK if caller requires any (not exactly the first) set or clear
bit, correspondingly.

In such cases, KCSAN may be safely silenced with data_race(). But in most
cases where KCSAN detects concurrency people should carefully review their
code and likely protect critical sections or switch to atomic
find_and_bit(), as appropriate.

The 1st patch of the series adds the following atomic primitives:

	find_and_set_bit(addr, nbits);
	find_and_set_next_bit(addr, nbits, start);
	...

Here find_and_{set,clear} part refers to the corresponding
test_and_{set,clear}_bit function. Suffixes like _wrap or _lock
derive their semantics from corresponding find() or test() functions.

For brevity, the naming omits the fact that we search for zero bit in
find_and_set, and correspondingly search for set bit in find_and_clear
functions.

The patch also adds iterators with atomic semantics, like
for_each_test_and_set_bit(). Here, the naming rule is to simply prefix
corresponding atomic operation with 'for_each'.

CC: Bart Van Assche <bvanassche@acm.org>
CC: Sergey Shtylyov <s.shtylyov@omp.ru>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/find.h | 293 +++++++++++++++++++++++++++++++++++++++++++
 lib/find_bit.c       |  85 +++++++++++++
 2 files changed, 378 insertions(+)

diff --git a/include/linux/find.h b/include/linux/find.h
index c69598e383c161..50eeeed5d8a34b 100644
--- a/include/linux/find.h
+++ b/include/linux/find.h
@@ -32,6 +32,16 @@ extern unsigned long _find_first_and_bit(const unsigned long *addr1,
 extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size);
 extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size);
 
+unsigned long _find_and_set_bit(volatile unsigned long *addr, unsigned long nbits);
+unsigned long _find_and_set_next_bit(volatile unsigned long *addr, unsigned long nbits,
+				unsigned long start);
+unsigned long _find_and_set_bit_lock(volatile unsigned long *addr, unsigned long nbits);
+unsigned long _find_and_set_next_bit_lock(volatile unsigned long *addr, unsigned long nbits,
+					  unsigned long start);
+unsigned long _find_and_clear_bit(volatile unsigned long *addr, unsigned long nbits);
+unsigned long _find_and_clear_next_bit(volatile unsigned long *addr, unsigned long nbits,
+				unsigned long start);
+
 #ifdef __BIG_ENDIAN
 unsigned long _find_first_zero_bit_le(const unsigned long *addr, unsigned long size);
 unsigned long _find_next_zero_bit_le(const  unsigned long *addr, unsigned
@@ -460,6 +470,267 @@ unsigned long __for_each_wrap(const unsigned long *bitmap, unsigned long size,
 	return bit < start ? bit : size;
 }
 
+/**
+ * find_and_set_bit - Find a zero bit and set it atomically
+ * @addr: The address to base the search on
+ * @nbits: The bitmap size in bits
+ *
+ * This function is designed to operate in concurrent access environment.
+ *
+ * Because of concurrency and volatile nature of underlying bitmap, it's not
+ * guaranteed that the found bit is the 1st bit in the bitmap. It's also not
+ * guaranteed that if @nbits is returned, the bitmap is empty.
+ *
+ * The function does guarantee that if returned value is in range [0 .. @nbits),
+ * the acquired bit belongs to the caller exclusively.
+ *
+ * Returns: found and set bit, or @nbits if no bits found
+ */
+static inline
+unsigned long find_and_set_bit(volatile unsigned long *addr, unsigned long nbits)
+{
+	if (small_const_nbits(nbits)) {
+		unsigned long val, ret;
+
+		do {
+			val = *addr | ~GENMASK(nbits - 1, 0);
+			if (val == ~0UL)
+				return nbits;
+			ret = ffz(val);
+		} while (test_and_set_bit(ret, addr));
+
+		return ret;
+	}
+
+	return _find_and_set_bit(addr, nbits);
+}
+
+
+/**
+ * find_and_set_next_bit - Find a zero bit and set it, starting from @offset
+ * @addr: The address to base the search on
+ * @nbits: The bitmap nbits in bits
+ * @offset: The bitnumber to start searching at
+ *
+ * This function is designed to operate in concurrent access environment.
+ *
+ * Because of concurrency and volatile nature of underlying bitmap, it's not
+ * guaranteed that the found bit is the 1st bit in the bitmap, starting from @offset.
+ * It's also not guaranteed that if @nbits is returned, the bitmap is empty.
+ *
+ * The function does guarantee that if returned value is in range [@offset .. @nbits),
+ * the acquired bit belongs to the caller exclusively.
+ *
+ * Returns: found and set bit, or @nbits if no bits found
+ */
+static inline
+unsigned long find_and_set_next_bit(volatile unsigned long *addr,
+				    unsigned long nbits, unsigned long offset)
+{
+	if (small_const_nbits(nbits)) {
+		unsigned long val, ret;
+
+		do {
+			val = *addr | ~GENMASK(nbits - 1, offset);
+			if (val == ~0UL)
+				return nbits;
+			ret = ffz(val);
+		} while (test_and_set_bit(ret, addr));
+
+		return ret;
+	}
+
+	return _find_and_set_next_bit(addr, nbits, offset);
+}
+
+/**
+ * find_and_set_bit_wrap - find and set bit starting at @offset, wrapping around zero
+ * @addr: The first address to base the search on
+ * @nbits: The bitmap size in bits
+ * @offset: The bitnumber to start searching at
+ *
+ * Returns: the bit number for the next clear bit, or first clear bit up to @offset,
+ * while atomically setting it. If no bits are found, returns @nbits.
+ */
+static inline
+unsigned long find_and_set_bit_wrap(volatile unsigned long *addr,
+					unsigned long nbits, unsigned long offset)
+{
+	unsigned long bit = find_and_set_next_bit(addr, nbits, offset);
+
+	if (bit < nbits || offset == 0)
+		return bit;
+
+	bit = find_and_set_bit(addr, offset);
+	return bit < offset ? bit : nbits;
+}
+
+/**
+ * find_and_set_bit_lock - find a zero bit, then set it atomically with lock
+ * @addr: The address to base the search on
+ * @nbits: The bitmap nbits in bits
+ *
+ * This function is designed to operate in concurrent access environment.
+ *
+ * Because of concurrency and volatile nature of underlying bitmap, it's not
+ * guaranteed that the found bit is the 1st bit in the bitmap. It's also not
+ * guaranteed that if @nbits is returned, the bitmap is empty.
+ *
+ * The function does guarantee that if returned value is in range [0 .. @nbits),
+ * the acquired bit belongs to the caller exclusively.
+ *
+ * Returns: found and set bit, or @nbits if no bits found
+ */
+static inline
+unsigned long find_and_set_bit_lock(volatile unsigned long *addr, unsigned long nbits)
+{
+	if (small_const_nbits(nbits)) {
+		unsigned long val, ret;
+
+		do {
+			val = *addr | ~GENMASK(nbits - 1, 0);
+			if (val == ~0UL)
+				return nbits;
+			ret = ffz(val);
+		} while (test_and_set_bit_lock(ret, addr));
+
+		return ret;
+	}
+
+	return _find_and_set_bit_lock(addr, nbits);
+}
+
+/**
+ * find_and_set_next_bit_lock - find a zero bit and set it atomically with lock
+ * @addr: The address to base the search on
+ * @nbits: The bitmap size in bits
+ * @offset: The bitnumber to start searching at
+ *
+ * This function is designed to operate in concurrent access environment.
+ *
+ * Because of concurrency and volatile nature of underlying bitmap, it's not
+ * guaranteed that the found bit is the 1st bit in the range. It's also not
+ * guaranteed that if @nbits is returned, the bitmap is empty.
+ *
+ * The function does guarantee that if returned value is in range [@offset .. @nbits),
+ * the acquired bit belongs to the caller exclusively.
+ *
+ * Returns: found and set bit, or @nbits if no bits found
+ */
+static inline
+unsigned long find_and_set_next_bit_lock(volatile unsigned long *addr,
+					 unsigned long nbits, unsigned long offset)
+{
+	if (small_const_nbits(nbits)) {
+		unsigned long val, ret;
+
+		do {
+			val = *addr | ~GENMASK(nbits - 1, offset);
+			if (val == ~0UL)
+				return nbits;
+			ret = ffz(val);
+		} while (test_and_set_bit_lock(ret, addr));
+
+		return ret;
+	}
+
+	return _find_and_set_next_bit_lock(addr, nbits, offset);
+}
+
+/**
+ * find_and_set_bit_wrap_lock - find zero bit starting at @ofset and set it
+ *				with lock, and wrap around zero if nothing found
+ * @addr: The first address to base the search on
+ * @nbits: The bitmap size in bits
+ * @offset: The bitnumber to start searching at
+ *
+ * Returns: the bit number for the next set bit, or first set bit up to @offset
+ * If no bits are set, returns @nbits.
+ */
+static inline
+unsigned long find_and_set_bit_wrap_lock(volatile unsigned long *addr,
+					unsigned long nbits, unsigned long offset)
+{
+	unsigned long bit = find_and_set_next_bit_lock(addr, nbits, offset);
+
+	if (bit < nbits || offset == 0)
+		return bit;
+
+	bit = find_and_set_bit_lock(addr, offset);
+	return bit < offset ? bit : nbits;
+}
+
+/**
+ * find_and_clear_bit - Find a set bit and clear it atomically
+ * @addr: The address to base the search on
+ * @nbits: The bitmap nbits in bits
+ *
+ * This function is designed to operate in concurrent access environment.
+ *
+ * Because of concurrency and volatile nature of underlying bitmap, it's not
+ * guaranteed that the found bit is the 1st bit in the bitmap. It's also not
+ * guaranteed that if @nbits is returned, the bitmap is empty.
+ *
+ * The function does guarantee that if returned value is in range [0 .. @nbits),
+ * the acquired bit belongs to the caller exclusively.
+ *
+ * Returns: found and cleared bit, or @nbits if no bits found
+ */
+static inline unsigned long find_and_clear_bit(volatile unsigned long *addr, unsigned long nbits)
+{
+	if (small_const_nbits(nbits)) {
+		unsigned long val, ret;
+
+		do {
+			val = *addr & GENMASK(nbits - 1, 0);
+			if (val == 0)
+				return nbits;
+			ret = __ffs(val);
+		} while (!test_and_clear_bit(ret, addr));
+
+		return ret;
+	}
+
+	return _find_and_clear_bit(addr, nbits);
+}
+
+/**
+ * find_and_clear_next_bit - Find a set bit next after @offset, and clear it atomically
+ * @addr: The address to base the search on
+ * @nbits: The bitmap nbits in bits
+ * @offset: bit offset at which to start searching
+ *
+ * This function is designed to operate in concurrent access environment.
+ *
+ * Because of concurrency and volatile nature of underlying bitmap, it's not
+ * guaranteed that the found bit is the 1st bit in the range It's also not
+ * guaranteed that if @nbits is returned, there's no set bits after @offset.
+ *
+ * The function does guarantee that if returned value is in range [@offset .. @nbits),
+ * the acquired bit belongs to the caller exclusively.
+ *
+ * Returns: found and cleared bit, or @nbits if no bits found
+ */
+static inline
+unsigned long find_and_clear_next_bit(volatile unsigned long *addr,
+					unsigned long nbits, unsigned long offset)
+{
+	if (small_const_nbits(nbits)) {
+		unsigned long val, ret;
+
+		do {
+			val = *addr & GENMASK(nbits - 1, offset);
+			if (val == 0)
+				return nbits;
+			ret = __ffs(val);
+		} while (!test_and_clear_bit(ret, addr));
+
+		return ret;
+	}
+
+	return _find_and_clear_next_bit(addr, nbits, offset);
+}
+
 /**
  * find_next_clump8 - find next 8-bit clump with set bits in a memory region
  * @clump: location to store copy of found clump
@@ -577,6 +848,28 @@ unsigned long find_next_bit_le(const void *addr, unsigned
 #define for_each_set_bit_from(bit, addr, size) \
 	for (; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++)
 
+/* same as for_each_set_bit() but atomically clears each found bit */
+#define for_each_test_and_clear_bit(bit, addr, size) \
+	for ((bit) = 0; \
+	     (bit) = find_and_clear_next_bit((addr), (size), (bit)), (bit) < (size); \
+	     (bit)++)
+
+/* same as for_each_set_bit_from() but atomically clears each found bit */
+#define for_each_test_and_clear_bit_from(bit, addr, size) \
+	for (; (bit) = find_and_clear_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++)
+
+/* same as for_each_clear_bit() but atomically sets each found bit */
+#define for_each_test_and_set_bit(bit, addr, size) \
+	for ((bit) = 0; \
+	     (bit) = find_and_set_next_bit((addr), (size), (bit)), (bit) < (size); \
+	     (bit)++)
+
+/* same as for_each_clear_bit_from() but atomically clears each found bit */
+#define for_each_test_and_set_bit_from(bit, addr, size) \
+	for (; \
+	     (bit) = find_and_set_next_bit((addr), (size), (bit)), (bit) < (size); \
+	     (bit)++)
+
 #define for_each_clear_bit(bit, addr, size) \
 	for ((bit) = 0;									\
 	     (bit) = find_next_zero_bit((addr), (size), (bit)), (bit) < (size);		\
diff --git a/lib/find_bit.c b/lib/find_bit.c
index 32f99e9a670e64..c9b6b9f966108f 100644
--- a/lib/find_bit.c
+++ b/lib/find_bit.c
@@ -116,6 +116,91 @@ unsigned long _find_first_and_bit(const unsigned long *addr1,
 EXPORT_SYMBOL(_find_first_and_bit);
 #endif
 
+unsigned long _find_and_set_bit(volatile unsigned long *addr, unsigned long nbits)
+{
+	unsigned long bit;
+
+	do {
+		bit = FIND_FIRST_BIT(~addr[idx], /* nop */, nbits);
+		if (bit >= nbits)
+			return nbits;
+	} while (test_and_set_bit(bit, addr));
+
+	return bit;
+}
+EXPORT_SYMBOL(_find_and_set_bit);
+
+unsigned long _find_and_set_next_bit(volatile unsigned long *addr,
+				     unsigned long nbits, unsigned long start)
+{
+	unsigned long bit;
+
+	do {
+		bit = FIND_NEXT_BIT(~addr[idx], /* nop */, nbits, start);
+		if (bit >= nbits)
+			return nbits;
+	} while (test_and_set_bit(bit, addr));
+
+	return bit;
+}
+EXPORT_SYMBOL(_find_and_set_next_bit);
+
+unsigned long _find_and_set_bit_lock(volatile unsigned long *addr, unsigned long nbits)
+{
+	unsigned long bit;
+
+	do {
+		bit = FIND_FIRST_BIT(~addr[idx], /* nop */, nbits);
+		if (bit >= nbits)
+			return nbits;
+	} while (test_and_set_bit_lock(bit, addr));
+
+	return bit;
+}
+EXPORT_SYMBOL(_find_and_set_bit_lock);
+
+unsigned long _find_and_set_next_bit_lock(volatile unsigned long *addr,
+					  unsigned long nbits, unsigned long start)
+{
+	unsigned long bit;
+
+	do {
+		bit = FIND_NEXT_BIT(~addr[idx], /* nop */, nbits, start);
+		if (bit >= nbits)
+			return nbits;
+	} while (test_and_set_bit_lock(bit, addr));
+
+	return bit;
+}
+EXPORT_SYMBOL(_find_and_set_next_bit_lock);
+
+unsigned long _find_and_clear_bit(volatile unsigned long *addr, unsigned long nbits)
+{
+	unsigned long bit;
+
+	do {
+		bit = FIND_FIRST_BIT(addr[idx], /* nop */, nbits);
+		if (bit >= nbits)
+			return nbits;
+	} while (!test_and_clear_bit(bit, addr));
+
+	return bit;
+}
+EXPORT_SYMBOL(_find_and_clear_bit);
+
+unsigned long _find_and_clear_next_bit(volatile unsigned long *addr,
+					unsigned long nbits, unsigned long start)
+{
+	do {
+		start =  FIND_NEXT_BIT(addr[idx], /* nop */, nbits, start);
+		if (start >= nbits)
+			return nbits;
+	} while (!test_and_clear_bit(start, addr));
+
+	return start;
+}
+EXPORT_SYMBOL(_find_and_clear_next_bit);
+
 #ifndef find_first_zero_bit
 /*
  * Find the first cleared bit in a memory region.

From 9297e20670743257f6c3fe7ebd6be5802b1dc8c7 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 11 Dec 2023 18:27:16 -0800
Subject: [PATCH 037/707] lib/find: add test for atomic find_bit() ops

Add basic functionality test for new API.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 lib/test_bitmap.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c
index 65f22c2578b066..277e1ca9fd2847 100644
--- a/lib/test_bitmap.c
+++ b/lib/test_bitmap.c
@@ -221,6 +221,65 @@ static void __init test_zero_clear(void)
 	expect_eq_pbl("", bmap, 1024);
 }
 
+static void __init test_find_and_bit(void)
+{
+	unsigned long w, w_part, bit, cnt = 0;
+	DECLARE_BITMAP(bmap, EXP1_IN_BITS);
+
+	/*
+	 * Test find_and_clear{_next}_bit() and corresponding
+	 * iterators
+	 */
+	bitmap_copy(bmap, exp1, EXP1_IN_BITS);
+	w = bitmap_weight(bmap, EXP1_IN_BITS);
+
+	for_each_test_and_clear_bit(bit, bmap, EXP1_IN_BITS)
+		cnt++;
+
+	expect_eq_uint(w, cnt);
+	expect_eq_uint(0, bitmap_weight(bmap, EXP1_IN_BITS));
+
+	bitmap_copy(bmap, exp1, EXP1_IN_BITS);
+	w = bitmap_weight(bmap, EXP1_IN_BITS);
+	w_part = bitmap_weight(bmap, EXP1_IN_BITS / 3);
+
+	cnt = 0;
+	bit = EXP1_IN_BITS / 3;
+	for_each_test_and_clear_bit_from(bit, bmap, EXP1_IN_BITS)
+		cnt++;
+
+	expect_eq_uint(bitmap_weight(bmap, EXP1_IN_BITS), bitmap_weight(bmap, EXP1_IN_BITS / 3));
+	expect_eq_uint(w_part, bitmap_weight(bmap, EXP1_IN_BITS));
+	expect_eq_uint(w - w_part, cnt);
+
+	/*
+	 * Test find_and_set{_next}_bit() and corresponding
+	 * iterators
+	 */
+	bitmap_copy(bmap, exp1, EXP1_IN_BITS);
+	w = bitmap_weight(bmap, EXP1_IN_BITS);
+	cnt = 0;
+
+	for_each_test_and_set_bit(bit, bmap, EXP1_IN_BITS)
+		cnt++;
+
+	expect_eq_uint(EXP1_IN_BITS - w, cnt);
+	expect_eq_uint(EXP1_IN_BITS, bitmap_weight(bmap, EXP1_IN_BITS));
+
+	bitmap_copy(bmap, exp1, EXP1_IN_BITS);
+	w = bitmap_weight(bmap, EXP1_IN_BITS);
+	w_part = bitmap_weight(bmap, EXP1_IN_BITS / 3);
+	cnt = 0;
+
+	bit = EXP1_IN_BITS / 3;
+	for_each_test_and_set_bit_from(bit, bmap, EXP1_IN_BITS)
+		cnt++;
+
+	expect_eq_uint(EXP1_IN_BITS - bitmap_weight(bmap, EXP1_IN_BITS),
+			EXP1_IN_BITS / 3 - bitmap_weight(bmap, EXP1_IN_BITS / 3));
+	expect_eq_uint(EXP1_IN_BITS * 2 / 3 - (w - w_part), cnt);
+}
+
 static void __init test_find_nth_bit(void)
 {
 	unsigned long b, bit, cnt = 0;
@@ -1273,6 +1332,8 @@ static void __init selftest(void)
 	test_for_each_clear_bitrange_from();
 	test_for_each_set_clump8();
 	test_for_each_set_bit_wrap();
+
+	test_find_and_bit();
 }
 
 KSTM_MODULE_LOADERS(test_bitmap);

From 0af7b0df61f906c57568b8730df70058820a7613 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 11 Dec 2023 18:27:17 -0800
Subject: [PATCH 038/707] lib/sbitmap; optimize __sbitmap_get_word() by using
 find_and_set_bit()

__sbitmap_get_word() opencodes either find_and_set_bit_wrap(), or
find_and_set_next_bit() depending on wrap parameter. Simplify it by using
atomic find_bit() API.

While here, simplify sbitmap_find_bit_in_word(), which calls it.

CC: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 lib/sbitmap.c | 46 +++++++++-------------------------------------
 1 file changed, 9 insertions(+), 37 deletions(-)

diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index d0a5081dfd122e..8ecd830ba9e896 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -133,38 +133,13 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth)
 }
 EXPORT_SYMBOL_GPL(sbitmap_resize);
 
-static int __sbitmap_get_word(unsigned long *word, unsigned long depth,
+static inline int __sbitmap_get_word(unsigned long *word, unsigned long depth,
 			      unsigned int hint, bool wrap)
 {
-	int nr;
-
-	/* don't wrap if starting from 0 */
-	wrap = wrap && hint;
-
-	while (1) {
-		nr = find_next_zero_bit(word, depth, hint);
-		if (unlikely(nr >= depth)) {
-			/*
-			 * We started with an offset, and we didn't reset the
-			 * offset to 0 in a failure case, so start from 0 to
-			 * exhaust the map.
-			 */
-			if (hint && wrap) {
-				hint = 0;
-				continue;
-			}
-			return -1;
-		}
+	if (wrap)
+		return find_and_set_bit_wrap_lock(word, depth, hint);
 
-		if (!test_and_set_bit_lock(nr, word))
-			break;
-
-		hint = nr + 1;
-		if (hint >= depth - 1)
-			hint = 0;
-	}
-
-	return nr;
+	return find_and_set_next_bit_lock(word, depth, hint);
 }
 
 static int sbitmap_find_bit_in_word(struct sbitmap_word *map,
@@ -175,15 +150,12 @@ static int sbitmap_find_bit_in_word(struct sbitmap_word *map,
 	int nr;
 
 	do {
-		nr = __sbitmap_get_word(&map->word, depth,
-					alloc_hint, wrap);
-		if (nr != -1)
-			break;
-		if (!sbitmap_deferred_clear(map))
-			break;
-	} while (1);
+		nr = __sbitmap_get_word(&map->word, depth, alloc_hint, wrap);
+		if (nr < depth)
+			return nr;
+	} while (sbitmap_deferred_clear(map));
 
-	return nr;
+	return -1;
 }
 
 static int sbitmap_find_bit(struct sbitmap *sb,

From fc3bdc592a724edeb6546f39f168067571007d7a Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 11 Dec 2023 18:27:18 -0800
Subject: [PATCH 039/707] watch_queue: optimize post_one_notification() by
 using find_and_clear_bit()

post_one_notification() searches for a set bit in wqueue->notes_bitmap,
and after some housekeeping work clears it, firing a BUG() if someone
else cleared the bit in-between.

We can allocate the bit atomically with an atomic find_and_clear_bit(),
and remove the BUG() possibility entirely.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 kernel/watch_queue.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index 778b4056700ff5..07edd4a2b4636b 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -112,7 +112,7 @@ static bool post_one_notification(struct watch_queue *wqueue,
 	if (pipe_full(head, tail, pipe->ring_size))
 		goto lost;
 
-	note = find_first_bit(wqueue->notes_bitmap, wqueue->nr_notes);
+	note = find_and_clear_bit(wqueue->notes_bitmap, wqueue->nr_notes);
 	if (note >= wqueue->nr_notes)
 		goto lost;
 
@@ -133,10 +133,6 @@ static bool post_one_notification(struct watch_queue *wqueue,
 	buf->flags = PIPE_BUF_FLAG_WHOLE;
 	smp_store_release(&pipe->head, head + 1); /* vs pipe_read() */
 
-	if (!test_and_clear_bit(note, wqueue->notes_bitmap)) {
-		spin_unlock_irq(&pipe->rd_wait.lock);
-		BUG();
-	}
 	wake_up_interruptible_sync_poll_locked(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
 	done = true;
 

From cd6c08c6647d524d71f51428b78fe72590d42c16 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 11 Dec 2023 18:27:19 -0800
Subject: [PATCH 040/707] sched: add cpumask_find_and_set() and use it in
 __mm_cid_get()

__mm_cid_get() uses __mm_cid_try_get() helper to atomically acquire a
bit in mm cid mask. Now that we have atomic find_and_set_bit(), we can
easily extend it to cpumasks and use in the scheduler code.

cpumask_find_and_set() considers cid mask as a volatile region of memory,
as it actually is in this case. So, if it's changed while search is in
progress, KCSAN wouldn't fire warning on it.

CC: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
---
 include/linux/cpumask.h | 12 ++++++++++++
 kernel/sched/sched.h    | 14 +++++---------
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index cfb545841a2c74..c2acced8be4ece 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -271,6 +271,18 @@ unsigned int cpumask_next_and(int n, const struct cpumask *src1p,
 		small_cpumask_bits, n + 1);
 }
 
+/**
+ * cpumask_find_and_set - find the first unset cpu in a cpumask and
+ *			  set it atomically
+ * @srcp: the cpumask pointer
+ *
+ * Return: >= nr_cpu_ids if nothing is found.
+ */
+static inline unsigned int cpumask_find_and_set(volatile struct cpumask *srcp)
+{
+	return find_and_set_bit(cpumask_bits(srcp), small_cpumask_bits);
+}
+
 /**
  * for_each_cpu - iterate over every cpu in a mask
  * @cpu: the (optionally unsigned) integer iterator
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2e5a95486a4222..2ce9112de89be1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3347,23 +3347,19 @@ static inline void mm_cid_put(struct mm_struct *mm)
 
 static inline int __mm_cid_try_get(struct mm_struct *mm)
 {
-	struct cpumask *cpumask;
-	int cid;
+	struct cpumask *cpumask = mm_cidmask(mm);
+	int cid = nr_cpu_ids;
 
-	cpumask = mm_cidmask(mm);
 	/*
 	 * Retry finding first zero bit if the mask is temporarily
 	 * filled. This only happens during concurrent remote-clear
 	 * which owns a cid without holding a rq lock.
 	 */
-	for (;;) {
-		cid = cpumask_first_zero(cpumask);
-		if (cid < nr_cpu_ids)
-			break;
+	while (cid >= nr_cpu_ids) {
+		cid = cpumask_find_and_set(cpumask);
 		cpu_relax();
 	}
-	if (cpumask_test_and_set_cpu(cid, cpumask))
-		return -1;
+
 	return cid;
 }
 

From 991411e2febc2f472d7ace069bb371d1bf2f6df4 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 11 Dec 2023 18:27:20 -0800
Subject: [PATCH 041/707] mips: sgi-ip30: optimize heart_alloc_int() by using
 find_and_set_bit()

heart_alloc_int() opencodes find_and_set_bit(). Simplify it by using the
dedicated function, and make an nice one-liner.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 arch/mips/sgi-ip30/ip30-irq.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/arch/mips/sgi-ip30/ip30-irq.c b/arch/mips/sgi-ip30/ip30-irq.c
index 423c32cb66ed52..3c4d4e947817fb 100644
--- a/arch/mips/sgi-ip30/ip30-irq.c
+++ b/arch/mips/sgi-ip30/ip30-irq.c
@@ -28,17 +28,9 @@ static DEFINE_PER_CPU(unsigned long, irq_enable_mask);
 
 static inline int heart_alloc_int(void)
 {
-	int bit;
+	int bit = find_and_set_bit(heart_irq_map, HEART_NUM_IRQS);
 
-again:
-	bit = find_first_zero_bit(heart_irq_map, HEART_NUM_IRQS);
-	if (bit >= HEART_NUM_IRQS)
-		return -ENOSPC;
-
-	if (test_and_set_bit(bit, heart_irq_map))
-		goto again;
-
-	return bit;
+	return bit < HEART_NUM_IRQS ? bit : -ENOSPC;
 }
 
 static void ip30_error_irq(struct irq_desc *desc)

From 448a89c116ca108606670621012cc5424016410f Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 11 Dec 2023 18:27:21 -0800
Subject: [PATCH 042/707] sparc: optimize alloc_msi() by using
 find_and_set_bit()

alloc_msi() opencodes find_and_set_bit(). Simplify it by using the
dedicated function, and make an nice one-liner.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 arch/sparc/kernel/pci_msi.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/arch/sparc/kernel/pci_msi.c b/arch/sparc/kernel/pci_msi.c
index fc7402948b7bc0..91105c788d1d9d 100644
--- a/arch/sparc/kernel/pci_msi.c
+++ b/arch/sparc/kernel/pci_msi.c
@@ -96,14 +96,9 @@ static u32 pick_msiq(struct pci_pbm_info *pbm)
 
 static int alloc_msi(struct pci_pbm_info *pbm)
 {
-	int i;
-
-	for (i = 0; i < pbm->msi_num; i++) {
-		if (!test_and_set_bit(i, pbm->msi_bitmap))
-			return i + pbm->msi_first;
-	}
+	int i = find_and_set_bit(pbm->msi_bitmap, pbm->msi_num);
 
-	return -ENOENT;
+	return i < pbm->msi_num ? i + pbm->msi_first : -ENOENT;
 }
 
 static void free_msi(struct pci_pbm_info *pbm, int msi_num)

From e905d8a7d76b2b45f70ff0c8584d2ce99ee4c387 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 11 Dec 2023 18:27:22 -0800
Subject: [PATCH 043/707] perf/arm: use atomic find_bit() API

Simplify subsystem by use atomic find_bit() or atomic API where
applicable.

CC: Will Deacon <will@kernel.org>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 drivers/perf/arm-cci.c        | 24 ++++++------------------
 drivers/perf/arm-ccn.c        | 10 ++--------
 drivers/perf/arm_dmc620_pmu.c |  9 ++-------
 drivers/perf/arm_pmuv3.c      |  8 ++------
 4 files changed, 12 insertions(+), 39 deletions(-)

diff --git a/drivers/perf/arm-cci.c b/drivers/perf/arm-cci.c
index 61de861eaf91e3..cb15b4cee5f75e 100644
--- a/drivers/perf/arm-cci.c
+++ b/drivers/perf/arm-cci.c
@@ -320,12 +320,9 @@ static int cci400_get_event_idx(struct cci_pmu *cci_pmu,
 		return CCI400_PMU_CYCLE_CNTR_IDX;
 	}
 
-	for (idx = CCI400_PMU_CNTR0_IDX; idx <= CCI_PMU_CNTR_LAST(cci_pmu); ++idx)
-		if (!test_and_set_bit(idx, hw->used_mask))
-			return idx;
-
-	/* No counters available */
-	return -EAGAIN;
+	idx = find_and_set_next_bit(hw->used_mask, CCI_PMU_CNTR_LAST(cci_pmu) + 1,
+							CCI400_PMU_CNTR0_IDX);
+	return idx < CCI_PMU_CNTR_LAST(cci_pmu) + 1 ? idx : -EAGAIN;
 }
 
 static int cci400_validate_hw_event(struct cci_pmu *cci_pmu, unsigned long hw_event)
@@ -802,13 +799,8 @@ static int pmu_get_event_idx(struct cci_pmu_hw_events *hw, struct perf_event *ev
 	if (cci_pmu->model->get_event_idx)
 		return cci_pmu->model->get_event_idx(cci_pmu, hw, cci_event);
 
-	/* Generic code to find an unused idx from the mask */
-	for (idx = 0; idx <= CCI_PMU_CNTR_LAST(cci_pmu); idx++)
-		if (!test_and_set_bit(idx, hw->used_mask))
-			return idx;
-
-	/* No counters available */
-	return -EAGAIN;
+	idx = find_and_set_bit(hw->used_mask, CCI_PMU_CNTR_LAST(cci_pmu) + 1);
+	return idx < CCI_PMU_CNTR_LAST(cci_pmu) + 1 ? idx : -EAGAIN;
 }
 
 static int pmu_map_event(struct perf_event *event)
@@ -861,12 +853,8 @@ static void pmu_free_irq(struct cci_pmu *cci_pmu)
 {
 	int i;
 
-	for (i = 0; i < cci_pmu->nr_irqs; i++) {
-		if (!test_and_clear_bit(i, &cci_pmu->active_irqs))
-			continue;
-
+	for_each_test_and_clear_bit(i, &cci_pmu->active_irqs, cci_pmu->nr_irqs)
 		free_irq(cci_pmu->irqs[i], cci_pmu);
-	}
 }
 
 static u32 pmu_read_counter(struct perf_event *event)
diff --git a/drivers/perf/arm-ccn.c b/drivers/perf/arm-ccn.c
index 728d13d8e98ac9..d657701b1f236c 100644
--- a/drivers/perf/arm-ccn.c
+++ b/drivers/perf/arm-ccn.c
@@ -589,15 +589,9 @@ static const struct attribute_group *arm_ccn_pmu_attr_groups[] = {
 
 static int arm_ccn_pmu_alloc_bit(unsigned long *bitmap, unsigned long size)
 {
-	int bit;
-
-	do {
-		bit = find_first_zero_bit(bitmap, size);
-		if (bit >= size)
-			return -EAGAIN;
-	} while (test_and_set_bit(bit, bitmap));
+	int bit = find_and_set_bit(bitmap, size);
 
-	return bit;
+	return bit < size ? bit : -EAGAIN;
 }
 
 /* All RN-I and RN-D nodes have identical PMUs */
diff --git a/drivers/perf/arm_dmc620_pmu.c b/drivers/perf/arm_dmc620_pmu.c
index 30cea685957470..e41c84dabc3ebe 100644
--- a/drivers/perf/arm_dmc620_pmu.c
+++ b/drivers/perf/arm_dmc620_pmu.c
@@ -303,13 +303,8 @@ static int dmc620_get_event_idx(struct perf_event *event)
 		end_idx = DMC620_PMU_MAX_COUNTERS;
 	}
 
-	for (idx = start_idx; idx < end_idx; ++idx) {
-		if (!test_and_set_bit(idx, dmc620_pmu->used_mask))
-			return idx;
-	}
-
-	/* The counters are all in use. */
-	return -EAGAIN;
+	idx = find_and_set_next_bit(dmc620_pmu->used_mask, end_idx, start_idx);
+	return idx < end_idx ? idx : -EAGAIN;
 }
 
 static inline
diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
index 6ca7be05229c10..f046ad9e71f1aa 100644
--- a/drivers/perf/arm_pmuv3.c
+++ b/drivers/perf/arm_pmuv3.c
@@ -825,13 +825,9 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
 static int armv8pmu_get_single_idx(struct pmu_hw_events *cpuc,
 				    struct arm_pmu *cpu_pmu)
 {
-	int idx;
+	int idx = find_and_set_next_bit(cpuc->used_mask, cpu_pmu->num_events, ARMV8_IDX_COUNTER0);
 
-	for (idx = ARMV8_IDX_COUNTER0; idx < cpu_pmu->num_events; idx++) {
-		if (!test_and_set_bit(idx, cpuc->used_mask))
-			return idx;
-	}
-	return -EAGAIN;
+	return idx < cpu_pmu->num_events ? idx : -EAGAIN;
 }
 
 static int armv8pmu_get_chain_idx(struct pmu_hw_events *cpuc,

From f5f61c6f9f2771aeabdb796302a378e50080d6bf Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 11 Dec 2023 18:27:23 -0800
Subject: [PATCH 044/707] drivers/perf: optimize ali_drw_get_counter_idx() by
 using find_and_set_bit()

The function searches used_mask for a set bit in a for-loop bit by bit.
Simplify it by using atomic find_and_set_bit().

Signed-off-by: Yury Norov <yury.norov@gmail.com>
Acked-by: Will Deacon <will@kernel.org>
---
 drivers/perf/alibaba_uncore_drw_pmu.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/drivers/perf/alibaba_uncore_drw_pmu.c b/drivers/perf/alibaba_uncore_drw_pmu.c
index 19d459a36be55c..2a3b7701d568bd 100644
--- a/drivers/perf/alibaba_uncore_drw_pmu.c
+++ b/drivers/perf/alibaba_uncore_drw_pmu.c
@@ -274,15 +274,9 @@ static const struct attribute_group *ali_drw_pmu_attr_groups[] = {
 static int ali_drw_get_counter_idx(struct perf_event *event)
 {
 	struct ali_drw_pmu *drw_pmu = to_ali_drw_pmu(event->pmu);
-	int idx;
+	int idx = find_and_set_bit(drw_pmu->used_mask, ALI_DRW_PMU_COMMON_MAX_COUNTERS);
 
-	for (idx = 0; idx < ALI_DRW_PMU_COMMON_MAX_COUNTERS; ++idx) {
-		if (!test_and_set_bit(idx, drw_pmu->used_mask))
-			return idx;
-	}
-
-	/* The counters are all in use. */
-	return -EBUSY;
+	return idx < ALI_DRW_PMU_COMMON_MAX_COUNTERS ? idx : -EBUSY;
 }
 
 static u64 ali_drw_pmu_read_counter(struct perf_event *event)

From 37cd1b38270a3eb1555b533ad597252f5bfd9ecc Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 11 Dec 2023 18:27:24 -0800
Subject: [PATCH 045/707] dmaengine: idxd: optimize perfmon_assign_event()

The function searches used_mask for a set bit in a for-loop bit by bit.
Simplify it by using atomic find_and_set_bit(), and make a nice
one-liner.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Acked-by: Vinod Koul <vkoul@kernel.org>
Reviewed-by: Fenghua Yu <fenghua.yu@intel.com>
---
 drivers/dma/idxd/perfmon.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/dma/idxd/perfmon.c b/drivers/dma/idxd/perfmon.c
index fdda6d60426295..4dd9c0d979c388 100644
--- a/drivers/dma/idxd/perfmon.c
+++ b/drivers/dma/idxd/perfmon.c
@@ -134,13 +134,9 @@ static void perfmon_assign_hw_event(struct idxd_pmu *idxd_pmu,
 static int perfmon_assign_event(struct idxd_pmu *idxd_pmu,
 				struct perf_event *event)
 {
-	int i;
-
-	for (i = 0; i < IDXD_PMU_EVENT_MAX; i++)
-		if (!test_and_set_bit(i, idxd_pmu->used_mask))
-			return i;
+	int i = find_and_set_bit(idxd_pmu->used_mask, IDXD_PMU_EVENT_MAX);
 
-	return -EINVAL;
+	return i < IDXD_PMU_EVENT_MAX ? i : -EINVAL;
 }
 
 /*

From 10922d08df496bc8e0963ce810936f30ad56c81c Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 11 Dec 2023 18:27:25 -0800
Subject: [PATCH 046/707] ath10k: optimize ath10k_snoc_napi_poll()

ath10k_snoc_napi_poll() traverses pending_ce_irqs bitmap bit by bit.
Simplify it by using for_each_test_and_clear_bit() iterator.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 drivers/net/wireless/ath/ath10k/snoc.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/snoc.c b/drivers/net/wireless/ath/ath10k/snoc.c
index 2c39bad7ebfb9a..a1db5a973780c4 100644
--- a/drivers/net/wireless/ath/ath10k/snoc.c
+++ b/drivers/net/wireless/ath/ath10k/snoc.c
@@ -1237,11 +1237,10 @@ static int ath10k_snoc_napi_poll(struct napi_struct *ctx, int budget)
 		return done;
 	}
 
-	for (ce_id = 0; ce_id < CE_COUNT; ce_id++)
-		if (test_and_clear_bit(ce_id, ar_snoc->pending_ce_irqs)) {
-			ath10k_ce_per_engine_service(ar, ce_id);
-			ath10k_ce_enable_interrupt(ar, ce_id);
-		}
+	for_each_test_and_clear_bit(ce_id, ar_snoc->pending_ce_irqs, CE_COUNT) {
+		ath10k_ce_per_engine_service(ar, ce_id);
+		ath10k_ce_enable_interrupt(ar, ce_id);
+	}
 
 	done = ath10k_htt_txrx_compl_task(ar, budget);
 

From f55f49707defc7939df686487535ac5702de9a67 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 11 Dec 2023 18:27:26 -0800
Subject: [PATCH 047/707] wifi: rtw88: optimize the driver by using atomic
 iterator

rtw_pci_tx_kick_off() and rtw89_pci_tx_kick_off_pending() traverse bitmaps
bit by bit. Simplify it by using atomic for_each_test_and_clear_bit()
iterator.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 drivers/net/wireless/realtek/rtw88/pci.c | 5 ++---
 drivers/net/wireless/realtek/rtw89/pci.c | 5 +----
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/drivers/net/wireless/realtek/rtw88/pci.c b/drivers/net/wireless/realtek/rtw88/pci.c
index 2bfc0e822b8d0b..a0d69c75a38185 100644
--- a/drivers/net/wireless/realtek/rtw88/pci.c
+++ b/drivers/net/wireless/realtek/rtw88/pci.c
@@ -789,9 +789,8 @@ static void rtw_pci_tx_kick_off(struct rtw_dev *rtwdev)
 	struct rtw_pci *rtwpci = (struct rtw_pci *)rtwdev->priv;
 	enum rtw_tx_queue_type queue;
 
-	for (queue = 0; queue < RTK_MAX_TX_QUEUE_NUM; queue++)
-		if (test_and_clear_bit(queue, rtwpci->tx_queued))
-			rtw_pci_tx_kick_off_queue(rtwdev, queue);
+	for_each_test_and_clear_bit(queue, rtwpci->tx_queued, RTK_MAX_TX_QUEUE_NUM)
+		rtw_pci_tx_kick_off_queue(rtwdev, queue);
 }
 
 static int rtw_pci_tx_write_data(struct rtw_dev *rtwdev,
diff --git a/drivers/net/wireless/realtek/rtw89/pci.c b/drivers/net/wireless/realtek/rtw89/pci.c
index 14ddb0d39e6374..184d41b774d7c3 100644
--- a/drivers/net/wireless/realtek/rtw89/pci.c
+++ b/drivers/net/wireless/realtek/rtw89/pci.c
@@ -1077,10 +1077,7 @@ static void rtw89_pci_tx_kick_off_pending(struct rtw89_dev *rtwdev)
 	struct rtw89_pci_tx_ring *tx_ring;
 	int txch;
 
-	for (txch = 0; txch < RTW89_TXCH_NUM; txch++) {
-		if (!test_and_clear_bit(txch, rtwpci->kick_map))
-			continue;
-
+	for_each_test_and_clear_bit(txch, rtwpci->kick_map, RTW89_TXCH_NUM) {
 		tx_ring = &rtwpci->tx_rings[txch];
 		__rtw89_pci_tx_kick_off(rtwdev, tx_ring);
 	}

From 252479be16f72ecd6a0cf0ab88d79cac70eee826 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 11 Dec 2023 18:27:27 -0800
Subject: [PATCH 048/707] KVM: x86: hyper-v: optimize and cleanup
 kvm_hv_process_stimers()

The function traverses stimer_pending_bitmap in a for-loop bit by bit.
Simplify it by using atomic for_each_test_and_clear_bit().

Because there are only 4 bits, using for_each_test_and_clear_bit() will
still generate inline code, so no excessive bloating with the new API.

While here, refactor the logic by decreasing indentation level.

CC: Sean Christopherson <seanjc@google.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Acked-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/hyperv.c | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 238afd7335e46d..d541524ca49f74 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -870,27 +870,27 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu)
 	if (!hv_vcpu)
 		return;
 
-	for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
-		if (test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap)) {
-			stimer = &hv_vcpu->stimer[i];
-			if (stimer->config.enable) {
-				exp_time = stimer->exp_time;
-
-				if (exp_time) {
-					time_now =
-						get_time_ref_counter(vcpu->kvm);
-					if (time_now >= exp_time)
-						stimer_expiration(stimer);
-				}
-
-				if ((stimer->config.enable) &&
-				    stimer->count) {
-					if (!stimer->msg_pending)
-						stimer_start(stimer);
-				} else
-					stimer_cleanup(stimer);
-			}
+	for_each_test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap,
+				    ARRAY_SIZE(hv_vcpu->stimer)) {
+		stimer = &hv_vcpu->stimer[i];
+		if (!stimer->config.enable)
+			continue;
+
+		exp_time = stimer->exp_time;
+
+		if (exp_time) {
+			time_now = get_time_ref_counter(vcpu->kvm);
+			if (time_now >= exp_time)
+				stimer_expiration(stimer);
 		}
+
+		if (stimer->config.enable && stimer->count) {
+			if (!stimer->msg_pending)
+				stimer_start(stimer);
+		} else {
+			stimer_cleanup(stimer);
+		}
+	}
 }
 
 void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu)

From 020c02d58ef080a486c05f0adf8dd146e2549a74 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 11 Dec 2023 18:27:28 -0800
Subject: [PATCH 049/707] PCI: hv: Optimize hv_get_dom_num() by using
 find_and_set_bit()

The function traverses bitmap with for_each_clear_bit() just to allocate
a bit atomically. Simplify it by using dedicated find_and_set_bit().

Signed-off-by: Yury Norov <yury.norov@gmail.com>
Reviewed-by: Michael Kelley <mhklinux@outlook.com>
Acked-by: Wei Liu <wei.liu@kernel.org>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/controller/pci-hyperv.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index 30c7dfeccb16f5..033b1fb7f4eb44 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -3605,12 +3605,9 @@ static u16 hv_get_dom_num(u16 dom)
 	if (test_and_set_bit(dom, hvpci_dom_map) == 0)
 		return dom;
 
-	for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) {
-		if (test_and_set_bit(i, hvpci_dom_map) == 0)
-			return i;
-	}
+	i = find_and_set_bit(hvpci_dom_map, HVPCI_DOM_MAP_SIZE);
 
-	return HVPCI_DOM_INVALID;
+	return i < HVPCI_DOM_MAP_SIZE ? i : HVPCI_DOM_INVALID;
 }
 
 /**

From 57dd83bdbe3c0b3a0e83b339cd777568a179e329 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 11 Dec 2023 18:27:29 -0800
Subject: [PATCH 050/707] scsi: core: optimize scsi_evt_emit() by using an
 atomic iterator

A plain loop in scsi_evt_thread() opencodes optimized atomic bit traversing
macro. Simplify it by using the dedicated iterator.

CC: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 drivers/scsi/scsi_lib.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index cf3864f7209309..a4c5c9b4bfc94e 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -2494,14 +2494,13 @@ static void scsi_evt_emit(struct scsi_device *sdev, struct scsi_event *evt)
 void scsi_evt_thread(struct work_struct *work)
 {
 	struct scsi_device *sdev;
-	enum scsi_device_event evt_type;
+	enum scsi_device_event evt_type = SDEV_EVT_FIRST;
 	LIST_HEAD(event_list);
 
 	sdev = container_of(work, struct scsi_device, event_work);
 
-	for (evt_type = SDEV_EVT_FIRST; evt_type <= SDEV_EVT_LAST; evt_type++)
-		if (test_and_clear_bit(evt_type, sdev->pending_events))
-			sdev_evt_send_simple(sdev, evt_type, GFP_KERNEL);
+	for_each_test_and_clear_bit_from(evt_type, sdev->pending_events, SDEV_EVT_LAST + 1)
+		sdev_evt_send_simple(sdev, evt_type, GFP_KERNEL);
 
 	while (1) {
 		struct scsi_event *evt;

From b0bfc29429fd4d989b0d7e6901c60e453d888e45 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 11 Dec 2023 18:27:30 -0800
Subject: [PATCH 051/707] scsi: mpi3mr: optimize the driver by using
 find_and_set_bit()

mpi3mr_dev_rmhs_send_tm() and mpi3mr_send_event_ack() opencode
find_and_set_bit(). Simplify them by using dedicated function.

CC: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 drivers/scsi/mpi3mr/mpi3mr_os.c | 21 ++++++---------------
 1 file changed, 6 insertions(+), 15 deletions(-)

diff --git a/drivers/scsi/mpi3mr/mpi3mr_os.c b/drivers/scsi/mpi3mr/mpi3mr_os.c
index 040031eb0c12d4..11139a2008fdac 100644
--- a/drivers/scsi/mpi3mr/mpi3mr_os.c
+++ b/drivers/scsi/mpi3mr/mpi3mr_os.c
@@ -2276,13 +2276,9 @@ static void mpi3mr_dev_rmhs_send_tm(struct mpi3mr_ioc *mrioc, u16 handle,
 	if (drv_cmd)
 		goto issue_cmd;
 	do {
-		cmd_idx = find_first_zero_bit(mrioc->devrem_bitmap,
-		    MPI3MR_NUM_DEVRMCMD);
-		if (cmd_idx < MPI3MR_NUM_DEVRMCMD) {
-			if (!test_and_set_bit(cmd_idx, mrioc->devrem_bitmap))
-				break;
-			cmd_idx = MPI3MR_NUM_DEVRMCMD;
-		}
+		cmd_idx = find_and_set_bit(mrioc->devrem_bitmap, MPI3MR_NUM_DEVRMCMD);
+		if (cmd_idx < MPI3MR_NUM_DEVRMCMD)
+			break;
 	} while (retrycount--);
 
 	if (cmd_idx >= MPI3MR_NUM_DEVRMCMD) {
@@ -2417,14 +2413,9 @@ static void mpi3mr_send_event_ack(struct mpi3mr_ioc *mrioc, u8 event,
 	    "sending event ack in the top half for event(0x%02x), event_ctx(0x%08x)\n",
 	    event, event_ctx);
 	do {
-		cmd_idx = find_first_zero_bit(mrioc->evtack_cmds_bitmap,
-		    MPI3MR_NUM_EVTACKCMD);
-		if (cmd_idx < MPI3MR_NUM_EVTACKCMD) {
-			if (!test_and_set_bit(cmd_idx,
-			    mrioc->evtack_cmds_bitmap))
-				break;
-			cmd_idx = MPI3MR_NUM_EVTACKCMD;
-		}
+		cmd_idx = find_and_set_bit(mrioc->evtack_cmds_bitmap, MPI3MR_NUM_EVTACKCMD);
+		if (cmd_idx < MPI3MR_NUM_EVTACKCMD)
+			break;
 	} while (retrycount--);
 
 	if (cmd_idx >= MPI3MR_NUM_EVTACKCMD) {

From 2cedc5c4cbed533323b2de3147dcae85d1b656f6 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 11 Dec 2023 18:27:31 -0800
Subject: [PATCH 052/707] scsi: qedi: optimize qedi_get_task_idx() by using
 find_and_set_bit()

qedi_get_task_idx() opencodes find_and_set_bit(). Simplify it and make the
whole function a simiple almost one-liner.

CC: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 drivers/scsi/qedi/qedi_main.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/drivers/scsi/qedi/qedi_main.c b/drivers/scsi/qedi/qedi_main.c
index cd0180b1f5b9da..2f940c6898ef3d 100644
--- a/drivers/scsi/qedi/qedi_main.c
+++ b/drivers/scsi/qedi/qedi_main.c
@@ -1824,20 +1824,13 @@ int qedi_get_task_idx(struct qedi_ctx *qedi)
 {
 	s16 tmp_idx;
 
-again:
-	tmp_idx = find_first_zero_bit(qedi->task_idx_map,
-				      MAX_ISCSI_TASK_ENTRIES);
+	tmp_idx = find_and_set_bit(qedi->task_idx_map, MAX_ISCSI_TASK_ENTRIES);
 
 	if (tmp_idx >= MAX_ISCSI_TASK_ENTRIES) {
 		QEDI_ERR(&qedi->dbg_ctx, "FW task context pool is full.\n");
 		tmp_idx = -1;
-		goto err_idx;
 	}
 
-	if (test_and_set_bit(tmp_idx, qedi->task_idx_map))
-		goto again;
-
-err_idx:
 	return tmp_idx;
 }
 

From 1e9e099525e5fea93c761c289a8f30542b306da4 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 11 Dec 2023 18:27:32 -0800
Subject: [PATCH 053/707] powerpc: optimize arch code by using atomic
 find_bit() API

Use find_and_{set,clear}_bit() where appropriate and simplify the logic.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 arch/powerpc/mm/book3s32/mmu_context.c     | 10 ++---
 arch/powerpc/platforms/pasemi/dma_lib.c    | 45 +++++-----------------
 arch/powerpc/platforms/powernv/pci-sriov.c | 12 ++----
 3 files changed, 17 insertions(+), 50 deletions(-)

diff --git a/arch/powerpc/mm/book3s32/mmu_context.c b/arch/powerpc/mm/book3s32/mmu_context.c
index 1922f9a6b05850..7db19f173c2ed6 100644
--- a/arch/powerpc/mm/book3s32/mmu_context.c
+++ b/arch/powerpc/mm/book3s32/mmu_context.c
@@ -50,13 +50,11 @@ static unsigned long context_map[LAST_CONTEXT / BITS_PER_LONG + 1];
 
 unsigned long __init_new_context(void)
 {
-	unsigned long ctx = next_mmu_context;
+	unsigned long ctx;
 
-	while (test_and_set_bit(ctx, context_map)) {
-		ctx = find_next_zero_bit(context_map, LAST_CONTEXT+1, ctx);
-		if (ctx > LAST_CONTEXT)
-			ctx = 0;
-	}
+	ctx = find_and_set_next_bit(context_map, LAST_CONTEXT + 1, next_mmu_context);
+	if (ctx > LAST_CONTEXT)
+		ctx = 0;
 	next_mmu_context = (ctx + 1) & LAST_CONTEXT;
 
 	return ctx;
diff --git a/arch/powerpc/platforms/pasemi/dma_lib.c b/arch/powerpc/platforms/pasemi/dma_lib.c
index 1be1f18f6f0982..906dabee013249 100644
--- a/arch/powerpc/platforms/pasemi/dma_lib.c
+++ b/arch/powerpc/platforms/pasemi/dma_lib.c
@@ -118,14 +118,9 @@ static int pasemi_alloc_tx_chan(enum pasemi_dmachan_type type)
 		limit = MAX_TXCH;
 		break;
 	}
-retry:
-	bit = find_next_bit(txch_free, MAX_TXCH, start);
-	if (bit >= limit)
-		return -ENOSPC;
-	if (!test_and_clear_bit(bit, txch_free))
-		goto retry;
-
-	return bit;
+
+	bit = find_and_clear_next_bit(txch_free, MAX_TXCH, start);
+	return bit < limit ? bit : -ENOSPC;
 }
 
 static void pasemi_free_tx_chan(int chan)
@@ -136,15 +131,9 @@ static void pasemi_free_tx_chan(int chan)
 
 static int pasemi_alloc_rx_chan(void)
 {
-	int bit;
-retry:
-	bit = find_first_bit(rxch_free, MAX_RXCH);
-	if (bit >= MAX_TXCH)
-		return -ENOSPC;
-	if (!test_and_clear_bit(bit, rxch_free))
-		goto retry;
-
-	return bit;
+	int bit = find_and_clear_bit(rxch_free, MAX_RXCH);
+
+	return bit < MAX_TXCH ? bit : -ENOSPC;
 }
 
 static void pasemi_free_rx_chan(int chan)
@@ -374,16 +363,9 @@ EXPORT_SYMBOL(pasemi_dma_free_buf);
  */
 int pasemi_dma_alloc_flag(void)
 {
-	int bit;
+	int bit = find_and_clear_bit(flags_free, MAX_FLAGS);
 
-retry:
-	bit = find_first_bit(flags_free, MAX_FLAGS);
-	if (bit >= MAX_FLAGS)
-		return -ENOSPC;
-	if (!test_and_clear_bit(bit, flags_free))
-		goto retry;
-
-	return bit;
+	return bit < MAX_FLAGS ? bit : -ENOSPC;
 }
 EXPORT_SYMBOL(pasemi_dma_alloc_flag);
 
@@ -439,16 +421,9 @@ EXPORT_SYMBOL(pasemi_dma_clear_flag);
  */
 int pasemi_dma_alloc_fun(void)
 {
-	int bit;
-
-retry:
-	bit = find_first_bit(fun_free, MAX_FLAGS);
-	if (bit >= MAX_FLAGS)
-		return -ENOSPC;
-	if (!test_and_clear_bit(bit, fun_free))
-		goto retry;
+	int bit = find_and_clear_bit(fun_free, MAX_FLAGS);
 
-	return bit;
+	return bit < MAX_FLAGS ? bit : -ENOSPC;
 }
 EXPORT_SYMBOL(pasemi_dma_alloc_fun);
 
diff --git a/arch/powerpc/platforms/powernv/pci-sriov.c b/arch/powerpc/platforms/powernv/pci-sriov.c
index 59882da3e74253..640e387e6d839c 100644
--- a/arch/powerpc/platforms/powernv/pci-sriov.c
+++ b/arch/powerpc/platforms/powernv/pci-sriov.c
@@ -397,18 +397,12 @@ static int64_t pnv_ioda_map_m64_single(struct pnv_phb *phb,
 
 static int pnv_pci_alloc_m64_bar(struct pnv_phb *phb, struct pnv_iov_data *iov)
 {
-	int win;
+	int win = find_and_set_bit(&phb->ioda.m64_bar_alloc, phb->ioda.m64_bar_idx + 1);
 
-	do {
-		win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
-				phb->ioda.m64_bar_idx + 1, 0);
-
-		if (win >= phb->ioda.m64_bar_idx + 1)
-			return -1;
-	} while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
+	if (win >= phb->ioda.m64_bar_idx + 1)
+		return -1;
 
 	set_bit(win, iov->used_m64_bar_mask);
-
 	return win;
 }
 

From 2ebbcd7bcb17f6695329416e82256ce87c362eaa Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 11 Dec 2023 18:27:33 -0800
Subject: [PATCH 054/707] iommu: optimize subsystem by using atomic find_bit()
 API

Simplify  __arm_smmu_alloc_bitmap() and msm_iommu_alloc_ctx() by using
a dedicated API, and make them nice one-liner wrappers.

While here, refactor msm_iommu_attach_dev() and msm_iommu_alloc_ctx()
so that error codes don't mismatch.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 drivers/iommu/arm/arm-smmu/arm-smmu.h | 10 ++--------
 drivers/iommu/msm_iommu.c             | 18 ++++--------------
 2 files changed, 6 insertions(+), 22 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h
index 703fd5817ec11f..004a4704ebf15c 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.h
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h
@@ -453,15 +453,9 @@ struct arm_smmu_impl {
 
 static inline int __arm_smmu_alloc_bitmap(unsigned long *map, int start, int end)
 {
-	int idx;
+	int idx = find_and_set_next_bit(map, end, start);
 
-	do {
-		idx = find_next_zero_bit(map, end, start);
-		if (idx == end)
-			return -ENOSPC;
-	} while (test_and_set_bit(idx, map));
-
-	return idx;
+	return idx < end ? idx : -ENOSPC;
 }
 
 static inline void __iomem *arm_smmu_page(struct arm_smmu_device *smmu, int n)
diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c
index f86af9815d6f98..67124f4228b1f0 100644
--- a/drivers/iommu/msm_iommu.c
+++ b/drivers/iommu/msm_iommu.c
@@ -185,17 +185,9 @@ static const struct iommu_flush_ops msm_iommu_flush_ops = {
 	.tlb_add_page = __flush_iotlb_page,
 };
 
-static int msm_iommu_alloc_ctx(unsigned long *map, int start, int end)
+static int msm_iommu_alloc_ctx(struct msm_iommu_dev *iommu)
 {
-	int idx;
-
-	do {
-		idx = find_next_zero_bit(map, end, start);
-		if (idx == end)
-			return -ENOSPC;
-	} while (test_and_set_bit(idx, map));
-
-	return idx;
+	return find_and_set_bit(iommu->context_map, iommu->ncb);
 }
 
 static void msm_iommu_free_ctx(unsigned long *map, int idx)
@@ -418,10 +410,8 @@ static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev)
 					ret = -EEXIST;
 					goto fail;
 				}
-				master->num =
-					msm_iommu_alloc_ctx(iommu->context_map,
-							    0, iommu->ncb);
-				if (IS_ERR_VALUE(master->num)) {
+				master->num = msm_iommu_alloc_ctx(iommu);
+				if (master->num >= iommu->ncb) {
 					ret = -ENODEV;
 					goto fail;
 				}

From 4678bace092ce861e09a6ab4dd3f0e043bdb49a5 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 11 Dec 2023 18:27:34 -0800
Subject: [PATCH 055/707] media: radio-shark: optimize the driver by using
 atomic find_bit() API

Despite that it's only 2- or 3-bit maps, convert for-loop followed by
test_bit() to for_each_test_and_clear_bit() as it makes the code cleaner.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
Acked-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 drivers/media/radio/radio-shark.c  | 5 +----
 drivers/media/radio/radio-shark2.c | 5 +----
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/drivers/media/radio/radio-shark.c b/drivers/media/radio/radio-shark.c
index 127a3be0e0f070..0c50b3a9623e88 100644
--- a/drivers/media/radio/radio-shark.c
+++ b/drivers/media/radio/radio-shark.c
@@ -158,10 +158,7 @@ static void shark_led_work(struct work_struct *work)
 		container_of(work, struct shark_device, led_work);
 	int i, res, brightness, actual_len;
 
-	for (i = 0; i < 3; i++) {
-		if (!test_and_clear_bit(i, &shark->brightness_new))
-			continue;
-
+	for_each_test_and_clear_bit(i, &shark->brightness_new, 3) {
 		brightness = atomic_read(&shark->brightness[i]);
 		memset(shark->transfer_buffer, 0, TB_LEN);
 		if (i != RED_LED) {
diff --git a/drivers/media/radio/radio-shark2.c b/drivers/media/radio/radio-shark2.c
index f1c5c0a6a335cb..d9ef241e177806 100644
--- a/drivers/media/radio/radio-shark2.c
+++ b/drivers/media/radio/radio-shark2.c
@@ -145,10 +145,7 @@ static void shark_led_work(struct work_struct *work)
 		container_of(work, struct shark_device, led_work);
 	int i, res, brightness, actual_len;
 
-	for (i = 0; i < 2; i++) {
-		if (!test_and_clear_bit(i, &shark->brightness_new))
-			continue;
-
+	for_each_test_and_clear_bit(i, &shark->brightness_new, 2) {
 		brightness = atomic_read(&shark->brightness[i]);
 		memset(shark->transfer_buffer, 0, TB_LEN);
 		shark->transfer_buffer[0] = 0x83 + i;

From 7b39dbf951db07db3001f6113db0765c9598c00a Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 11 Dec 2023 18:27:35 -0800
Subject: [PATCH 056/707] sfc: optimize the driver by using atomic find_bit()
 API

SFC code traverses rps_slot_map and rxq_retry_mask bit by bit. Simplify
it by using dedicated atomic find_bit() functions, as they skip already
clear bits.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
Reviewed-by: Edward Cree <ecree.xilinx@gmail.com>
---
 drivers/net/ethernet/sfc/rx_common.c         |  4 +---
 drivers/net/ethernet/sfc/siena/rx_common.c   |  4 +---
 drivers/net/ethernet/sfc/siena/siena_sriov.c | 14 ++++++--------
 3 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/sfc/rx_common.c b/drivers/net/ethernet/sfc/rx_common.c
index d2f35ee15effeb..0112968b3fe7c6 100644
--- a/drivers/net/ethernet/sfc/rx_common.c
+++ b/drivers/net/ethernet/sfc/rx_common.c
@@ -950,9 +950,7 @@ int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb,
 	int rc;
 
 	/* find a free slot */
-	for (slot_idx = 0; slot_idx < EFX_RPS_MAX_IN_FLIGHT; slot_idx++)
-		if (!test_and_set_bit(slot_idx, &efx->rps_slot_map))
-			break;
+	slot_idx = find_and_set_bit(&efx->rps_slot_map, EFX_RPS_MAX_IN_FLIGHT);
 	if (slot_idx >= EFX_RPS_MAX_IN_FLIGHT)
 		return -EBUSY;
 
diff --git a/drivers/net/ethernet/sfc/siena/rx_common.c b/drivers/net/ethernet/sfc/siena/rx_common.c
index 4579f43484c367..160b16aa74862b 100644
--- a/drivers/net/ethernet/sfc/siena/rx_common.c
+++ b/drivers/net/ethernet/sfc/siena/rx_common.c
@@ -958,9 +958,7 @@ int efx_siena_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb,
 	int rc;
 
 	/* find a free slot */
-	for (slot_idx = 0; slot_idx < EFX_RPS_MAX_IN_FLIGHT; slot_idx++)
-		if (!test_and_set_bit(slot_idx, &efx->rps_slot_map))
-			break;
+	slot_idx = find_and_set_bit(&efx->rps_slot_map, EFX_RPS_MAX_IN_FLIGHT);
 	if (slot_idx >= EFX_RPS_MAX_IN_FLIGHT)
 		return -EBUSY;
 
diff --git a/drivers/net/ethernet/sfc/siena/siena_sriov.c b/drivers/net/ethernet/sfc/siena/siena_sriov.c
index 8353c15dc23336..554b799288b8e2 100644
--- a/drivers/net/ethernet/sfc/siena/siena_sriov.c
+++ b/drivers/net/ethernet/sfc/siena/siena_sriov.c
@@ -722,14 +722,12 @@ static int efx_vfdi_fini_all_queues(struct siena_vf *vf)
 					     efx_vfdi_flush_wake(vf),
 					     timeout);
 		rxqs_count = 0;
-		for (index = 0; index < count; ++index) {
-			if (test_and_clear_bit(index, vf->rxq_retry_mask)) {
-				atomic_dec(&vf->rxq_retry_count);
-				MCDI_SET_ARRAY_DWORD(
-					inbuf, FLUSH_RX_QUEUES_IN_QID_OFST,
-					rxqs_count, vf_offset + index);
-				rxqs_count++;
-			}
+		for_each_test_and_clear_bit(index, vf->rxq_retry_mask, count) {
+			atomic_dec(&vf->rxq_retry_count);
+			MCDI_SET_ARRAY_DWORD(
+				inbuf, FLUSH_RX_QUEUES_IN_QID_OFST,
+				rxqs_count, vf_offset + index);
+			rxqs_count++;
 		}
 	}
 

From 468cf2a9e8267d38c3756ea6576db439d9e219c1 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 11 Dec 2023 18:27:36 -0800
Subject: [PATCH 057/707] tty: nozomi: optimize interrupt_handler()

In the exit path of interrupt_handler(), dc->flip map is traversed bit
by bit to find and clear set bits and call tty_flip_buffer_push() for
corresponding ports.

Simplify it by using for_each_test_and_clear_bit(), as it skips already
clear bits.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 drivers/tty/nozomi.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/tty/nozomi.c b/drivers/tty/nozomi.c
index 02cd40147b3a80..de0503247391a5 100644
--- a/drivers/tty/nozomi.c
+++ b/drivers/tty/nozomi.c
@@ -1220,9 +1220,8 @@ static irqreturn_t interrupt_handler(int irq, void *dev_id)
 exit_handler:
 	spin_unlock(&dc->spin_mutex);
 
-	for (a = 0; a < NOZOMI_MAX_PORTS; a++)
-		if (test_and_clear_bit(a, &dc->flip))
-			tty_flip_buffer_push(&dc->port[a].port);
+	for_each_test_and_clear_bit(a, &dc->flip, NOZOMI_MAX_PORTS)
+		tty_flip_buffer_push(&dc->port[a].port);
 
 	return IRQ_HANDLED;
 none:

From d744113d7dacab078a5272378ee6a2c478d50cc5 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 11 Dec 2023 18:27:37 -0800
Subject: [PATCH 058/707] usb: cdc-acm: optimize acm_softint()

acm_softint() uses for-loop to traverse urbs_in_error_delay bitmap
bit by bit to find and clear set bits.

Simplify it by using for_each_test_and_clear_bit(), because it doesn't
test already clear bits.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
Acked-by: Oliver Neukum <oneukum@suse.com>
---
 drivers/usb/class/cdc-acm.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c
index a1f4e1ead97ff4..8664b63050b0c7 100644
--- a/drivers/usb/class/cdc-acm.c
+++ b/drivers/usb/class/cdc-acm.c
@@ -613,9 +613,8 @@ static void acm_softint(struct work_struct *work)
 	}
 
 	if (test_and_clear_bit(ACM_ERROR_DELAY, &acm->flags)) {
-		for (i = 0; i < acm->rx_buflimit; i++)
-			if (test_and_clear_bit(i, &acm->urbs_in_error_delay))
-				acm_submit_read_urb(acm, i, GFP_KERNEL);
+		for_each_test_and_clear_bit(i, &acm->urbs_in_error_delay, acm->rx_buflimit)
+			acm_submit_read_urb(acm, i, GFP_KERNEL);
 	}
 
 	if (test_and_clear_bit(EVENT_TTY_WAKEUP, &acm->flags))

From f3687f2f7db4f0f464ab5cb43595c0964655eded Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 11 Dec 2023 18:27:38 -0800
Subject: [PATCH 059/707] block: null_blk: replace get_tag() with a generic
 find_and_set_bit_lock()

get_tag() opencodes find_and_set_bit(). Simplify the code by getting
rid of it.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
Reviewed-by: Chengming Zhou <zhouchengming@bytedance.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 drivers/block/null_blk/main.c | 41 +++++++++++------------------------
 1 file changed, 13 insertions(+), 28 deletions(-)

diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index 3021d58ca51c1f..671dbb9ab928af 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -760,19 +760,6 @@ static void put_tag(struct nullb_queue *nq, unsigned int tag)
 		wake_up(&nq->wait);
 }
 
-static unsigned int get_tag(struct nullb_queue *nq)
-{
-	unsigned int tag;
-
-	do {
-		tag = find_first_zero_bit(nq->tag_map, nq->queue_depth);
-		if (tag >= nq->queue_depth)
-			return -1U;
-	} while (test_and_set_bit_lock(tag, nq->tag_map));
-
-	return tag;
-}
-
 static void free_cmd(struct nullb_cmd *cmd)
 {
 	put_tag(cmd->nq, cmd->tag);
@@ -782,24 +769,22 @@ static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer);
 
 static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq)
 {
+	unsigned int tag = find_and_set_bit_lock(nq->tag_map, nq->queue_depth);
 	struct nullb_cmd *cmd;
-	unsigned int tag;
-
-	tag = get_tag(nq);
-	if (tag != -1U) {
-		cmd = &nq->cmds[tag];
-		cmd->tag = tag;
-		cmd->error = BLK_STS_OK;
-		cmd->nq = nq;
-		if (nq->dev->irqmode == NULL_IRQ_TIMER) {
-			hrtimer_init(&cmd->timer, CLOCK_MONOTONIC,
-				     HRTIMER_MODE_REL);
-			cmd->timer.function = null_cmd_timer_expired;
-		}
-		return cmd;
+
+	if (tag >= nq->queue_depth)
+		return NULL;
+
+	cmd = &nq->cmds[tag];
+	cmd->tag = tag;
+	cmd->error = BLK_STS_OK;
+	cmd->nq = nq;
+	if (nq->dev->irqmode == NULL_IRQ_TIMER) {
+		hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+		cmd->timer.function = null_cmd_timer_expired;
 	}
 
-	return NULL;
+	return cmd;
 }
 
 static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, struct bio *bio)

From fe80b801ee439640f9a1b9df80b7ae0bc4d4bfb8 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 11 Dec 2023 18:27:39 -0800
Subject: [PATCH 060/707] RDMA/rtrs: optimize __rtrs_get_permit() by using
 find_and_set_bit_lock()

The function opencodes find_and_set_bit_lock() with a while-loop polling
on test_and_set_bit_lock(). Use the dedicated function instead.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 drivers/infiniband/ulp/rtrs/rtrs-clt.c | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c
index 07261523c55473..2f3b0ad42e8aa7 100644
--- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c
+++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c
@@ -72,18 +72,9 @@ __rtrs_get_permit(struct rtrs_clt_sess *clt, enum rtrs_clt_con_type con_type)
 	struct rtrs_permit *permit;
 	int bit;
 
-	/*
-	 * Adapted from null_blk get_tag(). Callers from different cpus may
-	 * grab the same bit, since find_first_zero_bit is not atomic.
-	 * But then the test_and_set_bit_lock will fail for all the
-	 * callers but one, so that they will loop again.
-	 * This way an explicit spinlock is not required.
-	 */
-	do {
-		bit = find_first_zero_bit(clt->permits_map, max_depth);
-		if (bit >= max_depth)
-			return NULL;
-	} while (test_and_set_bit_lock(bit, clt->permits_map));
+	bit = find_and_set_bit_lock(clt->permits_map, max_depth);
+	if (bit >= max_depth)
+		return NULL;
 
 	permit = get_permit(clt, bit);
 	WARN_ON(permit->mem_id != bit);

From fea0ea785cef13d881cdc3c5136d98851f14335f Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 11 Dec 2023 18:27:40 -0800
Subject: [PATCH 061/707] mISDN: optimize get_free_devid()

get_free_devid() traverses each bit in device_ids in an open-coded loop.
Simplify it by using the dedicated find_and_set_bit().

It makes the whole function a nice one-liner, and because MAX_DEVICE_ID
is a small constant-time value (63), on 64-bit platforms find_and_set_bit()
call will be optimized to:

	ffs();
	test_and_set_bit().

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 drivers/isdn/mISDN/core.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/drivers/isdn/mISDN/core.c b/drivers/isdn/mISDN/core.c
index ab8513a7acd52d..c829c4eac0e23e 100644
--- a/drivers/isdn/mISDN/core.c
+++ b/drivers/isdn/mISDN/core.c
@@ -197,14 +197,9 @@ get_mdevice_count(void)
 static int
 get_free_devid(void)
 {
-	u_int	i;
+	int i = find_and_set_bit((u_long *)&device_ids, MAX_DEVICE_ID + 1);
 
-	for (i = 0; i <= MAX_DEVICE_ID; i++)
-		if (!test_and_set_bit(i, (u_long *)&device_ids))
-			break;
-	if (i > MAX_DEVICE_ID)
-		return -EBUSY;
-	return i;
+	return i <= MAX_DEVICE_ID ? i : -EBUSY;
 }
 
 int

From fbde99eaa647bc93ca3fbf78f4cec929beabf0f7 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 11 Dec 2023 18:27:41 -0800
Subject: [PATCH 062/707] media: em28xx: cx231xx: optimize drivers by using
 find_and_set_bit()

Functions in the media/usb drivers opencode find_and_set_bit(). Simplify
them by using the function.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
Acked-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 drivers/media/usb/cx231xx/cx231xx-cards.c | 16 ++++------
 drivers/media/usb/em28xx/em28xx-cards.c   | 37 +++++++++--------------
 2 files changed, 21 insertions(+), 32 deletions(-)

diff --git a/drivers/media/usb/cx231xx/cx231xx-cards.c b/drivers/media/usb/cx231xx/cx231xx-cards.c
index 92efe6c1f47bae..b314603932d772 100644
--- a/drivers/media/usb/cx231xx/cx231xx-cards.c
+++ b/drivers/media/usb/cx231xx/cx231xx-cards.c
@@ -1708,16 +1708,12 @@ static int cx231xx_usb_probe(struct usb_interface *interface,
 		return -ENODEV;
 
 	/* Check to see next free device and mark as used */
-	do {
-		nr = find_first_zero_bit(&cx231xx_devused, CX231XX_MAXBOARDS);
-		if (nr >= CX231XX_MAXBOARDS) {
-			/* No free device slots */
-			dev_err(d,
-				"Supports only %i devices.\n",
-				CX231XX_MAXBOARDS);
-			return -ENOMEM;
-		}
-	} while (test_and_set_bit(nr, &cx231xx_devused));
+	nr = find_and_set_bit(&cx231xx_devused, CX231XX_MAXBOARDS);
+	if (nr >= CX231XX_MAXBOARDS) {
+		/* No free device slots */
+		dev_err(d, "Supports only %i devices.\n", CX231XX_MAXBOARDS);
+		return -ENOMEM;
+	}
 
 	udev = usb_get_dev(interface_to_usbdev(interface));
 
diff --git a/drivers/media/usb/em28xx/em28xx-cards.c b/drivers/media/usb/em28xx/em28xx-cards.c
index 4d037c92af7c58..af4809fe74a857 100644
--- a/drivers/media/usb/em28xx/em28xx-cards.c
+++ b/drivers/media/usb/em28xx/em28xx-cards.c
@@ -3684,17 +3684,14 @@ static int em28xx_duplicate_dev(struct em28xx *dev)
 		return -ENOMEM;
 	}
 	/* Check to see next free device and mark as used */
-	do {
-		nr = find_first_zero_bit(em28xx_devused, EM28XX_MAXBOARDS);
-		if (nr >= EM28XX_MAXBOARDS) {
-			/* No free device slots */
-			dev_warn(&dev->intf->dev, ": Supports only %i em28xx boards.\n",
-				 EM28XX_MAXBOARDS);
-			kfree(sec_dev);
-			dev->dev_next = NULL;
-			return -ENOMEM;
-		}
-	} while (test_and_set_bit(nr, em28xx_devused));
+	nr = find_and_set_bit(em28xx_devused, EM28XX_MAXBOARDS);
+	if (nr >= EM28XX_MAXBOARDS) {
+		/* No free device slots */
+		dev_warn(&dev->intf->dev, ": Supports only %i em28xx boards.\n", EM28XX_MAXBOARDS);
+		kfree(sec_dev);
+		dev->dev_next = NULL;
+		return -ENOMEM;
+	}
 	sec_dev->devno = nr;
 	snprintf(sec_dev->name, 28, "em28xx #%d", nr);
 	sec_dev->dev_next = NULL;
@@ -3827,17 +3824,13 @@ static int em28xx_usb_probe(struct usb_interface *intf,
 	udev = usb_get_dev(interface_to_usbdev(intf));
 
 	/* Check to see next free device and mark as used */
-	do {
-		nr = find_first_zero_bit(em28xx_devused, EM28XX_MAXBOARDS);
-		if (nr >= EM28XX_MAXBOARDS) {
-			/* No free device slots */
-			dev_err(&intf->dev,
-				"Driver supports up to %i em28xx boards.\n",
-			       EM28XX_MAXBOARDS);
-			retval = -ENOMEM;
-			goto err_no_slot;
-		}
-	} while (test_and_set_bit(nr, em28xx_devused));
+	nr = find_and_set_bit(em28xx_devused, EM28XX_MAXBOARDS);
+	if (nr >= EM28XX_MAXBOARDS) {
+		/* No free device slots */
+		dev_err(&intf->dev, "Driver supports up to %i em28xx boards.\n", EM28XX_MAXBOARDS);
+		retval = -ENOMEM;
+		goto err_no_slot;
+	}
 
 	/* Don't register audio interfaces */
 	if (intf->altsetting[0].desc.bInterfaceClass == USB_CLASS_AUDIO) {

From 35a11cd220c710b8c42203cf07aea6f6f873f6a5 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 11 Dec 2023 18:27:42 -0800
Subject: [PATCH 063/707] ethernet: rocker: optimize
 ofdpa_port_internal_vlan_id_get()

Optimize ofdpa_port_internal_vlan_id_get() by using find_and_set_bit(),
instead of polling every bit from bitmap in a for-loop.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 drivers/net/ethernet/rocker/rocker_ofdpa.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/rocker/rocker_ofdpa.c b/drivers/net/ethernet/rocker/rocker_ofdpa.c
index 826990459fa443..449be8af7ffce6 100644
--- a/drivers/net/ethernet/rocker/rocker_ofdpa.c
+++ b/drivers/net/ethernet/rocker/rocker_ofdpa.c
@@ -2249,14 +2249,11 @@ static __be16 ofdpa_port_internal_vlan_id_get(struct ofdpa_port *ofdpa_port,
 	found = entry;
 	hash_add(ofdpa->internal_vlan_tbl, &found->entry, found->ifindex);
 
-	for (i = 0; i < OFDPA_N_INTERNAL_VLANS; i++) {
-		if (test_and_set_bit(i, ofdpa->internal_vlan_bitmap))
-			continue;
+	i = find_and_set_bit(ofdpa->internal_vlan_bitmap, OFDPA_N_INTERNAL_VLANS);
+	if (i < OFDPA_N_INTERNAL_VLANS)
 		found->vlan_id = htons(OFDPA_INTERNAL_VLAN_ID_BASE + i);
-		goto found;
-	}
-
-	netdev_err(ofdpa_port->dev, "Out of internal VLAN IDs\n");
+	else
+		netdev_err(ofdpa_port->dev, "Out of internal VLAN IDs\n");
 
 found:
 	found->ref_count++;

From e63a961be48f2855a76d034e23471995ad6b9972 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 11 Dec 2023 18:27:43 -0800
Subject: [PATCH 064/707] serial: sc12is7xx: optimize sc16is7xx_alloc_line()

Instead of polling every bit in sc16is7xx_lines, use a dedicated
find_and_set_bit(), and make the function a simple one-liner.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 drivers/tty/serial/sc16is7xx.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/drivers/tty/serial/sc16is7xx.c b/drivers/tty/serial/sc16is7xx.c
index db2bb1c0d36c26..6a463988d5e002 100644
--- a/drivers/tty/serial/sc16is7xx.c
+++ b/drivers/tty/serial/sc16is7xx.c
@@ -427,15 +427,9 @@ static void sc16is7xx_port_update(struct uart_port *port, u8 reg,
 
 static int sc16is7xx_alloc_line(void)
 {
-	int i;
-
 	BUILD_BUG_ON(SC16IS7XX_MAX_DEVS > BITS_PER_LONG);
 
-	for (i = 0; i < SC16IS7XX_MAX_DEVS; i++)
-		if (!test_and_set_bit(i, &sc16is7xx_lines))
-			break;
-
-	return i;
+	return find_and_set_bit(&sc16is7xx_lines, SC16IS7XX_MAX_DEVS);
 }
 
 static void sc16is7xx_power(struct uart_port *port, int on)

From 137ce860bc80dd529ab6b04dde2a6e761b32b3ec Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 11 Dec 2023 18:27:44 -0800
Subject: [PATCH 065/707] bluetooth: optimize cmtp_alloc_block_id()

Instead of polling every bit in blockids, use a dedicated
find_and_set_bit(), and make the function a simple one-liner.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 net/bluetooth/cmtp/core.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c
index 90d130588a3e51..b1330acbbff366 100644
--- a/net/bluetooth/cmtp/core.c
+++ b/net/bluetooth/cmtp/core.c
@@ -88,15 +88,9 @@ static void __cmtp_copy_session(struct cmtp_session *session, struct cmtp_connin
 
 static inline int cmtp_alloc_block_id(struct cmtp_session *session)
 {
-	int i, id = -1;
+	int id = find_and_set_bit(&session->blockids, 16);
 
-	for (i = 0; i < 16; i++)
-		if (!test_and_set_bit(i, &session->blockids)) {
-			id = i;
-			break;
-		}
-
-	return id;
+	return id < 16 ? id : -1;
 }
 
 static inline void cmtp_free_block_id(struct cmtp_session *session, int id)

From 668284d460b55af73b9a001aad57b25eb1f674b7 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 11 Dec 2023 18:27:45 -0800
Subject: [PATCH 066/707] net: smc: optimize smc_wr_tx_get_free_slot_index()

Simplify the function by using find_and_set_bit() and make it a simple
almost one-liner.

While here, drop explicit initialization of *idx, because it's already
initialized by the caller in case of ENOLINK, or set properly with
->wr_tx_mask, if nothing is found, in case of EBUSY.

CC: Tony Lu <tonylu@linux.alibaba.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
Reviewed-by: Alexandra Winter <wintera@linux.ibm.com>
Reviewed-by: Wen Gu <guwen@linux.alibaba.com>
---
 net/smc/smc_wr.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index 0021065a600a03..b6f0cfc527882f 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -170,15 +170,11 @@ void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
 
 static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx)
 {
-	*idx = link->wr_tx_cnt;
 	if (!smc_link_sendable(link))
 		return -ENOLINK;
-	for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) {
-		if (!test_and_set_bit(*idx, link->wr_tx_mask))
-			return 0;
-	}
-	*idx = link->wr_tx_cnt;
-	return -EBUSY;
+
+	*idx = find_and_set_bit(link->wr_tx_mask, link->wr_tx_cnt);
+	return *idx < link->wr_tx_cnt ? 0 : -EBUSY;
 }
 
 /**

From 78cdf2d0f4565c5866eb1ec2105397835003f15d Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 11 Dec 2023 18:27:46 -0800
Subject: [PATCH 067/707] ALSA: use atomic find_bit() functions where
 applicable

ALSA code tests each bit in bitmaps in a for() loop. Switch it to
using dedicated atomic find_bit() API.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
Acked-by: Takashi Iwai <tiwai@suse.de>
---
 sound/pci/hda/hda_codec.c |  7 +++----
 sound/usb/caiaq/audio.c   | 13 +++++--------
 2 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/sound/pci/hda/hda_codec.c b/sound/pci/hda/hda_codec.c
index 01718b1fc9a7f8..29254005f3941a 100644
--- a/sound/pci/hda/hda_codec.c
+++ b/sound/pci/hda/hda_codec.c
@@ -3275,10 +3275,9 @@ static int get_empty_pcm_device(struct hda_bus *bus, unsigned int type)
 
 #ifdef CONFIG_SND_DYNAMIC_MINORS
 	/* non-fixed slots starting from 10 */
-	for (i = 10; i < 32; i++) {
-		if (!test_and_set_bit(i, bus->pcm_dev_bits))
-			return i;
-	}
+	i = find_and_set_next_bit(bus->pcm_dev_bits, 32, 10);
+	if (i < 32)
+		return i;
 #endif
 
 	dev_warn(bus->card->dev, "Too many %s devices\n",
diff --git a/sound/usb/caiaq/audio.c b/sound/usb/caiaq/audio.c
index 4981753652a7fe..74dfcf32b439d2 100644
--- a/sound/usb/caiaq/audio.c
+++ b/sound/usb/caiaq/audio.c
@@ -610,7 +610,7 @@ static void read_completed(struct urb *urb)
 	struct snd_usb_caiaq_cb_info *info = urb->context;
 	struct snd_usb_caiaqdev *cdev;
 	struct device *dev;
-	struct urb *out = NULL;
+	struct urb *out;
 	int i, frame, len, send_it = 0, outframe = 0;
 	unsigned long flags;
 	size_t offset = 0;
@@ -625,17 +625,14 @@ static void read_completed(struct urb *urb)
 		return;
 
 	/* find an unused output urb that is unused */
-	for (i = 0; i < N_URBS; i++)
-		if (test_and_set_bit(i, &cdev->outurb_active_mask) == 0) {
-			out = cdev->data_urbs_out[i];
-			break;
-		}
-
-	if (!out) {
+	i = find_and_set_bit(&cdev->outurb_active_mask, N_URBS);
+	if (i >= N_URBS) {
 		dev_err(dev, "Unable to find an output urb to use\n");
 		goto requeue;
 	}
 
+	out = cdev->data_urbs_out[i];
+
 	/* read the recently received packet and send back one which has
 	 * the same layout */
 	for (frame = 0; frame < FRAMES_PER_URB; frame++) {

From 4d56bf2e0c2321894cdb654b96944710fbd62314 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 11 Dec 2023 18:27:47 -0800
Subject: [PATCH 068/707] m68k: optimize get_mmu_context()

get_mmu_context() opencodes atomic find_and_set_bit_wrap(). Simplify
it by using find_and_set_bit_wrap().

CC: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
Acked-by: Greg Ungerer <gerg@linux-m68k.org>
---
 arch/m68k/include/asm/mmu_context.h | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/arch/m68k/include/asm/mmu_context.h b/arch/m68k/include/asm/mmu_context.h
index 141bbdfad96019..0419ad87a1c122 100644
--- a/arch/m68k/include/asm/mmu_context.h
+++ b/arch/m68k/include/asm/mmu_context.h
@@ -35,12 +35,11 @@ static inline void get_mmu_context(struct mm_struct *mm)
 		atomic_inc(&nr_free_contexts);
 		steal_context();
 	}
-	ctx = next_mmu_context;
-	while (test_and_set_bit(ctx, context_map)) {
-		ctx = find_next_zero_bit(context_map, LAST_CONTEXT+1, ctx);
-		if (ctx > LAST_CONTEXT)
-			ctx = 0;
-	}
+
+	do {
+		ctx = find_and_set_bit_wrap(context_map, LAST_CONTEXT + 1, next_mmu_context);
+	} while (ctx > LAST_CONTEXT);
+
 	next_mmu_context = (ctx + 1) & LAST_CONTEXT;
 	mm->context = ctx;
 	context_mm[ctx] = mm;

From 18eda5a178066852986b0380f201295cefa582e7 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 11 Dec 2023 18:27:48 -0800
Subject: [PATCH 069/707] microblaze: optimize get_mmu_context()

Simplify get_mmu_context() by using find_and_set_bit_wrap().

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 arch/microblaze/include/asm/mmu_context_mm.h | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/arch/microblaze/include/asm/mmu_context_mm.h b/arch/microblaze/include/asm/mmu_context_mm.h
index c2c77f70845562..209c3a62353a99 100644
--- a/arch/microblaze/include/asm/mmu_context_mm.h
+++ b/arch/microblaze/include/asm/mmu_context_mm.h
@@ -82,12 +82,11 @@ static inline void get_mmu_context(struct mm_struct *mm)
 		return;
 	while (atomic_dec_if_positive(&nr_free_contexts) < 0)
 		steal_context();
-	ctx = next_mmu_context;
-	while (test_and_set_bit(ctx, context_map)) {
-		ctx = find_next_zero_bit(context_map, LAST_CONTEXT+1, ctx);
-		if (ctx > LAST_CONTEXT)
-			ctx = 0;
-	}
+
+	do {
+		ctx = find_and_set_bit_wrap(context_map, LAST_CONTEXT + 1, next_mmu_context);
+	} while (ctx > LAST_CONTEXT);
+
 	next_mmu_context = (ctx + 1) & LAST_CONTEXT;
 	mm->context = ctx;
 	context_mm[ctx] = mm;

From 5e95ee6fd52b06432da9636032ac6986112feec1 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 11 Dec 2023 18:27:49 -0800
Subject: [PATCH 070/707] sh: mach-x3proto: optimize ilsel_enable()

Simplify ilsel_enable() by using find_and_set_bit().

CC: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 arch/sh/boards/mach-x3proto/ilsel.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/sh/boards/mach-x3proto/ilsel.c b/arch/sh/boards/mach-x3proto/ilsel.c
index f0d5eb41521a49..7fadc479a80bf7 100644
--- a/arch/sh/boards/mach-x3proto/ilsel.c
+++ b/arch/sh/boards/mach-x3proto/ilsel.c
@@ -99,8 +99,8 @@ int ilsel_enable(ilsel_source_t set)
 	}
 
 	do {
-		bit = find_first_zero_bit(&ilsel_level_map, ILSEL_LEVELS);
-	} while (test_and_set_bit(bit, &ilsel_level_map));
+		bit = find_and_set_bit(&ilsel_level_map, ILSEL_LEVELS);
+	} while (bit >= ILSEL_LEVELS);
 
 	__ilsel_enable(set, bit);
 

From f655182f9e9edda559b41a1f8b3b9c944443694a Mon Sep 17 00:00:00 2001
From: Kartik <kkartik@nvidia.com>
Date: Wed, 20 Dec 2023 11:40:13 +0530
Subject: [PATCH 071/707] soc/tegra: fuse: Define tegra194_soc_attr_group for
 Tegra241

Tegra241 SoC data uses tegra194_soc_attr_group, which is only defined
if config CONFIG_ARCH_TEGRA_194_SOC or CONFIG_ARCH_TEGRA_234_SOC or
both are enabled. This causes a build failure if both of these configs
are disabled and CONFIG_ARCH_TEGRA_241_SOC is enabled.

Define tegra194_soc_attr_group if CONFIG_ARCH_TEGRA_241_SOC is enabled.

Signed-off-by: Kartik <kkartik@nvidia.com>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org> # build-tested
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/soc/tegra/fuse/fuse-tegra.c | 3 ++-
 drivers/soc/tegra/fuse/fuse.h       | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/soc/tegra/fuse/fuse-tegra.c b/drivers/soc/tegra/fuse/fuse-tegra.c
index 233b8e7bb41bf9..c34efa5bf44c2f 100644
--- a/drivers/soc/tegra/fuse/fuse-tegra.c
+++ b/drivers/soc/tegra/fuse/fuse-tegra.c
@@ -407,7 +407,8 @@ const struct attribute_group tegra_soc_attr_group = {
 };
 
 #if IS_ENABLED(CONFIG_ARCH_TEGRA_194_SOC) || \
-    IS_ENABLED(CONFIG_ARCH_TEGRA_234_SOC)
+    IS_ENABLED(CONFIG_ARCH_TEGRA_234_SOC) || \
+    IS_ENABLED(CONFIG_ARCH_TEGRA_241_SOC)
 static ssize_t platform_show(struct device *dev, struct device_attribute *attr,
 			     char *buf)
 {
diff --git a/drivers/soc/tegra/fuse/fuse.h b/drivers/soc/tegra/fuse/fuse.h
index f3b705327c20f8..9fee6ad6ad9e98 100644
--- a/drivers/soc/tegra/fuse/fuse.h
+++ b/drivers/soc/tegra/fuse/fuse.h
@@ -124,7 +124,8 @@ extern const struct tegra_fuse_soc tegra186_fuse_soc;
 #endif
 
 #if IS_ENABLED(CONFIG_ARCH_TEGRA_194_SOC) || \
-    IS_ENABLED(CONFIG_ARCH_TEGRA_234_SOC)
+    IS_ENABLED(CONFIG_ARCH_TEGRA_234_SOC) || \
+    IS_ENABLED(CONFIG_ARCH_TEGRA_241_SOC)
 extern const struct attribute_group tegra194_soc_attr_group;
 #endif
 

From 071ad962baf5e857fd965595421cf6fb588610ed Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 22 Dec 2023 16:04:02 +0200
Subject: [PATCH 072/707] bitmap: Step down as a reviewer

Too many things are going on, and reviewing BITMAP related code
seems not the best I can do, hence step down as a reviewer of
the BITMAP library.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 MAINTAINERS | 1 -
 1 file changed, 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 788be9ab5b733a..51983ed2d4e483 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3553,7 +3553,6 @@ F:	include/uapi/linux/bfs_fs.h
 
 BITMAP API
 M:	Yury Norov <yury.norov@gmail.com>
-R:	Andy Shevchenko <andriy.shevchenko@linux.intel.com>
 R:	Rasmus Villemoes <linux@rasmusvillemoes.dk>
 S:	Maintained
 F:	include/linux/bitfield.h

From cceb1ba628230a5fb076d7155731572d0393a903 Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan+linaro@kernel.org>
Date: Wed, 27 Dec 2023 11:10:03 +0100
Subject: [PATCH 073/707] Bluetooth: hci_bcm4377: do not mark valid bd_addr as
 invalid

A recent commit restored the original (and still documented) semantics
for the HCI_QUIRK_USE_BDADDR_PROPERTY quirk so that the device address
is considered invalid unless an address is provided by firmware.

This specifically means that this flag must only be set for devices with
invalid addresses, but the Broadcom BCM4377 driver has so far been
setting this flag unconditionally.

Fortunately the driver already checks for invalid addresses during setup
and sets the HCI_QUIRK_INVALID_BDADDR flag, which can simply be replaced
with HCI_QUIRK_USE_BDADDR_PROPERTY to indicate that the default address
is invalid but can be overridden by firmware (long term, this should
probably just always be allowed).

Fixes: 6945795bc81a ("Bluetooth: fix use-bdaddr-property quirk")
Cc: stable@vger.kernel.org      # 6.5
Reported-by: Felix Zhang <mrman@mrman314.tech>
Link: https://lore.kernel.org/r/77419ffacc5b4875e920e038332575a2a5bff29f.camel@mrman314.tech/
Signed-off-by: Johan Hovold <johan+linaro@kernel.org>
Reported-by: Felix Zhang <mrman@mrman314.tech>
Reviewed-by: Neal Gompa <neal@gompa.dev>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 drivers/bluetooth/hci_bcm4377.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/bluetooth/hci_bcm4377.c b/drivers/bluetooth/hci_bcm4377.c
index a617578356953c..9a7243d5db71ff 100644
--- a/drivers/bluetooth/hci_bcm4377.c
+++ b/drivers/bluetooth/hci_bcm4377.c
@@ -1417,7 +1417,7 @@ static int bcm4377_check_bdaddr(struct bcm4377_data *bcm4377)
 
 	bda = (struct hci_rp_read_bd_addr *)skb->data;
 	if (!bcm4377_is_valid_bdaddr(bcm4377, &bda->bdaddr))
-		set_bit(HCI_QUIRK_INVALID_BDADDR, &bcm4377->hdev->quirks);
+		set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &bcm4377->hdev->quirks);
 
 	kfree_skb(skb);
 	return 0;
@@ -2368,7 +2368,6 @@ static int bcm4377_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	hdev->set_bdaddr = bcm4377_hci_set_bdaddr;
 	hdev->setup = bcm4377_hci_setup;
 
-	set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks);
 	if (bcm4377->hw->broken_mws_transport_config)
 		set_bit(HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG, &hdev->quirks);
 	if (bcm4377->hw->broken_ext_scan)

From 8f9bb7a1b81ba24c52b11e7b8af6684c01659b78 Mon Sep 17 00:00:00 2001
From: Max Chou <max.chou@realtek.com>
Date: Tue, 26 Dec 2023 19:45:17 +0800
Subject: [PATCH 074/707] Bluetooth: btrtl: Add the support for
 RTL8852BT/RTL8852BE-VT

Add the support for RTL8852BT/RTL8852BE-VT BT controller on USB interface.
The necessary firmware will be submitted to linux-firmware project.

The device info from /sys/kernel/debug/usb/devices as below.

T:  Bus=02 Lev=02 Prnt=02 Port=05 Cnt=01 Dev#=  8 Spd=12   MxCh= 0
D:  Ver= 1.00 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs=  1
P:  Vendor=0bda ProdID=8520 Rev= 0.00
S:  Manufacturer=Realtek
S:  Product=Bluetooth Radio
S:  SerialNumber=00e04c000001
C:* #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=500mA
I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=81(I) Atr=03(Int.) MxPS=  16 Ivl=1ms
E:  Ad=02(O) Atr=02(Bulk) MxPS=  64 Ivl=0ms
E:  Ad=82(I) Atr=02(Bulk) MxPS=  64 Ivl=0ms
I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=   0 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=   0 Ivl=1ms
I:  If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=   9 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=   9 Ivl=1ms
I:  If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=  17 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=  17 Ivl=1ms
I:  If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=  25 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=  25 Ivl=1ms
I:  If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=  33 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=  33 Ivl=1ms
I:  If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=  49 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=  49 Ivl=1ms

Signed-off-by: Max Chou <max.chou@realtek.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 drivers/bluetooth/btrtl.c | 14 ++++++++++++++
 drivers/bluetooth/btusb.c |  3 +++
 2 files changed, 17 insertions(+)

diff --git a/drivers/bluetooth/btrtl.c b/drivers/bluetooth/btrtl.c
index 277d039ecbb429..cc50de69e8dc98 100644
--- a/drivers/bluetooth/btrtl.c
+++ b/drivers/bluetooth/btrtl.c
@@ -69,6 +69,7 @@ enum btrtl_chip_id {
 	CHIP_ID_8852B = 20,
 	CHIP_ID_8852C = 25,
 	CHIP_ID_8851B = 36,
+	CHIP_ID_8852BT = 47,
 };
 
 struct id_table {
@@ -307,6 +308,15 @@ static const struct id_table ic_id_table[] = {
 	  .fw_name  = "rtl_bt/rtl8851bu_fw",
 	  .cfg_name = "rtl_bt/rtl8851bu_config",
 	  .hw_info  = "rtl8851bu" },
+
+	/* 8852BT/8852BE-VT */
+	{ IC_INFO(RTL_ROM_LMP_8852A, 0x87, 0xc, HCI_USB),
+	  .config_needed = false,
+	  .has_rom_version = true,
+	  .has_msft_ext = true,
+	  .fw_name  = "rtl_bt/rtl8852btu_fw",
+	  .cfg_name = "rtl_bt/rtl8852btu_config",
+	  .hw_info  = "rtl8852btu" },
 	};
 
 static const struct id_table *btrtl_match_ic(u16 lmp_subver, u16 hci_rev,
@@ -645,6 +655,7 @@ static int rtlbt_parse_firmware(struct hci_dev *hdev,
 		{ RTL_ROM_LMP_8852A, 20 },	/* 8852B */
 		{ RTL_ROM_LMP_8852A, 25 },	/* 8852C */
 		{ RTL_ROM_LMP_8851B, 36 },	/* 8851B */
+		{ RTL_ROM_LMP_8852A, 47 },	/* 8852BT */
 	};
 
 	if (btrtl_dev->fw_len <= 8)
@@ -1275,6 +1286,7 @@ void btrtl_set_quirks(struct hci_dev *hdev, struct btrtl_device_info *btrtl_dev)
 	case CHIP_ID_8852B:
 	case CHIP_ID_8852C:
 	case CHIP_ID_8851B:
+	case CHIP_ID_8852BT:
 		set_bit(HCI_QUIRK_VALID_LE_STATES, &hdev->quirks);
 		set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks);
 
@@ -1505,6 +1517,8 @@ MODULE_FIRMWARE("rtl_bt/rtl8852bs_fw.bin");
 MODULE_FIRMWARE("rtl_bt/rtl8852bs_config.bin");
 MODULE_FIRMWARE("rtl_bt/rtl8852bu_fw.bin");
 MODULE_FIRMWARE("rtl_bt/rtl8852bu_config.bin");
+MODULE_FIRMWARE("rtl_bt/rtl8852btu_fw.bin");
+MODULE_FIRMWARE("rtl_bt/rtl8852btu_config.bin");
 MODULE_FIRMWARE("rtl_bt/rtl8852cu_fw.bin");
 MODULE_FIRMWARE("rtl_bt/rtl8852cu_fw_v2.bin");
 MODULE_FIRMWARE("rtl_bt/rtl8852cu_config.bin");
diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index 7835170b1d6618..a0a317bac0954f 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -553,6 +553,9 @@ static const struct usb_device_id quirks_table[] = {
 	{ USB_DEVICE(0x13d3, 0x3572), .driver_info = BTUSB_REALTEK |
 						     BTUSB_WIDEBAND_SPEECH },
 
+	/* Realtek 8852BT/8852BE-VT Bluetooth devices */
+	{ USB_DEVICE(0x0bda, 0x8520), .driver_info = BTUSB_REALTEK |
+						     BTUSB_WIDEBAND_SPEECH },
 	/* Realtek Bluetooth devices */
 	{ USB_VENDOR_AND_INTERFACE_INFO(0x0bda, 0xe0, 0x01, 0x01),
 	  .driver_info = BTUSB_REALTEK },

From c7ee0bc8db325ec829bbe2cd0114071489ed915f Mon Sep 17 00:00:00 2001
From: Neeraj Sanjay Kale <neeraj.sanjaykale@nxp.com>
Date: Wed, 27 Dec 2023 18:59:27 +0530
Subject: [PATCH 075/707] Bluetooth: btnxpuart: Resolve TX timeout error in
 power save stress test

This fixes the tx timeout issue seen while running a stress test on
btnxpuart for couple of hours, such that the interval between two HCI
commands coincide with the power save timeout value of 2 seconds.

Test procedure using bash script:
<load btnxpuart.ko>
hciconfig hci0 up
//Enable Power Save feature
hcitool -i hci0 cmd 3f 23 02 00 00
while (true)
do
    hciconfig hci0 leadv
    sleep 2
    hciconfig hci0 noleadv
    sleep 2
done

Error log, after adding few more debug prints:
Bluetooth: btnxpuart_queue_skb(): 01 0A 20 01 00
Bluetooth: hci0: Set UART break: on, status=0
Bluetooth: hci0: btnxpuart_tx_wakeup() tx_work scheduled
Bluetooth: hci0: btnxpuart_tx_work() dequeue: 01 0A 20 01 00
Can't set advertise mode on hci0: Connection timed out (110)
Bluetooth: hci0: command 0x200a tx timeout

When the power save mechanism turns on UART break, and btnxpuart_tx_work()
is scheduled simultaneously, psdata->ps_state is read as PS_STATE_AWAKE,
which prevents the psdata->work from being scheduled, which is responsible
to turn OFF UART break.

This issue is fixed by adding a ps_lock mutex around UART break on/off as
well as around ps_state read/write.
btnxpuart_tx_wakeup() will now read updated ps_state value. If ps_state is
PS_STATE_SLEEP, it will first schedule psdata->work, and then it will
reschedule itself once UART break has been turned off and ps_state is
PS_STATE_AWAKE.

Tested above script for 50,000 iterations and TX timeout error was not
observed anymore.

Signed-off-by: Neeraj Sanjay Kale <neeraj.sanjaykale@nxp.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 drivers/bluetooth/btnxpuart.c | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/drivers/bluetooth/btnxpuart.c b/drivers/bluetooth/btnxpuart.c
index b7c56be078f815..7f88b6f52f2663 100644
--- a/drivers/bluetooth/btnxpuart.c
+++ b/drivers/bluetooth/btnxpuart.c
@@ -126,6 +126,7 @@ struct ps_data {
 	struct hci_dev *hdev;
 	struct work_struct work;
 	struct timer_list ps_timer;
+	struct mutex ps_lock;
 };
 
 struct wakeup_cmd_payload {
@@ -317,6 +318,9 @@ static void ps_start_timer(struct btnxpuart_dev *nxpdev)
 
 	if (psdata->cur_psmode == PS_MODE_ENABLE)
 		mod_timer(&psdata->ps_timer, jiffies + msecs_to_jiffies(psdata->h2c_ps_interval));
+
+	if (psdata->ps_state == PS_STATE_AWAKE && psdata->ps_cmd == PS_CMD_ENTER_PS)
+		cancel_work_sync(&psdata->work);
 }
 
 static void ps_cancel_timer(struct btnxpuart_dev *nxpdev)
@@ -337,6 +341,7 @@ static void ps_control(struct hci_dev *hdev, u8 ps_state)
 	    !test_bit(BTNXPUART_SERDEV_OPEN, &nxpdev->tx_state))
 		return;
 
+	mutex_lock(&psdata->ps_lock);
 	switch (psdata->cur_h2c_wakeupmode) {
 	case WAKEUP_METHOD_DTR:
 		if (ps_state == PS_STATE_AWAKE)
@@ -350,12 +355,15 @@ static void ps_control(struct hci_dev *hdev, u8 ps_state)
 			status = serdev_device_break_ctl(nxpdev->serdev, 0);
 		else
 			status = serdev_device_break_ctl(nxpdev->serdev, -1);
+		msleep(20); /* Allow chip to detect UART-break and enter sleep */
 		bt_dev_dbg(hdev, "Set UART break: %s, status=%d",
 			   str_on_off(ps_state == PS_STATE_SLEEP), status);
 		break;
 	}
 	if (!status)
 		psdata->ps_state = ps_state;
+	mutex_unlock(&psdata->ps_lock);
+
 	if (ps_state == PS_STATE_AWAKE)
 		btnxpuart_tx_wakeup(nxpdev);
 }
@@ -391,17 +399,25 @@ static void ps_setup(struct hci_dev *hdev)
 
 	psdata->hdev = hdev;
 	INIT_WORK(&psdata->work, ps_work_func);
+	mutex_init(&psdata->ps_lock);
 	timer_setup(&psdata->ps_timer, ps_timeout_func, 0);
 }
 
-static void ps_wakeup(struct btnxpuart_dev *nxpdev)
+static bool ps_wakeup(struct btnxpuart_dev *nxpdev)
 {
 	struct ps_data *psdata = &nxpdev->psdata;
+	u8 ps_state;
 
-	if (psdata->ps_state != PS_STATE_AWAKE) {
+	mutex_lock(&psdata->ps_lock);
+	ps_state = psdata->ps_state;
+	mutex_unlock(&psdata->ps_lock);
+
+	if (ps_state != PS_STATE_AWAKE) {
 		psdata->ps_cmd = PS_CMD_EXIT_PS;
 		schedule_work(&psdata->work);
+		return true;
 	}
+	return false;
 }
 
 static int send_ps_cmd(struct hci_dev *hdev, void *data)
@@ -1171,7 +1187,6 @@ static struct sk_buff *nxp_dequeue(void *data)
 {
 	struct btnxpuart_dev *nxpdev = (struct btnxpuart_dev *)data;
 
-	ps_wakeup(nxpdev);
 	ps_start_timer(nxpdev);
 	return skb_dequeue(&nxpdev->txq);
 }
@@ -1186,6 +1201,9 @@ static void btnxpuart_tx_work(struct work_struct *work)
 	struct sk_buff *skb;
 	int len;
 
+	if (ps_wakeup(nxpdev))
+		return;
+
 	while ((skb = nxp_dequeue(nxpdev))) {
 		len = serdev_device_write_buf(serdev, skb->data, skb->len);
 		hdev->stat.byte_tx += len;

From 4b60f3880d23d7f5d4f632ccd4fb14e2d7f2631e Mon Sep 17 00:00:00 2001
From: Naohiro Aota <naohiro.aota@wdc.com>
Date: Tue, 19 Dec 2023 01:02:28 +0900
Subject: [PATCH 076/707] btrfs: zoned: factor out prepare_allocation_zoned()

Factor out prepare_allocation_zoned() for further extension. While at
it, optimize the if-branch a bit.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f396aba92c5796..d260b970bec775 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4298,6 +4298,24 @@ static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
 	return 0;
 }
 
+static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
+				    struct find_free_extent_ctl *ffe_ctl)
+{
+	if (ffe_ctl->for_treelog) {
+		spin_lock(&fs_info->treelog_bg_lock);
+		if (fs_info->treelog_bg)
+			ffe_ctl->hint_byte = fs_info->treelog_bg;
+		spin_unlock(&fs_info->treelog_bg_lock);
+	} else if (ffe_ctl->for_data_reloc) {
+		spin_lock(&fs_info->relocation_bg_lock);
+		if (fs_info->data_reloc_bg)
+			ffe_ctl->hint_byte = fs_info->data_reloc_bg;
+		spin_unlock(&fs_info->relocation_bg_lock);
+	}
+
+	return 0;
+}
+
 static int prepare_allocation(struct btrfs_fs_info *fs_info,
 			      struct find_free_extent_ctl *ffe_ctl,
 			      struct btrfs_space_info *space_info,
@@ -4308,19 +4326,7 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info,
 		return prepare_allocation_clustered(fs_info, ffe_ctl,
 						    space_info, ins);
 	case BTRFS_EXTENT_ALLOC_ZONED:
-		if (ffe_ctl->for_treelog) {
-			spin_lock(&fs_info->treelog_bg_lock);
-			if (fs_info->treelog_bg)
-				ffe_ctl->hint_byte = fs_info->treelog_bg;
-			spin_unlock(&fs_info->treelog_bg_lock);
-		}
-		if (ffe_ctl->for_data_reloc) {
-			spin_lock(&fs_info->relocation_bg_lock);
-			if (fs_info->data_reloc_bg)
-				ffe_ctl->hint_byte = fs_info->data_reloc_bg;
-			spin_unlock(&fs_info->relocation_bg_lock);
-		}
-		return 0;
+		return prepare_allocation_zoned(fs_info, ffe_ctl);
 	default:
 		BUG();
 	}

From 62ec22633c6a748d8e7ee2e171491b155e3242a7 Mon Sep 17 00:00:00 2001
From: Naohiro Aota <naohiro.aota@wdc.com>
Date: Tue, 19 Dec 2023 01:02:29 +0900
Subject: [PATCH 077/707] btrfs: zoned: optimize hint byte for zoned allocator

Writing sequentially to a huge file on btrfs on a SMR HDD revealed a
decline of the performance (220 MiB/s to 30 MiB/s after 500 minutes).

The performance goes down because of increased latency of the extent
allocation, which is induced by a traversing of a lot of full block groups.

So, this patch optimizes the ffe_ctl->hint_byte by choosing a block group
with sufficient size from the active block group list, which does not
contain full block groups.

After applying the patch, the performance is maintained well.

Fixes: 2eda57089ea3 ("btrfs: zoned: implement sequential extent allocation")
CC: stable@vger.kernel.org # 5.15+
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d260b970bec775..6d680031211a1c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4311,6 +4311,24 @@ static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
 		if (fs_info->data_reloc_bg)
 			ffe_ctl->hint_byte = fs_info->data_reloc_bg;
 		spin_unlock(&fs_info->relocation_bg_lock);
+	} else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
+		struct btrfs_block_group *block_group;
+
+		spin_lock(&fs_info->zone_active_bgs_lock);
+		list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
+			/*
+			 * No lock is OK here because avail is monotinically
+			 * decreasing, and this is just a hint.
+			 */
+			u64 avail = block_group->zone_capacity - block_group->alloc_offset;
+
+			if (block_group_bits(block_group, ffe_ctl->flags) &&
+			    avail >= ffe_ctl->num_bytes) {
+				ffe_ctl->hint_byte = block_group->start;
+				break;
+			}
+		}
+		spin_unlock(&fs_info->zone_active_bgs_lock);
 	}
 
 	return 0;

From a5592dcde6c61257f00c78830aa6a2d044aabf43 Mon Sep 17 00:00:00 2001
From: Dmitry Antipov <dmantipov@yandex.ru>
Date: Thu, 21 Dec 2023 11:47:45 +0300
Subject: [PATCH 078/707] btrfs: fix kvcalloc() arguments order in
 btrfs_ioctl_send()

When compiling with gcc version 14.0.0 20231220 (experimental)
and W=1, I've noticed the following warning:

fs/btrfs/send.c: In function 'btrfs_ioctl_send':
fs/btrfs/send.c:8208:44: warning: 'kvcalloc' sizes specified with 'sizeof'
in the earlier argument and not in the later argument [-Wcalloc-transposed-args]
 8208 |         sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots),
      |                                            ^

Since 'n' and 'size' arguments of 'kvcalloc()' are multiplied to
calculate the final size, their actual order doesn't affect the result
and so this is not a bug. But it's still worth to fix it.

Signed-off-by: Dmitry Antipov <dmantipov@yandex.ru>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/send.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 4e36550618e580..2d7519a6ce72d3 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -8205,8 +8205,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
 		goto out;
 	}
 
-	sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots),
-				     arg->clone_sources_count + 1,
+	sctx->clone_roots = kvcalloc(arg->clone_sources_count + 1,
+				     sizeof(*sctx->clone_roots),
 				     GFP_KERNEL);
 	if (!sctx->clone_roots) {
 		ret = -ENOMEM;

From aa434e486204ddac5c4a6ea20e8085d482c27e01 Mon Sep 17 00:00:00 2001
From: Fedor Pchelkin <pchelkin@ispras.ru>
Date: Wed, 3 Jan 2024 13:31:27 +0300
Subject: [PATCH 079/707] btrfs: ref-verify: free ref cache before clearing
 mount opt

As clearing REF_VERIFY mount option indicates there were some errors in a
ref-verify process, a ref cache is not relevant anymore and should be
freed.

btrfs_free_ref_cache() requires REF_VERIFY option being set so call
it just before clearing the mount option.

Found by Linux Verification Center (linuxtesting.org) with Syzkaller.

Reported-by: syzbot+be14ed7728594dc8bd42@syzkaller.appspotmail.com
Fixes: fd708b81d972 ("Btrfs: add a extent ref verify tool")
CC: stable@vger.kernel.org # 5.4+
Closes: https://lore.kernel.org/lkml/000000000000e5a65c05ee832054@google.com/
Reported-by: syzbot+c563a3c79927971f950f@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/lkml/0000000000007fe09705fdc6086c@google.com/
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: Fedor Pchelkin <pchelkin@ispras.ru>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ref-verify.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 6486f0d7e9931b..8c4fc98ca9ce7d 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -889,8 +889,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
 out_unlock:
 	spin_unlock(&fs_info->ref_verify_lock);
 out:
-	if (ret)
+	if (ret) {
+		btrfs_free_ref_cache(fs_info);
 		btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
+	}
 	return ret;
 }
 
@@ -1021,8 +1023,8 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
 		}
 	}
 	if (ret) {
-		btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
 		btrfs_free_ref_cache(fs_info);
+		btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
 	}
 	btrfs_free_path(path);
 	return ret;

From 690381ca057e24c0ad192f35a28cf0ab6f009be5 Mon Sep 17 00:00:00 2001
From: Naohiro Aota <naohiro.aota@wdc.com>
Date: Fri, 22 Dec 2023 01:46:16 +0900
Subject: [PATCH 080/707] btrfs: fix unbalanced unlock of mapping_tree_lock

The error path of btrfs_get_chunk_map() releases
fs_info->mapping_tree_lock. But, it is taken and released in
btrfs_find_chunk_map(). So, there is no need to do so.

Fixes: 7dc66abb5a47 ("btrfs: use a dedicated data structure for chunk maps")
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 4c32497311d2ff..d67785be2c778c 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3087,7 +3087,6 @@ struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
 	map = btrfs_find_chunk_map(fs_info, logical, length);
 
 	if (unlikely(!map)) {
-		read_unlock(&fs_info->mapping_tree_lock);
 		btrfs_crit(fs_info,
 			   "unable to find chunk map for logical %llu length %llu",
 			   logical, length);
@@ -3095,7 +3094,6 @@ struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
 	}
 
 	if (unlikely(map->start > logical || map->start + map->chunk_len <= logical)) {
-		read_unlock(&fs_info->mapping_tree_lock);
 		btrfs_crit(fs_info,
 			   "found a bad chunk map, wanted %llu-%llu, found %llu-%llu",
 			   logical, logical + length, map->start,

From c31b9639579d137855cbda6fa4045b499571cc0b Mon Sep 17 00:00:00 2001
From: Naohiro Aota <naohiro.aota@wdc.com>
Date: Fri, 22 Dec 2023 13:56:34 +0900
Subject: [PATCH 081/707] btrfs: zoned: fix lock ordering in
 btrfs_zone_activate()

The btrfs CI reported a lockdep warning as follows by running generic
generic/129.

   WARNING: possible circular locking dependency detected
   6.7.0-rc5+ #1 Not tainted
   ------------------------------------------------------
   kworker/u5:5/793427 is trying to acquire lock:
   ffff88813256d028 (&cache->lock){+.+.}-{2:2}, at: btrfs_zone_finish_one_bg+0x5e/0x130
   but task is already holding lock:
   ffff88810a23a318 (&fs_info->zone_active_bgs_lock){+.+.}-{2:2}, at: btrfs_zone_finish_one_bg+0x34/0x130
   which lock already depends on the new lock.

   the existing dependency chain (in reverse order) is:
   -> #1 (&fs_info->zone_active_bgs_lock){+.+.}-{2:2}:
   ...
   -> #0 (&cache->lock){+.+.}-{2:2}:
   ...

This is because we take fs_info->zone_active_bgs_lock after a block_group's
lock in btrfs_zone_activate() while doing the opposite in other places.

Fix the issue by expanding the fs_info->zone_active_bgs_lock's critical
section and taking it before a block_group's lock.

Fixes: a7e1ac7bdc5a ("btrfs: zoned: reserve zones for an active metadata/system block group")
CC: stable@vger.kernel.org # 6.6
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/zoned.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 12066afc235c31..ac9bbe0c4ffe69 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -2072,6 +2072,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
 
 	map = block_group->physical_map;
 
+	spin_lock(&fs_info->zone_active_bgs_lock);
 	spin_lock(&block_group->lock);
 	if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
 		ret = true;
@@ -2084,7 +2085,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
 		goto out_unlock;
 	}
 
-	spin_lock(&fs_info->zone_active_bgs_lock);
 	for (i = 0; i < map->num_stripes; i++) {
 		struct btrfs_zoned_device_info *zinfo;
 		int reserved = 0;
@@ -2104,20 +2104,17 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
 		 */
 		if (atomic_read(&zinfo->active_zones_left) <= reserved) {
 			ret = false;
-			spin_unlock(&fs_info->zone_active_bgs_lock);
 			goto out_unlock;
 		}
 
 		if (!btrfs_dev_set_active_zone(device, physical)) {
 			/* Cannot activate the zone */
 			ret = false;
-			spin_unlock(&fs_info->zone_active_bgs_lock);
 			goto out_unlock;
 		}
 		if (!is_data)
 			zinfo->reserved_active_zones--;
 	}
-	spin_unlock(&fs_info->zone_active_bgs_lock);
 
 	/* Successfully activated all the zones */
 	set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
@@ -2125,8 +2122,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
 
 	/* For the active block group list */
 	btrfs_get_block_group(block_group);
-
-	spin_lock(&fs_info->zone_active_bgs_lock);
 	list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
 	spin_unlock(&fs_info->zone_active_bgs_lock);
 
@@ -2134,6 +2129,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
 
 out_unlock:
 	spin_unlock(&block_group->lock);
+	spin_unlock(&fs_info->zone_active_bgs_lock);
 	return ret;
 }
 

From 626cef40faf0b363be5d552800adb5845774662b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jonas=20Dre=C3=9Fler?= <verdre@v0yd.nl>
Date: Tue, 2 Jan 2024 19:08:08 +0100
Subject: [PATCH 082/707] Bluetooth: hci_sync: Check the correct flag before
 starting a scan
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There's a very confusing mistake in the code starting a HCI inquiry: We're
calling hci_dev_test_flag() to test for HCI_INQUIRY, but hci_dev_test_flag()
checks hdev->dev_flags instead of hdev->flags. HCI_INQUIRY is a bit that's
set on hdev->flags, not on hdev->dev_flags though.

HCI_INQUIRY equals the integer 7, and in hdev->dev_flags, 7 means
HCI_BONDABLE, so we were actually checking for HCI_BONDABLE here.

The mistake is only present in the synchronous code for starting an inquiry,
not in the async one. Also devices are typically bondable while doing an
inquiry, so that might be the reason why nobody noticed it so far.

Fixes: abfeea476c68 ("Bluetooth: hci_sync: Convert MGMT_OP_START_DISCOVERY")
Signed-off-by: Jonas Dreßler <verdre@v0yd.nl>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/hci_sync.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index a6fc8a2a5c673d..b3141e3f9cf620 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -5559,7 +5559,7 @@ static int hci_inquiry_sync(struct hci_dev *hdev, u8 length)
 
 	bt_dev_dbg(hdev, "");
 
-	if (hci_dev_test_flag(hdev, HCI_INQUIRY))
+	if (test_bit(HCI_INQUIRY, &hdev->flags))
 		return 0;
 
 	hci_dev_lock(hdev);

From 4b85ee4187306be98113b8fa290d535ae6efa812 Mon Sep 17 00:00:00 2001
From: Ying Hsu <yinghsu@chromium.org>
Date: Thu, 4 Jan 2024 11:56:32 +0000
Subject: [PATCH 083/707] Bluetooth: Avoid potential use-after-free in
 hci_error_reset

While handling the HCI_EV_HARDWARE_ERROR event, if the underlying
BT controller is not responding, the GPIO reset mechanism would
free the hci_dev and lead to a use-after-free in hci_error_reset.

Here's the call trace observed on a ChromeOS device with Intel AX201:
   queue_work_on+0x3e/0x6c
   __hci_cmd_sync_sk+0x2ee/0x4c0 [bluetooth <HASH:3b4a6>]
   ? init_wait_entry+0x31/0x31
   __hci_cmd_sync+0x16/0x20 [bluetooth <HASH:3b4a 6>]
   hci_error_reset+0x4f/0xa4 [bluetooth <HASH:3b4a 6>]
   process_one_work+0x1d8/0x33f
   worker_thread+0x21b/0x373
   kthread+0x13a/0x152
   ? pr_cont_work+0x54/0x54
   ? kthread_blkcg+0x31/0x31
    ret_from_fork+0x1f/0x30

This patch holds the reference count on the hci_dev while processing
a HCI_EV_HARDWARE_ERROR event to avoid potential crash.

Fixes: c7741d16a57c ("Bluetooth: Perform a power cycle when receiving hardware error event")
Signed-off-by: Ying Hsu <yinghsu@chromium.org>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/hci_core.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 65601aa52e0d8b..2821a42cefdc6e 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -1049,6 +1049,7 @@ static void hci_error_reset(struct work_struct *work)
 {
 	struct hci_dev *hdev = container_of(work, struct hci_dev, error_reset);
 
+	hci_dev_hold(hdev);
 	BT_DBG("%s", hdev->name);
 
 	if (hdev->hw_error)
@@ -1056,10 +1057,10 @@ static void hci_error_reset(struct work_struct *work)
 	else
 		bt_dev_err(hdev, "hardware error 0x%2.2x", hdev->hw_error_code);
 
-	if (hci_dev_do_close(hdev))
-		return;
+	if (!hci_dev_do_close(hdev))
+		hci_dev_do_open(hdev);
 
-	hci_dev_do_open(hdev);
+	hci_dev_put(hdev);
 }
 
 void hci_uuids_clear(struct hci_dev *hdev)

From 0bcd317e8b31833d36cd9843902905aafbd70017 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Fri, 5 Jan 2024 10:43:26 -0500
Subject: [PATCH 084/707] Bluetooth: hci_sync: Fix accept_list when attempting
 to suspend

During suspend, only wakeable devices can be in acceptlist, so if the
device was previously added it needs to be removed otherwise the device
can end up waking up the system prematurely.

Fixes: 3b42055388c3 ("Bluetooth: hci_sync: Fix attempting to suspend with unfiltered passive scan")
Signed-off-by: Clancy Shang <clancy.shang@quectel.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Reviewed-by: Paul Menzel <pmenzel@molgen.mpg.de>
---
 net/bluetooth/hci_sync.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index b3141e3f9cf620..5716345a26dfb7 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -2206,8 +2206,11 @@ static int hci_le_add_accept_list_sync(struct hci_dev *hdev,
 
 	/* During suspend, only wakeable devices can be in acceptlist */
 	if (hdev->suspended &&
-	    !(params->flags & HCI_CONN_FLAG_REMOTE_WAKEUP))
+	    !(params->flags & HCI_CONN_FLAG_REMOTE_WAKEUP)) {
+		hci_le_del_accept_list_sync(hdev, &params->addr,
+					    params->addr_type);
 		return 0;
+	}
 
 	/* Select filter policy to accept all advertising */
 	if (*num_entries >= hdev->le_accept_list_size)

From 6ec00b0737fe9108ec8a1995e20349b91a89ce07 Mon Sep 17 00:00:00 2001
From: Yuxuan Hu <20373622@buaa.edu.cn>
Date: Wed, 3 Jan 2024 17:10:43 +0800
Subject: [PATCH 085/707] Bluetooth: rfcomm: Fix null-ptr-deref in
 rfcomm_check_security

During our fuzz testing of the connection and disconnection process at the
RFCOMM layer, we discovered this bug. By comparing the packets from a
normal connection and disconnection process with the testcase that
triggered a KASAN report. We analyzed the cause of this bug as follows:

1. In the packets captured during a normal connection, the host sends a
`Read Encryption Key Size` type of `HCI_CMD` packet
(Command Opcode: 0x1408) to the controller to inquire the length of
encryption key.After receiving this packet, the controller immediately
replies with a Command Completepacket (Event Code: 0x0e) to return the
Encryption Key Size.

2. In our fuzz test case, the timing of the controller's response to this
packet was delayed to an unexpected point: after the RFCOMM and L2CAP
layers had disconnected but before the HCI layer had disconnected.

3. After receiving the Encryption Key Size Response at the time described
in point 2, the host still called the rfcomm_check_security function.
However, by this time `struct l2cap_conn *conn = l2cap_pi(sk)->chan->conn;`
had already been released, and when the function executed
`return hci_conn_security(conn->hcon, d->sec_level, auth_type, d->out);`,
specifically when accessing `conn->hcon`, a null-ptr-deref error occurred.

To fix this bug, check if `sk->sk_state` is BT_CLOSED before calling
rfcomm_recv_frame in rfcomm_process_rx.

Signed-off-by: Yuxuan Hu <20373622@buaa.edu.cn>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/rfcomm/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index 053ef8f25fae47..1d34d849703329 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -1941,7 +1941,7 @@ static struct rfcomm_session *rfcomm_process_rx(struct rfcomm_session *s)
 	/* Get data directly from socket receive queue without copying it. */
 	while ((skb = skb_dequeue(&sk->sk_receive_queue))) {
 		skb_orphan(skb);
-		if (!skb_linearize(skb)) {
+		if (!skb_linearize(skb) && sk->sk_state != BT_CLOSED) {
 			s = rfcomm_recv_frame(s, skb);
 			if (!s)
 				break;

From 31f8df9e1976d1e0ae1588303d856d966d6a2847 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Mon, 8 Jan 2024 13:50:20 +1030
Subject: [PATCH 086/707] btrfs: remove the pg_offset parameter from
 btrfs_get_extent()

The parameter @pg_offset of btrfs_get_extent() is only utilized for
inlined extent, and we already have an ASSERT() and tree-checker, to
make sure we can only get inline extent at file offset 0.

Any invalid inline extent with non-zero file offset would be rejected by
tree-checker in the first place.

Thus the @pg_offset parameter is not really necessary, just remove it.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/btrfs_inode.h       |  3 +--
 fs/btrfs/extent_io.c         | 10 ++++-----
 fs/btrfs/file.c              | 11 +++++-----
 fs/btrfs/inode.c             | 16 ++++++---------
 fs/btrfs/tests/inode-tests.c | 40 ++++++++++++++++++------------------
 5 files changed, 36 insertions(+), 44 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 7f7c5a92d2b879..83d78a6f3aa2f3 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -490,8 +490,7 @@ struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
 			      struct btrfs_root *root, struct btrfs_path *path);
 struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root);
 struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
-				    struct page *page, size_t pg_offset,
-				    u64 start, u64 len);
+				    struct page *page, u64 start, u64 len);
 int btrfs_update_inode(struct btrfs_trans_handle *trans,
 		       struct btrfs_inode *inode);
 int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a0ffd41c5cc19f..0ea8c401e3a88a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -970,8 +970,7 @@ void clear_page_extent_mapped(struct page *page)
 	folio_detach_private(folio);
 }
 
-static struct extent_map *
-__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
+static struct extent_map *__get_extent_map(struct inode *inode, struct page *page,
 		 u64 start, u64 len, struct extent_map **em_cached)
 {
 	struct extent_map *em;
@@ -988,7 +987,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
 		*em_cached = NULL;
 	}
 
-	em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len);
+	em = btrfs_get_extent(BTRFS_I(inode), page, start, len);
 	if (em_cached && !IS_ERR(em)) {
 		BUG_ON(*em_cached);
 		refcount_inc(&em->refs);
@@ -1051,8 +1050,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
 			end_page_read(page, true, cur, iosize);
 			break;
 		}
-		em = __get_extent_map(inode, page, pg_offset, cur,
-				      end - cur + 1, em_cached);
+		em = __get_extent_map(inode, page, cur, end - cur + 1, em_cached);
 		if (IS_ERR(em)) {
 			unlock_extent(tree, cur, end, NULL);
 			end_page_read(page, false, cur, end + 1 - cur);
@@ -1371,7 +1369,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
 			continue;
 		}
 
-		em = btrfs_get_extent(inode, NULL, 0, cur, len);
+		em = btrfs_get_extent(inode, NULL, cur, len);
 		if (IS_ERR(em)) {
 			ret = PTR_ERR_OR_ZERO(em);
 			goto out_error;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 38dfcac4760990..f8e1a7ce3d39ae 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2176,7 +2176,7 @@ static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
 	struct extent_map *em;
 	int ret = 0;
 
-	em = btrfs_get_extent(inode, NULL, 0,
+	em = btrfs_get_extent(inode, NULL,
 			      round_down(*start, fs_info->sectorsize),
 			      round_up(*len, fs_info->sectorsize));
 	if (IS_ERR(em))
@@ -2835,7 +2835,7 @@ static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
 	int ret;
 
 	offset = round_down(offset, sectorsize);
-	em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize);
+	em = btrfs_get_extent(inode, NULL, offset, sectorsize);
 	if (IS_ERR(em))
 		return PTR_ERR(em);
 
@@ -2866,7 +2866,7 @@ static int btrfs_zero_range(struct inode *inode,
 	u64 bytes_to_reserve = 0;
 	bool space_reserved = false;
 
-	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
+	em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start,
 			      alloc_end - alloc_start);
 	if (IS_ERR(em)) {
 		ret = PTR_ERR(em);
@@ -2909,8 +2909,7 @@ static int btrfs_zero_range(struct inode *inode,
 
 	if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
 	    BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
-		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
-				      sectorsize);
+		em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, sectorsize);
 		if (IS_ERR(em)) {
 			ret = PTR_ERR(em);
 			goto out;
@@ -3126,7 +3125,7 @@ static long btrfs_fallocate(struct file *file, int mode,
 
 	/* First, check if we exceed the qgroup limit */
 	while (cur_offset < alloc_end) {
-		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
+		em = btrfs_get_extent(BTRFS_I(inode), NULL, cur_offset,
 				      alloc_end - cur_offset);
 		if (IS_ERR(em)) {
 			ret = PTR_ERR(em);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b3e39610cc95a3..1002ebde14da20 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2632,7 +2632,7 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
 		u64 em_len;
 		int ret = 0;
 
-		em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
+		em = btrfs_get_extent(inode, NULL, search_start, search_len);
 		if (IS_ERR(em))
 			return PTR_ERR(em);
 
@@ -4888,8 +4888,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
 					   &cached_state);
 	cur_offset = hole_start;
 	while (1) {
-		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
-				      block_end - cur_offset);
+		em = btrfs_get_extent(inode, NULL, cur_offset, block_end - cur_offset);
 		if (IS_ERR(em)) {
 			err = PTR_ERR(em);
 			em = NULL;
@@ -6737,7 +6736,6 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path
  *
  * @inode:	file to search in
  * @page:	page to read extent data into if the extent is inline
- * @pg_offset:	offset into @page to copy to
  * @start:	file offset
  * @len:	length of range starting at @start
  *
@@ -6751,8 +6749,7 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path
  * Return: ERR_PTR on error, non-NULL extent_map on success.
  */
 struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
-				    struct page *page, size_t pg_offset,
-				    u64 start, u64 len)
+				    struct page *page, u64 start, u64 len)
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 	int ret = 0;
@@ -6895,7 +6892,6 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
 		 * ensured by tree-checker and inline extent creation path.
 		 * Thus all members representing file offsets should be zero.
 		 */
-		ASSERT(pg_offset == 0);
 		ASSERT(extent_start == 0);
 		ASSERT(em->start == 0);
 
@@ -7536,7 +7532,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
 	if (ret < 0)
 		goto err;
 
-	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
+	em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
 	if (IS_ERR(em)) {
 		ret = PTR_ERR(em);
 		goto unlock_err;
@@ -10125,7 +10121,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
 		cond_resched();
 	}
 
-	em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1);
+	em = btrfs_get_extent(inode, NULL, start, lockend - start + 1);
 	if (IS_ERR(em)) {
 		ret = PTR_ERR(em);
 		goto out_unlock_extent;
@@ -10698,7 +10694,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
 		struct btrfs_block_group *bg;
 		u64 len = isize - start;
 
-		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
+		em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
 		if (IS_ERR(em)) {
 			ret = PTR_ERR(em);
 			goto out;
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 9957de9f7806d1..99da9d34b77aed 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -258,7 +258,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 
 	/* First with no extents */
 	BTRFS_I(inode)->root = root;
-	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, sectorsize);
+	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize);
 	if (IS_ERR(em)) {
 		em = NULL;
 		test_err("got an error when we shouldn't have");
@@ -278,7 +278,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 	 */
 	setup_file_extents(root, sectorsize);
 
-	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, (u64)-1);
+	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, (u64)-1);
 	if (IS_ERR(em)) {
 		test_err("got an error when we shouldn't have");
 		goto out;
@@ -316,7 +316,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 	offset = em->start + em->len;
 	free_extent_map(em);
 
-	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+	em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
 	if (IS_ERR(em)) {
 		test_err("got an error when we shouldn't have");
 		goto out;
@@ -339,7 +339,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 	free_extent_map(em);
 
 	/* Regular extent */
-	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+	em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
 	if (IS_ERR(em)) {
 		test_err("got an error when we shouldn't have");
 		goto out;
@@ -367,7 +367,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 	free_extent_map(em);
 
 	/* The next 3 are split extents */
-	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+	em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
 	if (IS_ERR(em)) {
 		test_err("got an error when we shouldn't have");
 		goto out;
@@ -396,7 +396,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 	offset = em->start + em->len;
 	free_extent_map(em);
 
-	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+	em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
 	if (IS_ERR(em)) {
 		test_err("got an error when we shouldn't have");
 		goto out;
@@ -418,7 +418,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 	offset = em->start + em->len;
 	free_extent_map(em);
 
-	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+	em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
 	if (IS_ERR(em)) {
 		test_err("got an error when we shouldn't have");
 		goto out;
@@ -452,7 +452,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 	free_extent_map(em);
 
 	/* Prealloc extent */
-	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+	em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
 	if (IS_ERR(em)) {
 		test_err("got an error when we shouldn't have");
 		goto out;
@@ -481,7 +481,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 	free_extent_map(em);
 
 	/* The next 3 are a half written prealloc extent */
-	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+	em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
 	if (IS_ERR(em)) {
 		test_err("got an error when we shouldn't have");
 		goto out;
@@ -511,7 +511,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 	offset = em->start + em->len;
 	free_extent_map(em);
 
-	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+	em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
 	if (IS_ERR(em)) {
 		test_err("got an error when we shouldn't have");
 		goto out;
@@ -544,7 +544,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 	offset = em->start + em->len;
 	free_extent_map(em);
 
-	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+	em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
 	if (IS_ERR(em)) {
 		test_err("got an error when we shouldn't have");
 		goto out;
@@ -579,7 +579,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 	free_extent_map(em);
 
 	/* Now for the compressed extent */
-	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+	em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
 	if (IS_ERR(em)) {
 		test_err("got an error when we shouldn't have");
 		goto out;
@@ -613,7 +613,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 	free_extent_map(em);
 
 	/* Split compressed extent */
-	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+	em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
 	if (IS_ERR(em)) {
 		test_err("got an error when we shouldn't have");
 		goto out;
@@ -648,7 +648,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 	offset = em->start + em->len;
 	free_extent_map(em);
 
-	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+	em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
 	if (IS_ERR(em)) {
 		test_err("got an error when we shouldn't have");
 		goto out;
@@ -675,7 +675,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 	offset = em->start + em->len;
 	free_extent_map(em);
 
-	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+	em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
 	if (IS_ERR(em)) {
 		test_err("got an error when we shouldn't have");
 		goto out;
@@ -710,7 +710,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 	free_extent_map(em);
 
 	/* A hole between regular extents but no hole extent */
-	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset + 6, sectorsize);
+	em = btrfs_get_extent(BTRFS_I(inode), NULL, offset + 6, sectorsize);
 	if (IS_ERR(em)) {
 		test_err("got an error when we shouldn't have");
 		goto out;
@@ -737,7 +737,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 	offset = em->start + em->len;
 	free_extent_map(em);
 
-	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M);
+	em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, SZ_4M);
 	if (IS_ERR(em)) {
 		test_err("got an error when we shouldn't have");
 		goto out;
@@ -770,7 +770,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 	offset = em->start + em->len;
 	free_extent_map(em);
 
-	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+	em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
 	if (IS_ERR(em)) {
 		test_err("got an error when we shouldn't have");
 		goto out;
@@ -850,7 +850,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
 	insert_inode_item_key(root);
 	insert_extent(root, sectorsize, sectorsize, sectorsize, 0, sectorsize,
 		      sectorsize, BTRFS_FILE_EXTENT_REG, 0, 1);
-	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, 2 * sectorsize);
+	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 2 * sectorsize);
 	if (IS_ERR(em)) {
 		test_err("got an error when we shouldn't have");
 		goto out;
@@ -872,7 +872,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
 	}
 	free_extent_map(em);
 
-	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize, 2 * sectorsize);
+	em = btrfs_get_extent(BTRFS_I(inode), NULL, sectorsize, 2 * sectorsize);
 	if (IS_ERR(em)) {
 		test_err("got an error when we shouldn't have");
 		goto out;

From f48705f473cea37efeeaa6a197ae12730c112863 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jonas=20Dre=C3=9Fler?= <verdre@v0yd.nl>
Date: Sun, 7 Jan 2024 19:02:47 +0100
Subject: [PATCH 087/707] Bluetooth: Remove HCI_POWER_OFF_TIMEOUT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With commit cf75ad8b41d2 ("Bluetooth: hci_sync: Convert MGMT_SET_POWERED"),
the power off sequence got refactored so that this timeout was no longer
necessary, let's remove the leftover define from the header too.

Fixes: cf75ad8b41d2 ("Bluetooth: hci_sync: Convert MGMT_SET_POWERED")
Signed-off-by: Jonas Dreßler <verdre@v0yd.nl>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index bdee5d649cc61d..f7918c7551834b 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -437,7 +437,6 @@ enum {
 #define HCI_NCMD_TIMEOUT	msecs_to_jiffies(4000)	/* 4 seconds */
 #define HCI_ACL_TX_TIMEOUT	msecs_to_jiffies(45000)	/* 45 seconds */
 #define HCI_AUTO_OFF_TIMEOUT	msecs_to_jiffies(2000)	/* 2 seconds */
-#define HCI_POWER_OFF_TIMEOUT	msecs_to_jiffies(5000)	/* 5 seconds */
 #define HCI_LE_CONN_TIMEOUT	msecs_to_jiffies(20000)	/* 20 seconds */
 #define HCI_LE_AUTOCONN_TIMEOUT	msecs_to_jiffies(4000)	/* 4 seconds */
 

From 2e7a6a997c9a9e5a7d15c09f22d0add5672c0906 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jonas=20Dre=C3=9Fler?= <verdre@v0yd.nl>
Date: Sun, 7 Jan 2024 19:02:48 +0100
Subject: [PATCH 088/707] Bluetooth: mgmt: Remove leftover queuing of power_off
 work
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Queuing of power_off work was introduced in these functions with commits
8b064a3ad377 ("Bluetooth: Clean up HCI state when doing power off") and
c9910d0fb4fc ("Bluetooth: Fix disconnecting connections in non-connected
states") in an effort to clean up state and do things like disconnecting
devices before actually powering off the device.

After that, commit a3172b7eb4a2 ("Bluetooth: Add timer to force power off")
introduced a timeout to ensure that the device actually got powered off,
even if some of the cleanup work would never complete.

This code later got refactored with commit cf75ad8b41d2 ("Bluetooth:
hci_sync: Convert MGMT_SET_POWERED"), which made powering off the device
synchronous and removed the need for initiating the power_off work from
other places. The timeout mentioned above got removed too, because we now
also made use of the command timeout during power on/off.

These days the power_off work still exists, but it only seems to only be
used for HCI_AUTO_OFF functionality, which is why we never noticed
those two leftover places where we queue power_off work. So let's remove
that code.

Fixes: cf75ad8b41d2 ("Bluetooth: hci_sync: Convert MGMT_SET_POWERED")
Signed-off-by: Jonas Dreßler <verdre@v0yd.nl>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/mgmt.c | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index bb72ff6eb22f4b..d1c55e409659f0 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -9764,14 +9764,6 @@ void mgmt_device_disconnected(struct hci_dev *hdev, bdaddr_t *bdaddr,
 	struct mgmt_ev_device_disconnected ev;
 	struct sock *sk = NULL;
 
-	/* The connection is still in hci_conn_hash so test for 1
-	 * instead of 0 to know if this is the last one.
-	 */
-	if (mgmt_powering_down(hdev) && hci_conn_count(hdev) == 1) {
-		cancel_delayed_work(&hdev->power_off);
-		queue_work(hdev->req_workqueue, &hdev->power_off.work);
-	}
-
 	if (!mgmt_connected)
 		return;
 
@@ -9828,14 +9820,6 @@ void mgmt_connect_failed(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type,
 {
 	struct mgmt_ev_connect_failed ev;
 
-	/* The connection is still in hci_conn_hash so test for 1
-	 * instead of 0 to know if this is the last one.
-	 */
-	if (mgmt_powering_down(hdev) && hci_conn_count(hdev) == 1) {
-		cancel_delayed_work(&hdev->power_off);
-		queue_work(hdev->req_workqueue, &hdev->power_off.work);
-	}
-
 	bacpy(&ev.addr.bdaddr, bdaddr);
 	ev.addr.type = link_to_bdaddr(link_type, addr_type);
 	ev.status = mgmt_status(status);

From 18035d4098a0129131f3f5dcd0d3e233fe647e10 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Mon, 8 Jan 2024 12:30:44 +1030
Subject: [PATCH 089/707] btrfs: remove unused variable bio_offset from
 end_bbio_data_read()

The variable @bio_offset was introduced in commit 7ffd27e378d2 ("btrfs:
pass bio_offset to check_data_csum() directly"), when we are still using
the same endio function for both data and metadata.

Later we had several changes to data and metadata endio functions:

- Data verification is handled by btrfs bio layer

- Split data and metadata endio paths

Now for data path we no longer do any verification in
end_bbio_data_read(), as the verification is handled by btrfs bio layer
already.

Thus there is no need for such bio_offset variable.

Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 0ea8c401e3a88a..3dc387edc164a9 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -596,11 +596,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
 	struct bio *bio = &bbio->bio;
 	struct processed_extent processed = { 0 };
 	struct folio_iter fi;
-	/*
-	 * The offset to the beginning of a bio, since one bio can never be
-	 * larger than UINT_MAX, u32 here is enough.
-	 */
-	u32 bio_offset = 0;
 
 	ASSERT(!bio_flagged(bio, BIO_CLONED));
 	bio_for_each_folio_all(fi, &bbio->bio) {
@@ -667,10 +662,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
 		end_page_read(folio_page(folio, 0), uptodate, start, len);
 		endio_readpage_release_extent(&processed, BTRFS_I(inode),
 					      start, end, uptodate);
-
-		ASSERT(bio_offset + len > bio_offset);
-		bio_offset += len;
-
 	}
 	/* Release the last extent */
 	endio_readpage_release_extent(&processed, NULL, 0, 0, false);

From 08236d11031b9001d2223d708094e5e4882a9ddc Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Fri, 5 Jan 2024 16:05:55 +1030
Subject: [PATCH 090/707] btrfs: cache folio size and shift in extent_buffer

After the conversion to folio interfaces (but without the patch to
enable larger folio allocation), there is a LTP report about observable
performance drop on metadata heavy operations.

This drop is caused by the extra code of calculating the
folio_size()/folio_shift(), instead of the old hard coded
PAGE_SIZE/PAGE_SHIFT.

To slightly reduce the overhead, just cache both folio_size and
folio_shift in extent_buffer.

The two new members (u32 folio_size and u8 folio_shift) is stored inside
the holes of extent_buffer. (folio_size is shared with len, which is
reduced to u32).

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/accessors.c | 12 ++++++------
 fs/btrfs/ctree.c     |  2 +-
 fs/btrfs/disk-io.c   |  2 +-
 fs/btrfs/extent_io.c | 38 +++++++++++++++++++++-----------------
 fs/btrfs/extent_io.h | 16 +++++++++++++---
 5 files changed, 42 insertions(+), 28 deletions(-)

diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c
index 1925a0919ca62f..6eb850ad37d2ae 100644
--- a/fs/btrfs/accessors.c
+++ b/fs/btrfs/accessors.c
@@ -63,8 +63,8 @@ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token,		\
 	const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \
 	const unsigned long oil = get_eb_offset_in_folio(token->eb,	\
 							 member_offset);\
-	const int unit_size = folio_size(token->eb->folios[0]);		\
-	const int unit_shift = folio_shift(token->eb->folios[0]);	\
+	const int unit_size = token->eb->folio_size;			\
+	const int unit_shift = token->eb->folio_shift;			\
 	const int size = sizeof(u##bits);				\
 	u8 lebytes[sizeof(u##bits)];					\
 	const int part = unit_size - oil;				\
@@ -94,7 +94,7 @@ u##bits btrfs_get_##bits(const struct extent_buffer *eb,		\
 	const unsigned long idx = get_eb_folio_index(eb, member_offset);\
 	const unsigned long oil = get_eb_offset_in_folio(eb,		\
 							 member_offset);\
-	const int unit_size = folio_size(eb->folios[0]);		\
+	const int unit_size = eb->folio_size;				\
 	char *kaddr = folio_address(eb->folios[idx]);			\
 	const int size = sizeof(u##bits);				\
 	const int part = unit_size - oil;				\
@@ -117,8 +117,8 @@ void btrfs_set_token_##bits(struct btrfs_map_token *token,		\
 	const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \
 	const unsigned long oil = get_eb_offset_in_folio(token->eb,	\
 							 member_offset);\
-	const int unit_size = folio_size(token->eb->folios[0]);		\
-	const int unit_shift = folio_shift(token->eb->folios[0]);	\
+	const int unit_size = token->eb->folio_size;			\
+	const int unit_shift = token->eb->folio_shift;			\
 	const int size = sizeof(u##bits);				\
 	u8 lebytes[sizeof(u##bits)];					\
 	const int part = unit_size - oil;				\
@@ -151,7 +151,7 @@ void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr,	\
 	const unsigned long idx = get_eb_folio_index(eb, member_offset);\
 	const unsigned long oil = get_eb_offset_in_folio(eb,		\
 							 member_offset);\
-	const int unit_size = folio_size(eb->folios[0]);		\
+	const int unit_size = eb->folio_size;				\
 	char *kaddr = folio_address(eb->folios[idx]);			\
 	const int size = sizeof(u##bits);				\
 	const int part = unit_size - oil;				\
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index e65e012bac5531..33145da449cc8d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -820,7 +820,7 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
 	}
 
 	while (low < high) {
-		const int unit_size = folio_size(eb->folios[0]);
+		const int unit_size = eb->folio_size;
 		unsigned long oil;
 		unsigned long offset;
 		struct btrfs_disk_key *tmp;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c6907d533fe839..57be7dd44da5db 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -193,7 +193,7 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
 		struct folio *folio = eb->folios[i];
 		u64 start = max_t(u64, eb->start, folio_pos(folio));
 		u64 end = min_t(u64, eb->start + eb->len,
-				folio_pos(folio) + folio_size(folio));
+				folio_pos(folio) + eb->folio_size);
 		u32 len = end - start;
 
 		ret = btrfs_repair_io_failure(fs_info, 0, start, len,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3dc387edc164a9..c8aabe3be16978 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -78,7 +78,7 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
 		eb = list_first_entry(&fs_info->allocated_ebs,
 				      struct extent_buffer, leak_list);
 		pr_err(
-	"BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n",
+	"BTRFS: buffer leak start %llu len %u refs %d bflags %lu owner %llu\n",
 		       eb->start, eb->len, atomic_read(&eb->refs), eb->bflags,
 		       btrfs_header_owner(eb));
 		list_del(&eb->leak_list);
@@ -729,6 +729,8 @@ static int alloc_eb_folio_array(struct extent_buffer *eb, gfp_t extra_gfp)
 
 	for (int i = 0; i < num_pages; i++)
 		eb->folios[i] = page_folio(page_array[i]);
+	eb->folio_size = PAGE_SIZE;
+	eb->folio_shift = PAGE_SHIFT;
 	return 0;
 }
 
@@ -1728,10 +1730,10 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
 			folio_lock(folio);
 			folio_clear_dirty_for_io(folio);
 			folio_start_writeback(folio);
-			ret = bio_add_folio(&bbio->bio, folio, folio_size(folio), 0);
+			ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0);
 			ASSERT(ret);
 			wbc_account_cgroup_owner(wbc, folio_page(folio, 0),
-						 folio_size(folio));
+						 eb->folio_size);
 			wbc->nr_to_write -= folio_nr_pages(folio);
 			folio_unlock(folio);
 		}
@@ -3523,7 +3525,7 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
 	/* For now, we should only have single-page folios for btree inode. */
 	ASSERT(folio_nr_pages(existing_folio) == 1);
 
-	if (folio_size(existing_folio) != folio_size(eb->folios[0])) {
+	if (folio_size(existing_folio) != eb->folio_size) {
 		folio_unlock(existing_folio);
 		folio_put(existing_folio);
 		return -EAGAIN;
@@ -3666,6 +3668,8 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 		 * and free the allocated page.
 		 */
 		folio = eb->folios[i];
+		eb->folio_size = folio_size(folio);
+		eb->folio_shift = folio_shift(folio);
 		spin_lock(&mapping->private_lock);
 		/* Should not fail, as we have preallocated the memory */
 		ret = attach_extent_buffer_folio(eb, folio, prealloc);
@@ -4115,7 +4119,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
 		for (int i = 0; i < num_folios; i++) {
 			struct folio *folio = eb->folios[i];
 
-			ret = bio_add_folio(&bbio->bio, folio, folio_size(folio), 0);
+			ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0);
 			ASSERT(ret);
 		}
 	}
@@ -4135,7 +4139,7 @@ static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
 			    unsigned long len)
 {
 	btrfs_warn(eb->fs_info,
-		"access to eb bytenr %llu len %lu out of range start %lu len %lu",
+		"access to eb bytenr %llu len %u out of range start %lu len %lu",
 		eb->start, eb->len, start, len);
 	WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
 
@@ -4164,7 +4168,7 @@ static inline int check_eb_range(const struct extent_buffer *eb,
 void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
 			unsigned long start, unsigned long len)
 {
-	const int unit_size = folio_size(eb->folios[0]);
+	const int unit_size = eb->folio_size;
 	size_t cur;
 	size_t offset;
 	char *dst = (char *)dstv;
@@ -4204,7 +4208,7 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
 				       void __user *dstv,
 				       unsigned long start, unsigned long len)
 {
-	const int unit_size = folio_size(eb->folios[0]);
+	const int unit_size = eb->folio_size;
 	size_t cur;
 	size_t offset;
 	char __user *dst = (char __user *)dstv;
@@ -4244,7 +4248,7 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
 int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
 			 unsigned long start, unsigned long len)
 {
-	const int unit_size = folio_size(eb->folios[0]);
+	const int unit_size = eb->folio_size;
 	size_t cur;
 	size_t offset;
 	char *kaddr;
@@ -4315,7 +4319,7 @@ static void __write_extent_buffer(const struct extent_buffer *eb,
 				  const void *srcv, unsigned long start,
 				  unsigned long len, bool use_memmove)
 {
-	const int unit_size = folio_size(eb->folios[0]);
+	const int unit_size = eb->folio_size;
 	size_t cur;
 	size_t offset;
 	char *kaddr;
@@ -4364,7 +4368,7 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
 static void memset_extent_buffer(const struct extent_buffer *eb, int c,
 				 unsigned long start, unsigned long len)
 {
-	const int unit_size = folio_size(eb->folios[0]);
+	const int unit_size = eb->folio_size;
 	unsigned long cur = start;
 
 	if (eb->addr) {
@@ -4395,7 +4399,7 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
 void copy_extent_buffer_full(const struct extent_buffer *dst,
 			     const struct extent_buffer *src)
 {
-	const int unit_size = folio_size(src->folios[0]);
+	const int unit_size = src->folio_size;
 	unsigned long cur = 0;
 
 	ASSERT(dst->len == src->len);
@@ -4417,7 +4421,7 @@ void copy_extent_buffer(const struct extent_buffer *dst,
 			unsigned long dst_offset, unsigned long src_offset,
 			unsigned long len)
 {
-	const int unit_size = folio_size(dst->folios[0]);
+	const int unit_size = dst->folio_size;
 	u64 dst_len = dst->len;
 	size_t cur;
 	size_t offset;
@@ -4473,10 +4477,10 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb,
 	 * the bitmap item in the extent buffer + the offset of the byte in the
 	 * bitmap item.
 	 */
-	offset = start + offset_in_folio(eb->folios[0], eb->start) + byte_offset;
+	offset = start + offset_in_eb_folio(eb, eb->start) + byte_offset;
 
-	*folio_index = offset >> folio_shift(eb->folios[0]);
-	*folio_offset = offset_in_folio(eb->folios[0], offset);
+	*folio_index = offset >> eb->folio_shift;
+	*folio_offset = offset_in_eb_folio(eb, offset);
 }
 
 /*
@@ -4590,7 +4594,7 @@ void memcpy_extent_buffer(const struct extent_buffer *dst,
 			  unsigned long dst_offset, unsigned long src_offset,
 			  unsigned long len)
 {
-	const int unit_size = folio_size(dst->folios[0]);
+	const int unit_size = dst->folio_size;
 	unsigned long cur_off = 0;
 
 	if (check_eb_range(dst, dst_offset, len) ||
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 46050500529bff..8e5639597800a7 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -8,6 +8,7 @@
 #include <linux/fiemap.h>
 #include <linux/btrfs_tree.h>
 #include "compression.h"
+#include "messages.h"
 #include "ulist.h"
 #include "misc.h"
 
@@ -75,7 +76,8 @@ void __cold extent_buffer_free_cachep(void);
 #define INLINE_EXTENT_BUFFER_PAGES     (BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE)
 struct extent_buffer {
 	u64 start;
-	unsigned long len;
+	u32 len;
+	u32 folio_size;
 	unsigned long bflags;
 	struct btrfs_fs_info *fs_info;
 
@@ -90,6 +92,7 @@ struct extent_buffer {
 	int read_mirror;
 	/* >= 0 if eb belongs to a log tree, -1 otherwise */
 	s8 log_index;
+	u8 folio_shift;
 	struct rcu_head rcu_head;
 
 	struct rw_semaphore lock;
@@ -113,6 +116,13 @@ struct btrfs_eb_write_context {
 	struct btrfs_block_group *zoned_bg;
 };
 
+static inline unsigned long offset_in_eb_folio(const struct extent_buffer *eb,
+					       u64 start)
+{
+	ASSERT(eb->folio_size);
+	return start & (eb->folio_size - 1);
+}
+
 /*
  * Get the correct offset inside the page of extent buffer.
  *
@@ -151,13 +161,13 @@ static inline unsigned long get_eb_folio_index(const struct extent_buffer *eb,
 	 *	   the folio_shift would be large enough to always make us
 	 *	   return 0 as index.
 	 *    1.2) Several page sized folios
-	 *         The folio_shift() would be PAGE_SHIFT, giving us the correct
+	 *         The folio_shift would be PAGE_SHIFT, giving us the correct
 	 *         index.
 	 *
 	 * 2) sectorsize < PAGE_SIZE and nodesize < PAGE_SIZE case
 	 *    The folio would only be page sized, and always give us 0 as index.
 	 */
-	return offset >> folio_shift(eb->folios[0]);
+	return offset >> eb->folio_shift;
 }
 
 /*

From 2b16c80d801185e748b7e3f7ff9e842bb9fb4267 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jonas=20Dre=C3=9Fler?= <verdre@v0yd.nl>
Date: Sun, 7 Jan 2024 19:02:49 +0100
Subject: [PATCH 091/707] Bluetooth: Add new state HCI_POWERING_DOWN
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a new state HCI_POWERING_DOWN that indicates that the device is
currently powering down, this will be useful for the next commit.

Signed-off-by: Jonas Dreßler <verdre@v0yd.nl>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci.h |  1 +
 net/bluetooth/hci_sync.c    | 16 +++++++++++-----
 net/bluetooth/mgmt.c        | 14 ++++++++++++++
 3 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index f7918c7551834b..a94a8491ec7a1a 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -372,6 +372,7 @@ enum {
 	HCI_SETUP,
 	HCI_CONFIG,
 	HCI_DEBUGFS_CREATED,
+	HCI_POWERING_DOWN,
 	HCI_AUTO_OFF,
 	HCI_RFKILLED,
 	HCI_MGMT,
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 5716345a26dfb7..b146562a65fc40 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -5403,27 +5403,33 @@ static int hci_power_off_sync(struct hci_dev *hdev)
 	if (!test_bit(HCI_UP, &hdev->flags))
 		return 0;
 
+	hci_dev_set_flag(hdev, HCI_POWERING_DOWN);
+
 	if (test_bit(HCI_ISCAN, &hdev->flags) ||
 	    test_bit(HCI_PSCAN, &hdev->flags)) {
 		err = hci_write_scan_enable_sync(hdev, 0x00);
 		if (err)
-			return err;
+			goto out;
 	}
 
 	err = hci_clear_adv_sync(hdev, NULL, false);
 	if (err)
-		return err;
+		goto out;
 
 	err = hci_stop_discovery_sync(hdev);
 	if (err)
-		return err;
+		goto out;
 
 	/* Terminated due to Power Off */
 	err = hci_disconnect_all_sync(hdev, HCI_ERROR_REMOTE_POWER_OFF);
 	if (err)
-		return err;
+		goto out;
+
+	err = hci_dev_close_sync(hdev);
 
-	return hci_dev_close_sync(hdev);
+out:
+	hci_dev_clear_flag(hdev, HCI_POWERING_DOWN);
+	return err;
 }
 
 int hci_set_powered_sync(struct hci_dev *hdev, u8 val)
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index d1c55e409659f0..cabc5466401754 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -1388,6 +1388,14 @@ static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data,
 
 	hci_dev_lock(hdev);
 
+	if (!cp->val) {
+		if (hci_dev_test_flag(hdev, HCI_POWERING_DOWN)) {
+			err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_POWERED,
+					      MGMT_STATUS_BUSY);
+			goto failed;
+		}
+	}
+
 	if (pending_find(MGMT_OP_SET_POWERED, hdev)) {
 		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_POWERED,
 				      MGMT_STATUS_BUSY);
@@ -9746,6 +9754,9 @@ bool mgmt_powering_down(struct hci_dev *hdev)
 	struct mgmt_pending_cmd *cmd;
 	struct mgmt_mode *cp;
 
+	if (hci_dev_test_flag(hdev, HCI_POWERING_DOWN))
+		return true;
+
 	cmd = pending_find(MGMT_OP_SET_POWERED, hdev);
 	if (!cmd)
 		return false;
@@ -10053,6 +10064,9 @@ void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status)
 		/* If this is a HCI command related to powering on the
 		 * HCI dev don't send any mgmt signals.
 		 */
+		if (hci_dev_test_flag(hdev, HCI_POWERING_DOWN))
+			return;
+
 		if (pending_find(MGMT_OP_SET_POWERED, hdev))
 			return;
 	}

From 088656165c2d9c9236b64ab45c51f509f22b1eee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jonas=20Dre=C3=9Fler?= <verdre@v0yd.nl>
Date: Sun, 7 Jan 2024 19:02:50 +0100
Subject: [PATCH 092/707] Bluetooth: Disconnect connected devices before
 rfkilling adapter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On a lot of platforms (at least the MS Surface devices, M1 macbooks, and
a few ThinkPads) firmware doesn't do its job when rfkilling a device
and the bluetooth adapter is not actually shut down properly on rfkill.
This leads to connected devices remaining in connected state and the
bluetooth connection eventually timing out after rfkilling an adapter.

Use the rfkill hook in the HCI driver to go through the full power-off
sequence (including stopping scans and disconnecting devices) before
rfkilling it, just like MGMT_OP_SET_POWERED would do.

In case anything during the larger power-off sequence fails, make sure
the device is still closed and the rfkill ends up being effective in
the end.

Signed-off-by: Jonas Dreßler <verdre@v0yd.nl>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/hci_core.c | 35 +++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 2821a42cefdc6e..e5cb618fa6d39c 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -940,20 +940,51 @@ int hci_get_dev_info(void __user *arg)
 
 /* ---- Interface to HCI drivers ---- */
 
+static int hci_dev_do_poweroff(struct hci_dev *hdev)
+{
+	int err;
+
+	BT_DBG("%s %p", hdev->name, hdev);
+
+	hci_req_sync_lock(hdev);
+
+	err = hci_set_powered_sync(hdev, false);
+
+	hci_req_sync_unlock(hdev);
+
+	return err;
+}
+
 static int hci_rfkill_set_block(void *data, bool blocked)
 {
 	struct hci_dev *hdev = data;
+	int err;
 
 	BT_DBG("%p name %s blocked %d", hdev, hdev->name, blocked);
 
 	if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL))
 		return -EBUSY;
 
+	if (blocked == hci_dev_test_flag(hdev, HCI_RFKILLED))
+		return 0;
+
 	if (blocked) {
 		hci_dev_set_flag(hdev, HCI_RFKILLED);
+
 		if (!hci_dev_test_flag(hdev, HCI_SETUP) &&
-		    !hci_dev_test_flag(hdev, HCI_CONFIG))
-			hci_dev_do_close(hdev);
+		    !hci_dev_test_flag(hdev, HCI_CONFIG)) {
+			err = hci_dev_do_poweroff(hdev);
+			if (err) {
+				bt_dev_err(hdev, "Error when powering off device on rfkill (%d)",
+					   err);
+
+				/* Make sure the device is still closed even if
+				 * anything during power off sequence (eg.
+				 * disconnecting devices) failed.
+				 */
+				hci_dev_do_close(hdev);
+			}
+		}
 	} else {
 		hci_dev_clear_flag(hdev, HCI_RFKILLED);
 	}

From de4b77527d36ac19ef88e518e4ae7148cc8cc210 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Thu, 4 Jan 2024 11:48:46 -0800
Subject: [PATCH 093/707] btrfs: don't abort filesystem when attempting to
 snapshot deleted subvolume

If the source file descriptor to the snapshot ioctl refers to a deleted
subvolume, we get the following abort:

  BTRFS: Transaction aborted (error -2)
  WARNING: CPU: 0 PID: 833 at fs/btrfs/transaction.c:1875 create_pending_snapshot+0x1040/0x1190 [btrfs]
  Modules linked in: pata_acpi btrfs ata_piix libata scsi_mod virtio_net blake2b_generic xor net_failover virtio_rng failover scsi_common rng_core raid6_pq libcrc32c
  CPU: 0 PID: 833 Comm: t_snapshot_dele Not tainted 6.7.0-rc6 #2
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-1.fc39 04/01/2014
  RIP: 0010:create_pending_snapshot+0x1040/0x1190 [btrfs]
  RSP: 0018:ffffa09c01337af8 EFLAGS: 00010282
  RAX: 0000000000000000 RBX: ffff9982053e7c78 RCX: 0000000000000027
  RDX: ffff99827dc20848 RSI: 0000000000000001 RDI: ffff99827dc20840
  RBP: ffffa09c01337c00 R08: 0000000000000000 R09: ffffa09c01337998
  R10: 0000000000000003 R11: ffffffffb96da248 R12: fffffffffffffffe
  R13: ffff99820535bb28 R14: ffff99820b7bd000 R15: ffff99820381ea80
  FS:  00007fe20aadabc0(0000) GS:ffff99827dc00000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 0000559a120b502f CR3: 00000000055b6000 CR4: 00000000000006f0
  Call Trace:
   <TASK>
   ? create_pending_snapshot+0x1040/0x1190 [btrfs]
   ? __warn+0x81/0x130
   ? create_pending_snapshot+0x1040/0x1190 [btrfs]
   ? report_bug+0x171/0x1a0
   ? handle_bug+0x3a/0x70
   ? exc_invalid_op+0x17/0x70
   ? asm_exc_invalid_op+0x1a/0x20
   ? create_pending_snapshot+0x1040/0x1190 [btrfs]
   ? create_pending_snapshot+0x1040/0x1190 [btrfs]
   create_pending_snapshots+0x92/0xc0 [btrfs]
   btrfs_commit_transaction+0x66b/0xf40 [btrfs]
   btrfs_mksubvol+0x301/0x4d0 [btrfs]
   btrfs_mksnapshot+0x80/0xb0 [btrfs]
   __btrfs_ioctl_snap_create+0x1c2/0x1d0 [btrfs]
   btrfs_ioctl_snap_create_v2+0xc4/0x150 [btrfs]
   btrfs_ioctl+0x8a6/0x2650 [btrfs]
   ? kmem_cache_free+0x22/0x340
   ? do_sys_openat2+0x97/0xe0
   __x64_sys_ioctl+0x97/0xd0
   do_syscall_64+0x46/0xf0
   entry_SYSCALL_64_after_hwframe+0x6e/0x76
  RIP: 0033:0x7fe20abe83af
  RSP: 002b:00007ffe6eff1360 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
  RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007fe20abe83af
  RDX: 00007ffe6eff23c0 RSI: 0000000050009417 RDI: 0000000000000003
  RBP: 0000000000000003 R08: 0000000000000000 R09: 00007fe20ad16cd0
  R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
  R13: 00007ffe6eff13c0 R14: 00007fe20ad45000 R15: 0000559a120b6d58
   </TASK>
  ---[ end trace 0000000000000000 ]---
  BTRFS: error (device vdc: state A) in create_pending_snapshot:1875: errno=-2 No such entry
  BTRFS info (device vdc: state EA): forced readonly
  BTRFS warning (device vdc: state EA): Skipping commit of aborted transaction.
  BTRFS: error (device vdc: state EA) in cleanup_transaction:2055: errno=-2 No such entry

This happens because create_pending_snapshot() initializes the new root
item as a copy of the source root item. This includes the refs field,
which is 0 for a deleted subvolume. The call to btrfs_insert_root()
therefore inserts a root with refs == 0. btrfs_get_new_fs_root() then
finds the root and returns -ENOENT if refs == 0, which causes
create_pending_snapshot() to abort.

Fix it by checking the source root's refs before attempting the
snapshot, but after locking subvol_sem to avoid racing with deletion.

CC: stable@vger.kernel.org # 4.14+
Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ioctl.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a1743904202b78..0af214c8bef4d2 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -790,6 +790,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
 		return -EOPNOTSUPP;
 	}
 
+	if (btrfs_root_refs(&root->root_item) == 0)
+		return -ENOENT;
+
 	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
 		return -EINVAL;
 

From 92dd093f23d224e2317e47376ea2796c79fa969f Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Thu, 4 Jan 2024 11:48:47 -0800
Subject: [PATCH 094/707] btrfs: avoid copying BTRFS_ROOT_SUBVOL_DEAD flag to
 snapshot of subvolume being deleted

Sweet Tea spotted a race between subvolume deletion and snapshotting
that can result in the root item for the snapshot having the
BTRFS_ROOT_SUBVOL_DEAD flag set. The race is:

Thread 1                                      | Thread 2
----------------------------------------------|----------
btrfs_delete_subvolume                        |
  btrfs_set_root_flags(BTRFS_ROOT_SUBVOL_DEAD)|
                                              |btrfs_mksubvol
                                              |  down_read(subvol_sem)
                                              |  create_snapshot
                                              |    ...
                                              |    create_pending_snapshot
                                              |      copy root item from source
  down_write(subvol_sem)                      |

This flag is only checked in send and swap activate, which this would
cause to fail mysteriously.

create_snapshot() now checks the root refs to reject a deleted
subvolume, so we can fix this by locking subvol_sem earlier so that the
BTRFS_ROOT_SUBVOL_DEAD flag and the root refs are updated atomically.

CC: stable@vger.kernel.org # 4.14+
Reported-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Reviewed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1002ebde14da20..e285ddbcdee04f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4458,6 +4458,8 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
 	u64 root_flags;
 	int ret;
 
+	down_write(&fs_info->subvol_sem);
+
 	/*
 	 * Don't allow to delete a subvolume with send in progress. This is
 	 * inside the inode lock so the error handling that has to drop the bit
@@ -4469,25 +4471,25 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
 		btrfs_warn(fs_info,
 			   "attempt to delete subvolume %llu during send",
 			   dest->root_key.objectid);
-		return -EPERM;
+		ret = -EPERM;
+		goto out_up_write;
 	}
 	if (atomic_read(&dest->nr_swapfiles)) {
 		spin_unlock(&dest->root_item_lock);
 		btrfs_warn(fs_info,
 			   "attempt to delete subvolume %llu with active swapfile",
 			   root->root_key.objectid);
-		return -EPERM;
+		ret = -EPERM;
+		goto out_up_write;
 	}
 	root_flags = btrfs_root_flags(&dest->root_item);
 	btrfs_set_root_flags(&dest->root_item,
 			     root_flags | BTRFS_ROOT_SUBVOL_DEAD);
 	spin_unlock(&dest->root_item_lock);
 
-	down_write(&fs_info->subvol_sem);
-
 	ret = may_destroy_subvol(dest);
 	if (ret)
-		goto out_up_write;
+		goto out_undead;
 
 	btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
 	/*
@@ -4497,7 +4499,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
 	 */
 	ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
 	if (ret)
-		goto out_up_write;
+		goto out_undead;
 
 	trans = btrfs_start_transaction(root, 0);
 	if (IS_ERR(trans)) {
@@ -4563,15 +4565,17 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
 	inode->i_flags |= S_DEAD;
 out_release:
 	btrfs_subvolume_release_metadata(root, &block_rsv);
-out_up_write:
-	up_write(&fs_info->subvol_sem);
+out_undead:
 	if (ret) {
 		spin_lock(&dest->root_item_lock);
 		root_flags = btrfs_root_flags(&dest->root_item);
 		btrfs_set_root_flags(&dest->root_item,
 				root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
 		spin_unlock(&dest->root_item_lock);
-	} else {
+	}
+out_up_write:
+	up_write(&fs_info->subvol_sem);
+	if (!ret) {
 		d_invalidate(dentry);
 		btrfs_prune_dentries(dest);
 		ASSERT(dest->send_in_progress == 0);

From 7974b2128489d062c9d21419633eebde07f07032 Mon Sep 17 00:00:00 2001
From: Zijun Hu <quic_zijuhu@quicinc.com>
Date: Tue, 9 Jan 2024 19:03:23 +0800
Subject: [PATCH 095/707] Bluetooth: hci_event: Fix wrongly recorded wakeup
 BD_ADDR

hci_store_wake_reason() wrongly parses event HCI_Connection_Request
as HCI_Connection_Complete and HCI_Connection_Complete as
HCI_Connection_Request, so causes recording wakeup BD_ADDR error and
potential stability issue, fix it by using the correct field.

Fixes: 2f20216c1d6f ("Bluetooth: Emit controller suspend and resume events")
Signed-off-by: Zijun Hu <quic_zijuhu@quicinc.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/hci_event.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index ef8c3bed73617e..22b22c264c2a5e 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -7420,10 +7420,10 @@ static void hci_store_wake_reason(struct hci_dev *hdev, u8 event,
 	 * keep track of the bdaddr of the connection event that woke us up.
 	 */
 	if (event == HCI_EV_CONN_REQUEST) {
-		bacpy(&hdev->wake_addr, &conn_complete->bdaddr);
+		bacpy(&hdev->wake_addr, &conn_request->bdaddr);
 		hdev->wake_addr_type = BDADDR_BREDR;
 	} else if (event == HCI_EV_CONN_COMPLETE) {
-		bacpy(&hdev->wake_addr, &conn_request->bdaddr);
+		bacpy(&hdev->wake_addr, &conn_complete->bdaddr);
 		hdev->wake_addr_type = BDADDR_BREDR;
 	} else if (event == HCI_EV_LE_META) {
 		struct hci_ev_le_meta *le_ev = (void *)skb->data;

From a7ee39bea31519849985453b9acbab06894ee0d9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jonas=20Dre=C3=9Fler?= <verdre@v0yd.nl>
Date: Mon, 8 Jan 2024 23:46:06 +0100
Subject: [PATCH 096/707] Bluetooth: Remove superfluous call to
 hci_conn_check_pending()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The "pending connections" feature was originally introduced with commit
4c67bc74f016 ("[Bluetooth] Support concurrent connect requests") and
6bd57416127e ("[Bluetooth] Handling pending connect attempts after
inquiry") to handle controllers supporting only a single connection request
at a time. Later things were extended to also cancel ongoing inquiries on
connect() with commit 89e65975fea5 ("Bluetooth: Cancel Inquiry before
Create Connection").

With commit a9de9248064b ("[Bluetooth] Switch from OGF+OCF to using only
opcodes"), hci_conn_check_pending() was introduced as a helper to
consolidate a few places where we check for pending connections (indicated
by the BT_CONNECT2 flag) and then try to connect.

This refactoring commit also snuck in two more calls to
hci_conn_check_pending():

- One is in the failure callback of hci_cs_inquiry(), this one probably
makes sense: If we send an "HCI Inquiry" command and then immediately
after a "Create Connection" command, the "Create Connection" command might
fail before the "HCI Inquiry" command, and then we want to retry the
"Create Connection" on failure of the "HCI Inquiry".

- The other added call to hci_conn_check_pending() is in the event handler
for the "Remote Name" event, this seems unrelated and is possibly a
copy-paste error, so remove that one.

Fixes: a9de9248064b ("[Bluetooth] Switch from OGF+OCF to using only opcodes")
Signed-off-by: Jonas Dreßler <verdre@v0yd.nl>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/hci_event.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 22b22c264c2a5e..23e0e63ac312be 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -3556,8 +3556,6 @@ static void hci_remote_name_evt(struct hci_dev *hdev, void *data,
 
 	bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
 
-	hci_conn_check_pending(hdev);
-
 	hci_dev_lock(hdev);
 
 	conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);

From f8c47ee39e6dc6170da06865b84e8c8b08e87ab0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jonas=20Dre=C3=9Fler?= <verdre@v0yd.nl>
Date: Mon, 8 Jan 2024 23:46:07 +0100
Subject: [PATCH 097/707] Bluetooth: hci_event: Use HCI error defines instead
 of magic values
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We have error defines already, so let's use them.

Signed-off-by: Jonas Dreßler <verdre@v0yd.nl>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci.h | 2 ++
 net/bluetooth/hci_event.c   | 8 ++++----
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index a94a8491ec7a1a..1cd212bb378916 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -653,6 +653,7 @@ enum {
 #define HCI_ERROR_PIN_OR_KEY_MISSING	0x06
 #define HCI_ERROR_MEMORY_EXCEEDED	0x07
 #define HCI_ERROR_CONNECTION_TIMEOUT	0x08
+#define HCI_ERROR_COMMAND_DISALLOWED	0x0c
 #define HCI_ERROR_REJ_LIMITED_RESOURCES	0x0d
 #define HCI_ERROR_REJ_BAD_ADDR		0x0f
 #define HCI_ERROR_INVALID_PARAMETERS	0x12
@@ -661,6 +662,7 @@ enum {
 #define HCI_ERROR_REMOTE_POWER_OFF	0x15
 #define HCI_ERROR_LOCAL_HOST_TERM	0x16
 #define HCI_ERROR_PAIRING_NOT_ALLOWED	0x18
+#define HCI_ERROR_UNSUPPORTED_REMOTE_FEATURE	0x1e
 #define HCI_ERROR_INVALID_LL_PARAMS	0x1e
 #define HCI_ERROR_UNSPECIFIED		0x1f
 #define HCI_ERROR_ADVERTISING_TIMEOUT	0x3c
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 23e0e63ac312be..6130c969f361a7 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -95,11 +95,11 @@ static u8 hci_cc_inquiry_cancel(struct hci_dev *hdev, void *data,
 	/* It is possible that we receive Inquiry Complete event right
 	 * before we receive Inquiry Cancel Command Complete event, in
 	 * which case the latter event should have status of Command
-	 * Disallowed (0x0c). This should not be treated as error, since
+	 * Disallowed. This should not be treated as error, since
 	 * we actually achieve what Inquiry Cancel wants to achieve,
 	 * which is to end the last Inquiry session.
 	 */
-	if (rp->status == 0x0c && !test_bit(HCI_INQUIRY, &hdev->flags)) {
+	if (rp->status == HCI_ERROR_COMMAND_DISALLOWED && !test_bit(HCI_INQUIRY, &hdev->flags)) {
 		bt_dev_warn(hdev, "Ignoring error of Inquiry Cancel command");
 		rp->status = 0x00;
 	}
@@ -2342,7 +2342,7 @@ static void hci_cs_create_conn(struct hci_dev *hdev, __u8 status)
 
 	if (status) {
 		if (conn && conn->state == BT_CONNECT) {
-			if (status != 0x0c || conn->attempt > 2) {
+			if (status != HCI_ERROR_COMMAND_DISALLOWED || conn->attempt > 2) {
 				conn->state = BT_CLOSED;
 				hci_connect_cfm(conn, status);
 				hci_conn_del(conn);
@@ -6679,7 +6679,7 @@ static void hci_le_remote_feat_complete_evt(struct hci_dev *hdev, void *data,
 			 * transition into connected state and mark it as
 			 * successful.
 			 */
-			if (!conn->out && ev->status == 0x1a &&
+			if (!conn->out && ev->status == HCI_ERROR_UNSUPPORTED_REMOTE_FEATURE &&
 			    (hdev->le_features[0] & HCI_LE_PERIPHERAL_FEATURES))
 				status = 0x00;
 			else

From 711c35949648ba19f54bce27b49ced0ad90b19b9 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Tue, 9 Jan 2024 13:45:40 -0500
Subject: [PATCH 098/707] Bluetooth: hci_core: Cancel request on command
 timeout

If command has timed out call __hci_cmd_sync_cancel to notify the
hci_req since it will inevitably cause a timeout.

This also rework the code around __hci_cmd_sync_cancel since it was
wrongly assuming it needs to cancel timer as well, but sometimes the
timers have not been started or in fact they already had timed out in
which case they don't need to be cancel yet again.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_sync.h |  2 +-
 net/bluetooth/hci_core.c         | 84 ++++++++++++++++++++++----------
 net/bluetooth/hci_request.c      |  2 +-
 net/bluetooth/hci_sync.c         | 20 ++++----
 net/bluetooth/mgmt.c             |  2 +-
 5 files changed, 71 insertions(+), 39 deletions(-)

diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h
index 6efbc2152146bd..e2582c24254498 100644
--- a/include/net/bluetooth/hci_sync.h
+++ b/include/net/bluetooth/hci_sync.h
@@ -42,7 +42,7 @@ int __hci_cmd_sync_status_sk(struct hci_dev *hdev, u16 opcode, u32 plen,
 void hci_cmd_sync_init(struct hci_dev *hdev);
 void hci_cmd_sync_clear(struct hci_dev *hdev);
 void hci_cmd_sync_cancel(struct hci_dev *hdev, int err);
-void __hci_cmd_sync_cancel(struct hci_dev *hdev, int err);
+void hci_cmd_sync_cancel_sync(struct hci_dev *hdev, int err);
 
 int hci_cmd_sync_submit(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
 			void *data, hci_cmd_sync_work_destroy_t destroy);
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index e5cb618fa6d39c..de730d210ccb45 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -1523,10 +1523,11 @@ static void hci_cmd_timeout(struct work_struct *work)
 					    cmd_timer.work);
 
 	if (hdev->sent_cmd) {
-		struct hci_command_hdr *sent = (void *) hdev->sent_cmd->data;
-		u16 opcode = __le16_to_cpu(sent->opcode);
+		u16 opcode = hci_skb_opcode(hdev->sent_cmd);
 
 		bt_dev_err(hdev, "command 0x%4.4x tx timeout", opcode);
+
+		hci_cmd_sync_cancel_sync(hdev, ETIMEDOUT);
 	} else {
 		bt_dev_err(hdev, "command tx timeout");
 	}
@@ -2857,6 +2858,23 @@ int hci_unregister_suspend_notifier(struct hci_dev *hdev)
 	return ret;
 }
 
+/* Cancel ongoing command synchronously:
+ *
+ * - Cancel command timer
+ * - Reset command counter
+ * - Cancel command request
+ */
+static void hci_cancel_cmd_sync(struct hci_dev *hdev, int err)
+{
+	bt_dev_dbg(hdev, "err 0x%2.2x", err);
+
+	cancel_delayed_work_sync(&hdev->cmd_timer);
+	cancel_delayed_work_sync(&hdev->ncmd_timer);
+	atomic_set(&hdev->cmd_cnt, 1);
+
+	hci_cmd_sync_cancel_sync(hdev, -err);
+}
+
 /* Suspend HCI device */
 int hci_suspend_dev(struct hci_dev *hdev)
 {
@@ -2874,7 +2892,7 @@ int hci_suspend_dev(struct hci_dev *hdev)
 		return 0;
 
 	/* Cancel potentially blocking sync operation before suspend */
-	__hci_cmd_sync_cancel(hdev, -EHOSTDOWN);
+	hci_cancel_cmd_sync(hdev, -EHOSTDOWN);
 
 	hci_req_sync_lock(hdev);
 	ret = hci_suspend_sync(hdev);
@@ -4159,6 +4177,33 @@ static void hci_rx_work(struct work_struct *work)
 	}
 }
 
+static void hci_send_cmd_sync(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	int err;
+
+	bt_dev_dbg(hdev, "skb %p", skb);
+
+	kfree_skb(hdev->sent_cmd);
+
+	hdev->sent_cmd = skb_clone(skb, GFP_KERNEL);
+	if (!hdev->sent_cmd) {
+		skb_queue_head(&hdev->cmd_q, skb);
+		queue_work(hdev->workqueue, &hdev->cmd_work);
+		return;
+	}
+
+	err = hci_send_frame(hdev, skb);
+	if (err < 0) {
+		hci_cmd_sync_cancel_sync(hdev, err);
+		return;
+	}
+
+	if (hci_req_status_pend(hdev))
+		hci_dev_set_flag(hdev, HCI_CMD_PENDING);
+
+	atomic_dec(&hdev->cmd_cnt);
+}
+
 static void hci_cmd_work(struct work_struct *work)
 {
 	struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_work);
@@ -4173,30 +4218,15 @@ static void hci_cmd_work(struct work_struct *work)
 		if (!skb)
 			return;
 
-		kfree_skb(hdev->sent_cmd);
-
-		hdev->sent_cmd = skb_clone(skb, GFP_KERNEL);
-		if (hdev->sent_cmd) {
-			int res;
-			if (hci_req_status_pend(hdev))
-				hci_dev_set_flag(hdev, HCI_CMD_PENDING);
-			atomic_dec(&hdev->cmd_cnt);
+		hci_send_cmd_sync(hdev, skb);
 
-			res = hci_send_frame(hdev, skb);
-			if (res < 0)
-				__hci_cmd_sync_cancel(hdev, -res);
-
-			rcu_read_lock();
-			if (test_bit(HCI_RESET, &hdev->flags) ||
-			    hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE))
-				cancel_delayed_work(&hdev->cmd_timer);
-			else
-				queue_delayed_work(hdev->workqueue, &hdev->cmd_timer,
-						   HCI_CMD_TIMEOUT);
-			rcu_read_unlock();
-		} else {
-			skb_queue_head(&hdev->cmd_q, skb);
-			queue_work(hdev->workqueue, &hdev->cmd_work);
-		}
+		rcu_read_lock();
+		if (test_bit(HCI_RESET, &hdev->flags) ||
+		    hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE))
+			cancel_delayed_work(&hdev->cmd_timer);
+		else
+			queue_delayed_work(hdev->workqueue, &hdev->cmd_timer,
+					   HCI_CMD_TIMEOUT);
+		rcu_read_unlock();
 	}
 }
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index 6e023b0104b039..00e02138003ece 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -895,7 +895,7 @@ void hci_request_setup(struct hci_dev *hdev)
 
 void hci_request_cancel_all(struct hci_dev *hdev)
 {
-	__hci_cmd_sync_cancel(hdev, ENODEV);
+	hci_cmd_sync_cancel_sync(hdev, ENODEV);
 
 	cancel_interleave_scan(hdev);
 }
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index b146562a65fc40..1122296ce3fa3f 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -584,7 +584,7 @@ void hci_cmd_sync_clear(struct hci_dev *hdev)
 	mutex_unlock(&hdev->cmd_sync_work_lock);
 }
 
-void __hci_cmd_sync_cancel(struct hci_dev *hdev, int err)
+void hci_cmd_sync_cancel(struct hci_dev *hdev, int err)
 {
 	bt_dev_dbg(hdev, "err 0x%2.2x", err);
 
@@ -592,15 +592,17 @@ void __hci_cmd_sync_cancel(struct hci_dev *hdev, int err)
 		hdev->req_result = err;
 		hdev->req_status = HCI_REQ_CANCELED;
 
-		cancel_delayed_work_sync(&hdev->cmd_timer);
-		cancel_delayed_work_sync(&hdev->ncmd_timer);
-		atomic_set(&hdev->cmd_cnt, 1);
-
-		wake_up_interruptible(&hdev->req_wait_q);
+		queue_work(hdev->workqueue, &hdev->cmd_sync_cancel_work);
 	}
 }
+EXPORT_SYMBOL(hci_cmd_sync_cancel);
 
-void hci_cmd_sync_cancel(struct hci_dev *hdev, int err)
+/* Cancel ongoing command request synchronously:
+ *
+ * - Set result and mark status to HCI_REQ_CANCELED
+ * - Wakeup command sync thread
+ */
+void hci_cmd_sync_cancel_sync(struct hci_dev *hdev, int err)
 {
 	bt_dev_dbg(hdev, "err 0x%2.2x", err);
 
@@ -608,10 +610,10 @@ void hci_cmd_sync_cancel(struct hci_dev *hdev, int err)
 		hdev->req_result = err;
 		hdev->req_status = HCI_REQ_CANCELED;
 
-		queue_work(hdev->workqueue, &hdev->cmd_sync_cancel_work);
+		wake_up_interruptible(&hdev->req_wait_q);
 	}
 }
-EXPORT_SYMBOL(hci_cmd_sync_cancel);
+EXPORT_SYMBOL(hci_cmd_sync_cancel_sync);
 
 /* Submit HCI command to be run in as cmd_sync_work:
  *
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index cabc5466401754..173986f3405f7a 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -1415,7 +1415,7 @@ static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data,
 
 	/* Cancel potentially blocking sync operation before power off */
 	if (cp->val == 0x00) {
-		__hci_cmd_sync_cancel(hdev, -EHOSTDOWN);
+		hci_cmd_sync_cancel_sync(hdev, -EHOSTDOWN);
 		err = hci_cmd_sync_queue(hdev, set_powered_sync, cmd,
 					 mgmt_set_powered_complete);
 	} else {

From eaab5c300eab901be0040aa03bf22b20cc6ce0a5 Mon Sep 17 00:00:00 2001
From: Ulrik Strid <ulrik@strid.tech>
Date: Sat, 13 Jan 2024 15:27:38 +0800
Subject: [PATCH 099/707] Bluetooth: btusb: Add new VID/PID 13d3/3602 for
 MT7925

Add VID 13d3 & PID 3602 for MediaTek MT7925 USB Bluetooth chip.

The information in /sys/kernel/debug/usb/devices about the Bluetooth
device is listed as the below.

T:  Bus=07 Lev=01 Prnt=01 Port=10 Cnt=02 Dev#=  2 Spd=480  MxCh= 0
D:  Ver= 2.10 Cls=ef(misc ) Sub=02 Prot=01 MxPS=64 #Cfgs=  1
P:  Vendor=13d3 ProdID=3602 Rev= 1.00
S:  Manufacturer=MediaTek Inc.
S:  Product=Wireless_Device
S:  SerialNumber=000000000
C:* #Ifs= 3 Cfg#= 1 Atr=e0 MxPwr=100mA
A:  FirstIf#= 0 IfCount= 3 Cls=e0(wlcon) Sub=01 Prot=01
I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=81(I) Atr=03(Int.) MxPS=  16 Ivl=125us
E:  Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
E:  Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms
I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=   0 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=   0 Ivl=1ms
I:  If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=   9 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=   9 Ivl=1ms
I:  If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  17 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  17 Ivl=1ms
I:  If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  25 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  25 Ivl=1ms
I:  If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  33 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  33 Ivl=1ms
I:  If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  49 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  49 Ivl=1ms
I:  If#= 1 Alt= 6 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  63 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  63 Ivl=1ms
I:* If#= 2 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=(none)
E:  Ad=8a(I) Atr=03(Int.) MxPS=  64 Ivl=125us
E:  Ad=0a(O) Atr=03(Int.) MxPS=  64 Ivl=125us
I:  If#= 2 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=(none)
E:  Ad=8a(I) Atr=03(Int.) MxPS= 512 Ivl=125us
E:  Ad=0a(O) Atr=03(Int.) MxPS= 512 Ivl=125us

Signed-off-by: Ulrik Strid <ulrik@strid.tech>
Signed-off-by: Deren Wu <deren.wu@mediatek.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 drivers/bluetooth/btusb.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index a0a317bac0954f..c497d87ddf7b8b 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -658,6 +658,11 @@ static const struct usb_device_id quirks_table[] = {
 						     BTUSB_WIDEBAND_SPEECH |
 						     BTUSB_VALID_LE_STATES },
 
+	/* Additional MediaTek MT7925 Bluetooth devices */
+	{ USB_DEVICE(0x13d3, 0x3602), .driver_info = BTUSB_MEDIATEK |
+						     BTUSB_WIDEBAND_SPEECH |
+						     BTUSB_VALID_LE_STATES },
+
 	/* Additional Realtek 8723AE Bluetooth devices */
 	{ USB_DEVICE(0x0930, 0x021d), .driver_info = BTUSB_REALTEK },
 	{ USB_DEVICE(0x13d3, 0x3394), .driver_info = BTUSB_REALTEK },

From 6061d66bd0e5ac1eca2858356cd8d7c12f415176 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Mon, 15 Jan 2024 21:12:19 +0100
Subject: [PATCH 100/707] Bluetooth: Remove usage of the deprecated
 ida_simple_xx() API

ida_alloc() and ida_free() should be preferred to the deprecated
ida_simple_get() and ida_simple_remove().

Note that the upper limit of ida_simple_get() is exclusive, but the one of
ida_alloc_max() is inclusive. So a -1 has been added when needed.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/hci_core.c | 9 +++++----
 net/bluetooth/hci_sock.c | 4 ++--
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index de730d210ccb45..34c8dca2069f6b 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -2640,10 +2640,11 @@ int hci_register_dev(struct hci_dev *hdev)
 	 */
 	switch (hdev->dev_type) {
 	case HCI_PRIMARY:
-		id = ida_simple_get(&hci_index_ida, 0, HCI_MAX_ID, GFP_KERNEL);
+		id = ida_alloc_max(&hci_index_ida, HCI_MAX_ID - 1, GFP_KERNEL);
 		break;
 	case HCI_AMP:
-		id = ida_simple_get(&hci_index_ida, 1, HCI_MAX_ID, GFP_KERNEL);
+		id = ida_alloc_range(&hci_index_ida, 1, HCI_MAX_ID - 1,
+				     GFP_KERNEL);
 		break;
 	default:
 		return -EINVAL;
@@ -2742,7 +2743,7 @@ int hci_register_dev(struct hci_dev *hdev)
 	destroy_workqueue(hdev->workqueue);
 	destroy_workqueue(hdev->req_workqueue);
 err:
-	ida_simple_remove(&hci_index_ida, hdev->id);
+	ida_free(&hci_index_ida, hdev->id);
 
 	return error;
 }
@@ -2825,7 +2826,7 @@ void hci_release_dev(struct hci_dev *hdev)
 	hci_dev_unlock(hdev);
 
 	ida_destroy(&hdev->unset_handle_ida);
-	ida_simple_remove(&hci_index_ida, hdev->id);
+	ida_free(&hci_index_ida, hdev->id);
 	kfree_skb(hdev->sent_cmd);
 	kfree_skb(hdev->recv_event);
 	kfree(hdev);
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 3e7cd330d731ac..4ee1b976678b25 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -101,7 +101,7 @@ static bool hci_sock_gen_cookie(struct sock *sk)
 	int id = hci_pi(sk)->cookie;
 
 	if (!id) {
-		id = ida_simple_get(&sock_cookie_ida, 1, 0, GFP_KERNEL);
+		id = ida_alloc_min(&sock_cookie_ida, 1, GFP_KERNEL);
 		if (id < 0)
 			id = 0xffffffff;
 
@@ -119,7 +119,7 @@ static void hci_sock_free_cookie(struct sock *sk)
 
 	if (id) {
 		hci_pi(sk)->cookie = 0xffffffff;
-		ida_simple_remove(&sock_cookie_ida, id);
+		ida_free(&sock_cookie_ida, id);
 	}
 }
 

From 7f29d67809293992a721edeab3903ad498e0e9cd Mon Sep 17 00:00:00 2001
From: Gabriel Krisman Bertazi <krisman@suse.de>
Date: Fri, 22 Dec 2023 22:37:03 -0500
Subject: [PATCH 101/707] ovl: Reject mounting case-insensitive filesystems

overlayfs relies on the filesystem setting DCACHE_OP_HASH or
DCACHE_OP_COMPARE to reject mounting over case-insensitive directories.

Since commit bb9cd9106b22 ("fscrypt: Have filesystems handle their
d_ops"), we set ->d_op through a hook in ->d_lookup, which
means the root dentry won't have them, causing the mount to accidentally
succeed.

In v6.7-rc7, the following sequence will succeed to mount, but any
dentry other than the root dentry will be a "weird" dentry to ovl and
fail with EREMOTE.

  mkfs.ext4 -O casefold lower.img
  mount -O loop lower.img lower
  mount -t overlay -o lowerdir=lower,upperdir=upper,workdir=work ovl /mnt

Mounting on a subdirectory fails, as expected, because DCACHE_OP_HASH
and DCACHE_OP_COMPARE are properly set by ->lookup.

Fix by explicitly rejecting superblocks that allow case-insensitive
dentries.

While there, re-sort the entries to have more descriptive error messages
first.

Fixes: bb9cd9106b22 ("fscrypt: Have filesystems handle their d_ops")
Signed-off-by: Gabriel Krisman Bertazi <krisman@suse.de>
Acked-by: Amir Goldstein <amir73il@gmail.com>
---
 fs/overlayfs/params.c | 13 ++++++++++---
 include/linux/fs.h    |  9 +++++++++
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
index 112b4b12f8252a..a3edd9e1bdc464 100644
--- a/fs/overlayfs/params.c
+++ b/fs/overlayfs/params.c
@@ -280,12 +280,19 @@ static int ovl_mount_dir_check(struct fs_context *fc, const struct path *path,
 {
 	struct ovl_fs_context *ctx = fc->fs_private;
 
-	if (ovl_dentry_weird(path->dentry))
-		return invalfc(fc, "filesystem on %s not supported", name);
-
 	if (!d_is_dir(path->dentry))
 		return invalfc(fc, "%s is not a directory", name);
 
+	/*
+	 * Root dentries of case-insensitive filesystems might not have
+	 * the dentry operations set, but still be incompatible with
+	 * overlayfs.  Check explicitly to prevent post-mount failures.
+	 */
+	if (sb_has_encoding(path->mnt->mnt_sb))
+		return invalfc(fc, "case-insensitive filesystem on %s not supported", name);
+
+	if (ovl_dentry_weird(path->dentry))
+		return invalfc(fc, "filesystem on %s not supported", name);
 
 	/*
 	 * Check whether upper path is read-only here to report failures
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e6ba0cc6f2eeea..a0eb8b5759a6fe 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3282,6 +3282,15 @@ extern int generic_check_addressable(unsigned, u64);
 
 extern void generic_set_encrypted_ci_d_ops(struct dentry *dentry);
 
+static inline bool sb_has_encoding(const struct super_block *sb)
+{
+#if IS_ENABLED(CONFIG_UNICODE)
+	return !!sb->s_encoding;
+#else
+	return false;
+#endif
+}
+
 int may_setattr(struct mnt_idmap *idmap, struct inode *inode,
 		unsigned int ia_valid);
 int setattr_prepare(struct mnt_idmap *, struct dentry *, struct iattr *);

From dd7adce79e7748a946f917ce15106a89132daa10 Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Wed, 17 Jan 2024 13:44:05 +0200
Subject: [PATCH 102/707] hwmon: put HWMON_CHANNEL_INFO() initializers in
 rodata

HWMON_CHANNEL_INFO() is supposed to be used as initializer for arrays of
const struct hwmon_channel_info *. However, without explicit const,
HWMON_CHANNEL_INFO() creates mutable compound literals, and the const
pointers point at the mutable data. Add const to place the data in
rodata.

Cc: Jean Delvare <jdelvare@suse.com>
Cc: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Link: https://lore.kernel.org/r/20240117114405.1506775-1-jani.nikula@intel.com
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 include/linux/hwmon.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h
index 8cd6a6b335930d..c2c0da18dfa369 100644
--- a/include/linux/hwmon.h
+++ b/include/linux/hwmon.h
@@ -425,12 +425,12 @@ struct hwmon_channel_info {
 	const u32 *config;
 };
 
-#define HWMON_CHANNEL_INFO(stype, ...)	\
-	(&(struct hwmon_channel_info) {	\
-		.type = hwmon_##stype,	\
-		.config = (u32 []) {	\
-			__VA_ARGS__, 0	\
-		}			\
+#define HWMON_CHANNEL_INFO(stype, ...)		\
+	(&(const struct hwmon_channel_info) {	\
+		.type = hwmon_##stype,		\
+		.config = (const u32 []) {	\
+			__VA_ARGS__, 0		\
+		}				\
 	})
 
 /**

From 5cade5212484e9d552a0859b9d27d661a69c79f3 Mon Sep 17 00:00:00 2001
From: Forest Crossman <cyrozap@gmail.com>
Date: Sat, 13 Jan 2024 16:27:55 -0800
Subject: [PATCH 103/707] hwmon: (nct6683) Add another customer ID for MSI

This value was found on an MSI PRO X670-P WIFI with an NCT6687D chip.

Signed-off-by: Forest Crossman <cyrozap@gmail.com>
Link: https://lore.kernel.org/r/CAO3ALPwot01+bBisj7Roog7SD9UwV+y4NtiakKrBDE0tPvFhbw@mail.gmail.com
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 Documentation/hwmon/nct6683.rst | 1 +
 drivers/hwmon/nct6683.c         | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/Documentation/hwmon/nct6683.rst b/Documentation/hwmon/nct6683.rst
index 3e7f6ee779c2f9..2a7a78eb1b4688 100644
--- a/Documentation/hwmon/nct6683.rst
+++ b/Documentation/hwmon/nct6683.rst
@@ -64,4 +64,5 @@ Intel DB85FL	NCT6683D EC firmware version 1.0 build 04/03/13
 ASRock X570	NCT6683D EC firmware version 1.0 build 06/28/19
 ASRock X670E	NCT6686D EC firmware version 1.0 build 05/19/22
 MSI B550	NCT6687D EC firmware version 1.0 build 05/07/20
+MSI X670-P	NCT6687D EC firmware version 0.0 build 09/27/22
 =============== ===============================================
diff --git a/drivers/hwmon/nct6683.c b/drivers/hwmon/nct6683.c
index 3f3f7a88413e01..0d016fedb9c2eb 100644
--- a/drivers/hwmon/nct6683.c
+++ b/drivers/hwmon/nct6683.c
@@ -174,6 +174,7 @@ superio_exit(int ioreg)
 #define NCT6683_CUSTOMER_ID_MITAC	0xa0e
 #define NCT6683_CUSTOMER_ID_MSI		0x201
 #define NCT6683_CUSTOMER_ID_MSI2	0x200
+#define NCT6683_CUSTOMER_ID_MSI3	0x207
 #define NCT6683_CUSTOMER_ID_ASROCK		0xe2c
 #define NCT6683_CUSTOMER_ID_ASROCK2	0xe1b
 #define NCT6683_CUSTOMER_ID_ASROCK3	0x1631
@@ -1224,6 +1225,8 @@ static int nct6683_probe(struct platform_device *pdev)
 		break;
 	case NCT6683_CUSTOMER_ID_MSI2:
 		break;
+	case NCT6683_CUSTOMER_ID_MSI3:
+		break;
 	case NCT6683_CUSTOMER_ID_ASROCK:
 		break;
 	case NCT6683_CUSTOMER_ID_ASROCK2:

From a53faa6bfa3bd670281e59d005482d24c98c4eb9 Mon Sep 17 00:00:00 2001
From: Michal Simek <michal.simek@amd.com>
Date: Wed, 20 Dec 2023 14:12:13 +0100
Subject: [PATCH 104/707] dt-bindings: hwmon: ina2xx: Add label property

Add a label property to allow a custom name to be used for identifying
a device on the board. This is useful when multiple devices are present on
the same board. Similar change was done by commit ffae65fb1ae4
("dt-bindings: spi: spi-cadence: Add label property").

Signed-off-by: Michal Simek <michal.simek@amd.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/6f3c57d08984c1978569d3918cb38eb295c0c67d.1703077926.git.michal.simek@amd.com
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml b/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml
index 378d1f6aeeb3f5..8e5c1935b5f4a7 100644
--- a/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml
+++ b/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml
@@ -32,6 +32,9 @@ properties:
   reg:
     maxItems: 1
 
+  label:
+    description: A descriptive name for this device.
+
   shunt-resistor:
     description:
       Shunt resistor value in micro-Ohm.
@@ -77,6 +80,7 @@ examples:
         power-sensor@44 {
             compatible = "ti,ina220";
             reg = <0x44>;
+            label = "vdd_3v0";
             shunt-resistor = <1000>;
             vs-supply = <&vdd_3v0>;
         };

From bd975f3fa28a0573781f4244c2b314ebfd7c1015 Mon Sep 17 00:00:00 2001
From: Michal Simek <michal.simek@amd.com>
Date: Wed, 20 Dec 2023 14:12:14 +0100
Subject: [PATCH 105/707] dt-bindings: hwmon: ina2xx: Describe
 #io-channel-cells property

There are two drivers in the Linux kernel. One is hwmon based and second
IIO. IIO version requires to define #io-channel-cells to operate.

Signed-off-by: Michal Simek <michal.simek@amd.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/aa303b9fe3116e7f98d6b72822f7f57694366db3.1703077926.git.michal.simek@amd.com
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml b/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml
index 8e5c1935b5f4a7..f324b627bf9c22 100644
--- a/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml
+++ b/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml
@@ -32,6 +32,9 @@ properties:
   reg:
     maxItems: 1
 
+  "#io-channel-cells":
+    const: 1
+
   label:
     description: A descriptive name for this device.
 
@@ -80,6 +83,7 @@ examples:
         power-sensor@44 {
             compatible = "ti,ina220";
             reg = <0x44>;
+            #io-channel-cells = <1>;
             label = "vdd_3v0";
             shunt-resistor = <1000>;
             vs-supply = <&vdd_3v0>;

From cc85a2f966361054d32dd79a432b2fb6b54b3db8 Mon Sep 17 00:00:00 2001
From: Michal Simek <michal.simek@amd.com>
Date: Mon, 8 Jan 2024 15:30:51 +0100
Subject: [PATCH 106/707] dt-bindings: hwmon: ina2xx: Describe ina260 chip

Describe ina260 chip which is precision digital current and power monitor
with precision integrated shunt resistor.

Signed-off-by: Michal Simek <michal.simek@amd.com>
Acked-by: Conor Dooley <conor.dooley@microchip.com>
Link: https://lore.kernel.org/r/4c82dc4d412e91d1601c1da5bca1cdf1a91cd9b8.1704724242.git.michal.simek@amd.com
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml b/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml
index f324b627bf9c22..a099bb71415e01 100644
--- a/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml
+++ b/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml
@@ -28,6 +28,7 @@ properties:
       - ti,ina231
       - ti,ina237
       - ti,ina238
+      - ti,ina260
 
   reg:
     maxItems: 1

From 02db4b99fa850241a168c5238cf499109d263667 Mon Sep 17 00:00:00 2001
From: Li Zhijian <lizhijian@fujitsu.com>
Date: Mon, 22 Jan 2024 13:39:41 +0800
Subject: [PATCH 107/707] firewire: Kill unnecessary buf check in
 device_attribute.show

Per Documentation/filesystems/sysfs.rst:
> sysfs allocates a buffer of size (PAGE_SIZE) and passes it to the
> method.

So we can kill the unnecessary buf check safely.

Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
Link: https://lore.kernel.org/r/20240122053942.80648-1-lizhijian@fujitsu.com
Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
---
 drivers/firewire/core-device.c | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c
index 0547253d16fe5d..47d6cb3dc916d2 100644
--- a/drivers/firewire/core-device.c
+++ b/drivers/firewire/core-device.c
@@ -323,7 +323,7 @@ static ssize_t show_immediate(struct device *dev,
 	if (value < 0)
 		return -ENOENT;
 
-	return snprintf(buf, buf ? PAGE_SIZE : 0, "0x%06x\n", value);
+	return snprintf(buf, PAGE_SIZE, "0x%06x\n", value);
 }
 
 #define IMMEDIATE_ATTR(name, key)				\
@@ -335,8 +335,6 @@ static ssize_t show_text_leaf(struct device *dev,
 	struct config_rom_attribute *attr =
 		container_of(dattr, struct config_rom_attribute, attr);
 	const u32 *directories[] = {NULL, NULL};
-	size_t bufsize;
-	char dummy_buf[2];
 	int i, ret = -ENOENT;
 
 	down_read(&fw_device_rwsem);
@@ -358,15 +356,9 @@ static ssize_t show_text_leaf(struct device *dev,
 		}
 	}
 
-	if (buf) {
-		bufsize = PAGE_SIZE - 1;
-	} else {
-		buf = dummy_buf;
-		bufsize = 1;
-	}
-
 	for (i = 0; i < ARRAY_SIZE(directories) && !!directories[i]; ++i) {
-		int result = fw_csr_string(directories[i], attr->key, buf, bufsize);
+		int result = fw_csr_string(directories[i], attr->key, buf,
+					   PAGE_SIZE - 1);
 		// Detected.
 		if (result >= 0)
 			ret = result;

From dd754748f1bef240c38f987cabd70366b7e91474 Mon Sep 17 00:00:00 2001
From: Li Zhijian <lizhijian@fujitsu.com>
Date: Mon, 22 Jan 2024 13:39:42 +0800
Subject: [PATCH 108/707] firewire: Convert snprintf/sprintf to sysfs_emit

Per filesystems/sysfs.rst, show() should only use sysfs_emit()
or sysfs_emit_at() when formatting the value to be returned to user space.

coccinelle complains that there are still a couple of functions that use
snprintf(). Convert them to sysfs_emit().

> drivers/firewire/core-device.c:326:8-16: WARNING: please use sysfs_emit or sysfs_emit_at

No functional change intended

Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
Link: https://lore.kernel.org/r/20240122053942.80648-2-lizhijian@fujitsu.com
Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
---
 drivers/firewire/core-device.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c
index 47d6cb3dc916d2..790985479ff3b8 100644
--- a/drivers/firewire/core-device.c
+++ b/drivers/firewire/core-device.c
@@ -323,7 +323,7 @@ static ssize_t show_immediate(struct device *dev,
 	if (value < 0)
 		return -ENOENT;
 
-	return snprintf(buf, PAGE_SIZE, "0x%06x\n", value);
+	return sysfs_emit(buf, "0x%06x\n", value);
 }
 
 #define IMMEDIATE_ATTR(name, key)				\
@@ -474,7 +474,7 @@ static ssize_t is_local_show(struct device *dev,
 {
 	struct fw_device *device = fw_device(dev);
 
-	return sprintf(buf, "%u\n", device->is_local);
+	return sysfs_emit(buf, "%u\n", device->is_local);
 }
 
 static int units_sprintf(char *buf, const u32 *directory)

From 213b755e42e2e7127777f74d2174bb4843a9b03a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20J=C3=BCcker?= <martin.juecker@gmail.com>
Date: Fri, 22 Dec 2023 00:02:58 +0100
Subject: [PATCH 109/707] ARM: defconfig: enable STMicroelectronics
 accelerometer and gyro for Exynos
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Enable STMicroelectronics accelerometer and gyro drivers for the Samsung
P4note device family in exynos and multi_v7 defconfigs.

Signed-off-by: Martin Jücker <martin.juecker@gmail.com>
Link: https://lore.kernel.org/r/20231221230258.56272-2-martin.juecker@gmail.com
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
---
 arch/arm/configs/exynos_defconfig   | 3 +++
 arch/arm/configs/multi_v7_defconfig | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/arch/arm/configs/exynos_defconfig b/arch/arm/configs/exynos_defconfig
index c98d5ff8a1ed08..7ad48fdda1dac6 100644
--- a/arch/arm/configs/exynos_defconfig
+++ b/arch/arm/configs/exynos_defconfig
@@ -318,8 +318,11 @@ CONFIG_EXTCON_MAX77693=y
 CONFIG_EXTCON_MAX8997=y
 CONFIG_EXYNOS5422_DMC=y
 CONFIG_IIO=y
+CONFIG_IIO_ST_ACCEL_3AXIS=m
+# CONFIG_IIO_ST_ACCEL_SPI_3AXIS is not set
 CONFIG_EXYNOS_ADC=y
 CONFIG_STMPE_ADC=y
+CONFIG_IIO_ST_GYRO_3AXIS=m
 CONFIG_CM36651=y
 CONFIG_AK8975=y
 CONFIG_SENSORS_ISL29018=y
diff --git a/arch/arm/configs/multi_v7_defconfig b/arch/arm/configs/multi_v7_defconfig
index ecb3e286107a4c..0d885cb6120679 100644
--- a/arch/arm/configs/multi_v7_defconfig
+++ b/arch/arm/configs/multi_v7_defconfig
@@ -1150,6 +1150,8 @@ CONFIG_STM32_FMC2_EBI=y
 CONFIG_EXYNOS5422_DMC=m
 CONFIG_IIO=y
 CONFIG_IIO_SW_TRIGGER=y
+CONFIG_IIO_ST_ACCEL_3AXIS=m
+# CONFIG_IIO_ST_ACCEL_SPI_3AXIS is not set
 CONFIG_ASPEED_ADC=m
 CONFIG_AT91_ADC=m
 CONFIG_AT91_SAMA5D2_ADC=m
@@ -1169,6 +1171,7 @@ CONFIG_IIO_CROS_EC_SENSORS_CORE=m
 CONFIG_IIO_CROS_EC_SENSORS=m
 CONFIG_STM32_DAC=m
 CONFIG_MPU3050_I2C=y
+CONFIG_IIO_ST_GYRO_3AXIS=m
 CONFIG_CM36651=m
 CONFIG_IIO_CROS_EC_LIGHT_PROX=m
 CONFIG_SENSORS_ISL29018=y

From 062bfa2943207a63315c812ebccadc4830e88d93 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Sun, 7 Jan 2024 22:19:47 +0900
Subject: [PATCH 110/707] kconfig: remove unneeded buffer allocation in
 zconf_initscan()

In Kconfig, there is a stack to save the lexer state for each inclusion
level.

Currently, it operates as an empty stack, with the 'current_buf' always
pointing to an empty buffer. There is no need to preallocate the buffer.
Change it to a full stack.

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
 scripts/kconfig/lexer.l | 29 +++++++++++++----------------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/scripts/kconfig/lexer.l b/scripts/kconfig/lexer.l
index cc386e44368346..d75423ec4eae16 100644
--- a/scripts/kconfig/lexer.l
+++ b/scripts/kconfig/lexer.l
@@ -391,9 +391,6 @@ void zconf_initscan(const char *name)
 		exit(1);
 	}
 
-	current_buf = xmalloc(sizeof(*current_buf));
-	memset(current_buf, 0, sizeof(*current_buf));
-
 	current_file = file_lookup(name);
 	yylineno = 1;
 }
@@ -403,9 +400,10 @@ void zconf_nextfile(const char *name)
 	struct file *iter;
 	struct file *file = file_lookup(name);
 	struct buffer *buf = xmalloc(sizeof(*buf));
-	memset(buf, 0, sizeof(*buf));
 
-	current_buf->state = YY_CURRENT_BUFFER;
+	buf->state = YY_CURRENT_BUFFER;
+	buf->parent = current_buf;
+	current_buf = buf;
 	yyin = zconf_fopen(file->name);
 	if (!yyin) {
 		fprintf(stderr, "%s:%d: can't open file \"%s\"\n",
@@ -413,8 +411,6 @@ void zconf_nextfile(const char *name)
 		exit(1);
 	}
 	yy_switch_to_buffer(yy_create_buffer(yyin, YY_BUF_SIZE));
-	buf->parent = current_buf;
-	current_buf = buf;
 
 	current_file->lineno = yylineno;
 	file->parent = current_file;
@@ -441,20 +437,21 @@ void zconf_nextfile(const char *name)
 
 static void zconf_endfile(void)
 {
-	struct buffer *parent;
+	struct buffer *tmp;
 
 	current_file = current_file->parent;
 	if (current_file)
 		yylineno = current_file->lineno;
 
-	parent = current_buf->parent;
-	if (parent) {
-		fclose(yyin);
-		yy_delete_buffer(YY_CURRENT_BUFFER);
-		yy_switch_to_buffer(parent->state);
-	}
-	free(current_buf);
-	current_buf = parent;
+	if (!current_buf)
+		return;
+
+	fclose(yyin);
+	yy_delete_buffer(YY_CURRENT_BUFFER);
+	yy_switch_to_buffer(current_buf->state);
+	tmp = current_buf;
+	current_buf = current_buf->parent;
+	free(tmp);
 }
 
 int zconf_lineno(void)

From de085fa995076869340362f30828db9c683111b4 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Sun, 7 Jan 2024 22:19:48 +0900
Subject: [PATCH 111/707] kconfig: fix line number in recursive inclusion
 detection

The error message shows a wrong line number if the 'source' directive
is wrapped to the following line.

[Test Code]

  source \
  "Kconfig"

This results in the following error message:

  Recursive inclusion detected.
  Inclusion path:
    current file : Kconfig
    included from: Kconfig:2

The correct message should be as follows:

  Recursive inclusion detected.
  Inclusion path:
    current file : Kconfig
    included from: Kconfig:1

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
 scripts/kconfig/lexer.l | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/scripts/kconfig/lexer.l b/scripts/kconfig/lexer.l
index d75423ec4eae16..f93b535a080c00 100644
--- a/scripts/kconfig/lexer.l
+++ b/scripts/kconfig/lexer.l
@@ -33,6 +33,7 @@ static int text_size, text_asize;
 struct buffer {
 	struct buffer *parent;
 	YY_BUFFER_STATE state;
+	int yylineno;
 };
 
 static struct buffer *current_buf;
@@ -402,6 +403,7 @@ void zconf_nextfile(const char *name)
 	struct buffer *buf = xmalloc(sizeof(*buf));
 
 	buf->state = YY_CURRENT_BUFFER;
+	buf->yylineno = yylineno;
 	buf->parent = current_buf;
 	current_buf = buf;
 	yyin = zconf_fopen(file->name);
@@ -412,7 +414,7 @@ void zconf_nextfile(const char *name)
 	}
 	yy_switch_to_buffer(yy_create_buffer(yyin, YY_BUF_SIZE));
 
-	current_file->lineno = yylineno;
+	current_file->lineno = zconf_lineno();
 	file->parent = current_file;
 
 	for (iter = current_file; iter; iter = iter->parent) {
@@ -425,7 +427,7 @@ void zconf_nextfile(const char *name)
 			do {
 				iter = iter->parent;
 				fprintf(stderr, "  included from: %s:%d\n",
-					iter->name, iter->lineno - 1);
+					iter->name, iter->lineno);
 			} while (strcmp(iter->name, file->name));
 			exit(1);
 		}
@@ -440,8 +442,6 @@ static void zconf_endfile(void)
 	struct buffer *tmp;
 
 	current_file = current_file->parent;
-	if (current_file)
-		yylineno = current_file->lineno;
 
 	if (!current_buf)
 		return;
@@ -449,6 +449,7 @@ static void zconf_endfile(void)
 	fclose(yyin);
 	yy_delete_buffer(YY_CURRENT_BUFFER);
 	yy_switch_to_buffer(current_buf->state);
+	yylineno = current_buf->yylineno;
 	tmp = current_buf;
 	current_buf = current_buf->parent;
 	free(tmp);

From 7c7f8529f43eccf2e8c47d15fa65a9868f443348 Mon Sep 17 00:00:00 2001
From: Vegard Nossum <vegard.nossum@oracle.com>
Date: Fri, 12 Jan 2024 13:58:30 +0100
Subject: [PATCH 112/707] docs: kbuild/kconfig: reformat/cleanup

This document was using headings in an odd way, causing the sidebar to be
quite messy. I've adding new headings and turned some of the old headings
into description lists.

The indentation was a mix of spaces and tabs; I've turned them all into
4 spaces so it always reads correctly regardless of tab settings.

Also use ``...`` instead of `...`; the difference is that `` is meant
for "inline literals" (and renders in a monospace font) while ` is for
"interpreted text" (and renders with italics).

Also changed the title of the document to be more descriptive.

Signed-off-by: Vegard Nossum <vegard.nossum@oracle.com>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
 Documentation/kbuild/kconfig.rst | 363 ++++++++++++++-----------------
 1 file changed, 166 insertions(+), 197 deletions(-)

diff --git a/Documentation/kbuild/kconfig.rst b/Documentation/kbuild/kconfig.rst
index c946eb44bd138e..fc4e845bc249ba 100644
--- a/Documentation/kbuild/kconfig.rst
+++ b/Documentation/kbuild/kconfig.rst
@@ -1,10 +1,10 @@
-===================
-Kconfig make config
-===================
+=================================
+Configuration targets and editors
+=================================
 
-This file contains some assistance for using `make *config`.
+This file contains some assistance for using ``make *config``.
 
-Use "make help" to list all of the possible configuration targets.
+Use ``make help`` to list all of the possible configuration targets.
 
 The xconfig ('qconf'), menuconfig ('mconf'), and nconfig ('nconf')
 programs also have embedded help text.  Be sure to check that for
@@ -12,8 +12,9 @@ navigation, search, and other general help text.
 
 The gconfig ('gconf') program has limited help text.
 
+
 General
--------
+=======
 
 New kernel releases often introduce new config symbols.  Often more
 important, new kernel releases may rename config symbols.  When
@@ -24,118 +25,102 @@ symbols have been introduced.
 
 To see a list of new config symbols, use::
 
-	cp user/some/old.config .config
-	make listnewconfig
+    cp user/some/old.config .config
+    make listnewconfig
 
 and the config program will list any new symbols, one per line.
 
 Alternatively, you can use the brute force method::
 
-	make oldconfig
-	scripts/diffconfig .config.old .config | less
-
-----------------------------------------------------------------------
-
-Environment variables for `*config`
+    make oldconfig
+    scripts/diffconfig .config.old .config | less
 
-KCONFIG_CONFIG
---------------
-This environment variable can be used to specify a default kernel config
-file name to override the default name of ".config".
 
-KCONFIG_DEFCONFIG_LIST
-----------------------
+Environment variables
+=====================
 
-This environment variable specifies a list of config files which can be used
-as a base configuration in case the .config does not exist yet. Entries in
-the list are separated with whitespaces to each other, and the first one
-that exists is used.
+Environment variables for ``*config``:
 
-KCONFIG_OVERWRITECONFIG
------------------------
-If you set KCONFIG_OVERWRITECONFIG in the environment, Kconfig will not
-break symlinks when .config is a symlink to somewhere else.
+``KCONFIG_CONFIG``
+    This environment variable can be used to specify a default kernel config
+    file name to override the default name of ".config".
 
-KCONFIG_WARN_UNKNOWN_SYMBOLS
-----------------------------
-This environment variable makes Kconfig warn about all unrecognized
-symbols in the config input.
+``KCONFIG_DEFCONFIG_LIST``
+    This environment variable specifies a list of config files which can be
+    used as a base configuration in case the .config does not exist yet.
+    Entries in the list are separated with whitespaces to each other, and
+    the first one that exists is used.
 
-KCONFIG_WERROR
---------------
-If set, Kconfig treats warnings as errors.
+``KCONFIG_OVERWRITECONFIG``
+    If you set KCONFIG_OVERWRITECONFIG in the environment, Kconfig will not
+    break symlinks when .config is a symlink to somewhere else.
 
-`CONFIG_`
----------
-If you set `CONFIG_` in the environment, Kconfig will prefix all symbols
-with its value when saving the configuration, instead of using the default,
-`CONFIG_`.
+``KCONFIG_WARN_UNKNOWN_SYMBOLS``
+    This environment variable makes Kconfig warn about all unrecognized
+    symbols in the config input.
 
-----------------------------------------------------------------------
+``KCONFIG_WERROR``
+    If set, Kconfig treats warnings as errors.
 
-Environment variables for '{allyes/allmod/allno/rand}config'
+``CONFIG_``
+    If you set ``CONFIG_`` in the environment, Kconfig will prefix all symbols
+    with its value when saving the configuration, instead of using the
+    default, ``CONFIG_``.
 
-KCONFIG_ALLCONFIG
------------------
-(partially based on lkml email from/by Rob Landley, re: miniconfig)
+Environment variables for ``{allyes/allmod/allno/rand}config``:
 
---------------------------------------------------
+``KCONFIG_ALLCONFIG``
+    The allyesconfig/allmodconfig/allnoconfig/randconfig variants can also
+    use the environment variable KCONFIG_ALLCONFIG as a flag or a filename
+    that contains config symbols that the user requires to be set to a
+    specific value.  If KCONFIG_ALLCONFIG is used without a filename where
+    KCONFIG_ALLCONFIG == "" or KCONFIG_ALLCONFIG == "1", ``make *config``
+    checks for a file named "all{yes/mod/no/def/random}.config"
+    (corresponding to the ``*config`` command that was used) for symbol values
+    that are to be forced.  If this file is not found, it checks for a
+    file named "all.config" to contain forced values.
 
-The allyesconfig/allmodconfig/allnoconfig/randconfig variants can also
-use the environment variable KCONFIG_ALLCONFIG as a flag or a filename
-that contains config symbols that the user requires to be set to a
-specific value.  If KCONFIG_ALLCONFIG is used without a filename where
-KCONFIG_ALLCONFIG == "" or KCONFIG_ALLCONFIG == "1", `make *config`
-checks for a file named "all{yes/mod/no/def/random}.config"
-(corresponding to the `*config` command that was used) for symbol values
-that are to be forced.  If this file is not found, it checks for a
-file named "all.config" to contain forced values.
+    This enables you to create "miniature" config (miniconfig) or custom
+    config files containing just the config symbols that you are interested
+    in.  Then the kernel config system generates the full .config file,
+    including symbols of your miniconfig file.
 
-This enables you to create "miniature" config (miniconfig) or custom
-config files containing just the config symbols that you are interested
-in.  Then the kernel config system generates the full .config file,
-including symbols of your miniconfig file.
-
-This 'KCONFIG_ALLCONFIG' file is a config file which contains
-(usually a subset of all) preset config symbols.  These variable
-settings are still subject to normal dependency checks.
-
-Examples::
+    This ``KCONFIG_ALLCONFIG`` file is a config file which contains
+    (usually a subset of all) preset config symbols.  These variable
+    settings are still subject to normal dependency checks.
 
-	KCONFIG_ALLCONFIG=custom-notebook.config make allnoconfig
+    Examples::
 
-or::
+        KCONFIG_ALLCONFIG=custom-notebook.config make allnoconfig
 
-	KCONFIG_ALLCONFIG=mini.config make allnoconfig
+    or::
 
-or::
+        KCONFIG_ALLCONFIG=mini.config make allnoconfig
 
-	make KCONFIG_ALLCONFIG=mini.config allnoconfig
+    or::
 
-These examples will disable most options (allnoconfig) but enable or
-disable the options that are explicitly listed in the specified
-mini-config files.
+        make KCONFIG_ALLCONFIG=mini.config allnoconfig
 
-----------------------------------------------------------------------
+    These examples will disable most options (allnoconfig) but enable or
+    disable the options that are explicitly listed in the specified
+    mini-config files.
 
-Environment variables for 'randconfig'
+Environment variables for ``randconfig``:
 
-KCONFIG_SEED
-------------
-You can set this to the integer value used to seed the RNG, if you want
-to somehow debug the behaviour of the kconfig parser/frontends.
-If not set, the current time will be used.
+``KCONFIG_SEED``
+    You can set this to the integer value used to seed the RNG, if you want
+    to somehow debug the behaviour of the kconfig parser/frontends.
+    If not set, the current time will be used.
 
-KCONFIG_PROBABILITY
--------------------
-This variable can be used to skew the probabilities. This variable can
-be unset or empty, or set to three different formats:
+``KCONFIG_PROBABILITY``
+    This variable can be used to skew the probabilities. This variable can
+    be unset or empty, or set to three different formats:
 
     =======================     ==================  =====================
-	KCONFIG_PROBABILITY     y:n split           y:m:n split
+    KCONFIG_PROBABILITY         y:n split           y:m:n split
     =======================     ==================  =====================
-	unset or empty          50  : 50            33  : 33  : 34
-	N                        N  : 100-N         N/2 : N/2 : 100-N
+    unset or empty              50  : 50            33  : 33  : 34
+    N                            N  : 100-N         N/2 : N/2 : 100-N
     [1] N:M                     N+M : 100-(N+M)      N  :  M  : 100-(N+M)
     [2] N:M:L                    N  : 100-N          M  :  L  : 100-(M+L)
     =======================     ==================  =====================
@@ -149,112 +134,98 @@ that:
 
 Examples::
 
-	KCONFIG_PROBABILITY=10
-		10% of booleans will be set to 'y', 90% to 'n'
-		5% of tristates will be set to 'y', 5% to 'm', 90% to 'n'
-	KCONFIG_PROBABILITY=15:25
-		40% of booleans will be set to 'y', 60% to 'n'
-		15% of tristates will be set to 'y', 25% to 'm', 60% to 'n'
-	KCONFIG_PROBABILITY=10:15:15
-		10% of booleans will be set to 'y', 90% to 'n'
-		15% of tristates will be set to 'y', 15% to 'm', 70% to 'n'
+    KCONFIG_PROBABILITY=10
+        10% of booleans will be set to 'y', 90% to 'n'
+        5% of tristates will be set to 'y', 5% to 'm', 90% to 'n'
+    KCONFIG_PROBABILITY=15:25
+        40% of booleans will be set to 'y', 60% to 'n'
+        15% of tristates will be set to 'y', 25% to 'm', 60% to 'n'
+    KCONFIG_PROBABILITY=10:15:15
+        10% of booleans will be set to 'y', 90% to 'n'
+        15% of tristates will be set to 'y', 15% to 'm', 70% to 'n'
 
-----------------------------------------------------------------------
+Environment variables for ``syncconfig``:
 
-Environment variables for 'syncconfig'
+``KCONFIG_NOSILENTUPDATE``
+    If this variable has a non-blank value, it prevents silent kernel
+    config updates (requires explicit updates).
 
-KCONFIG_NOSILENTUPDATE
-----------------------
-If this variable has a non-blank value, it prevents silent kernel
-config updates (requires explicit updates).
+``KCONFIG_AUTOCONFIG``
+    This environment variable can be set to specify the path & name of the
+    "auto.conf" file.  Its default value is "include/config/auto.conf".
 
-KCONFIG_AUTOCONFIG
-------------------
-This environment variable can be set to specify the path & name of the
-"auto.conf" file.  Its default value is "include/config/auto.conf".
+``KCONFIG_AUTOHEADER``
+    This environment variable can be set to specify the path & name of the
+    "autoconf.h" (header) file.
+    Its default value is "include/generated/autoconf.h".
 
-KCONFIG_AUTOHEADER
-------------------
-This environment variable can be set to specify the path & name of the
-"autoconf.h" (header) file.
-Its default value is "include/generated/autoconf.h".
-
-
-----------------------------------------------------------------------
 
 menuconfig
-----------
-
-SEARCHING for CONFIG symbols
+==========
 
 Searching in menuconfig:
 
-	The Search function searches for kernel configuration symbol
-	names, so you have to know something close to what you are
-	looking for.
+    The Search function searches for kernel configuration symbol
+    names, so you have to know something close to what you are
+    looking for.
 
-	Example::
+    Example::
 
-		/hotplug
-		This lists all config symbols that contain "hotplug",
-		e.g., HOTPLUG_CPU, MEMORY_HOTPLUG.
+        /hotplug
+        This lists all config symbols that contain "hotplug",
+        e.g., HOTPLUG_CPU, MEMORY_HOTPLUG.
 
-	For search help, enter / followed by TAB-TAB (to highlight
-	<Help>) and Enter.  This will tell you that you can also use
-	regular expressions (regexes) in the search string, so if you
-	are not interested in MEMORY_HOTPLUG, you could try::
+    For search help, enter / followed by TAB-TAB (to highlight
+    <Help>) and Enter.  This will tell you that you can also use
+    regular expressions (regexes) in the search string, so if you
+    are not interested in MEMORY_HOTPLUG, you could try::
 
-		/^hotplug
+        /^hotplug
 
-	When searching, symbols are sorted thus:
+    When searching, symbols are sorted thus:
 
-	  - first, exact matches, sorted alphabetically (an exact match
-	    is when the search matches the complete symbol name);
-	  - then, other matches, sorted alphabetically.
+    - first, exact matches, sorted alphabetically (an exact match
+      is when the search matches the complete symbol name);
+    - then, other matches, sorted alphabetically.
 
-	For example: ^ATH.K matches:
+    For example, ^ATH.K matches:
 
-	    ATH5K ATH9K ATH5K_AHB ATH5K_DEBUG [...] ATH6KL ATH6KL_DEBUG
-	    [...] ATH9K_AHB ATH9K_BTCOEX_SUPPORT ATH9K_COMMON [...]
+        ATH5K ATH9K ATH5K_AHB ATH5K_DEBUG [...] ATH6KL ATH6KL_DEBUG
+        [...] ATH9K_AHB ATH9K_BTCOEX_SUPPORT ATH9K_COMMON [...]
 
-	of which only ATH5K and ATH9K match exactly and so are sorted
-	first (and in alphabetical order), then come all other symbols,
-	sorted in alphabetical order.
+    of which only ATH5K and ATH9K match exactly and so are sorted
+    first (and in alphabetical order), then come all other symbols,
+    sorted in alphabetical order.
 
-	In this menu, pressing the key in the (#) prefix will jump
-	directly to that location. You will be returned to the current
-	search results after exiting this new menu.
+    In this menu, pressing the key in the (#) prefix will jump
+    directly to that location. You will be returned to the current
+    search results after exiting this new menu.
 
-----------------------------------------------------------------------
+User interface options for 'menuconfig':
 
-User interface options for 'menuconfig'
+``MENUCONFIG_COLOR``
+    It is possible to select different color themes using the variable
+    MENUCONFIG_COLOR.  To select a theme use::
 
-MENUCONFIG_COLOR
-----------------
-It is possible to select different color themes using the variable
-MENUCONFIG_COLOR.  To select a theme use::
+        make MENUCONFIG_COLOR=<theme> menuconfig
 
-	make MENUCONFIG_COLOR=<theme> menuconfig
+    Available themes are::
 
-Available themes are::
+      - mono       => selects colors suitable for monochrome displays
+      - blackbg    => selects a color scheme with black background
+      - classic    => theme with blue background. The classic look
+      - bluetitle  => a LCD friendly version of classic. (default)
 
-  - mono       => selects colors suitable for monochrome displays
-  - blackbg    => selects a color scheme with black background
-  - classic    => theme with blue background. The classic look
-  - bluetitle  => a LCD friendly version of classic. (default)
+``MENUCONFIG_MODE``
+    This mode shows all sub-menus in one large tree.
 
-MENUCONFIG_MODE
----------------
-This mode shows all sub-menus in one large tree.
+    Example::
 
-Example::
+        make MENUCONFIG_MODE=single_menu menuconfig
 
-	make MENUCONFIG_MODE=single_menu menuconfig
-
-----------------------------------------------------------------------
 
 nconfig
--------
+=======
 
 nconfig is an alternate text-based configurator.  It lists function
 keys across the bottom of the terminal (window) that execute commands.
@@ -266,61 +237,59 @@ Use F1 for Global help or F3 for the Short help menu.
 
 Searching in nconfig:
 
-	You can search either in the menu entry "prompt" strings
-	or in the configuration symbols.
+    You can search either in the menu entry "prompt" strings
+    or in the configuration symbols.
+
+    Use / to begin a search through the menu entries.  This does
+    not support regular expressions.  Use <Down> or <Up> for
+    Next hit and Previous hit, respectively.  Use <Esc> to
+    terminate the search mode.
 
-	Use / to begin a search through the menu entries.  This does
-	not support regular expressions.  Use <Down> or <Up> for
-	Next hit and Previous hit, respectively.  Use <Esc> to
-	terminate the search mode.
+    F8 (SymSearch) searches the configuration symbols for the
+    given string or regular expression (regex).
 
-	F8 (SymSearch) searches the configuration symbols for the
-	given string or regular expression (regex).
+    In the SymSearch, pressing the key in the (#) prefix will
+    jump directly to that location. You will be returned to the
+    current search results after exiting this new menu.
 
-	In the SymSearch, pressing the key in the (#) prefix will
-	jump directly to that location. You will be returned to the
-	current search results after exiting this new menu.
+Environment variables:
 
-NCONFIG_MODE
-------------
-This mode shows all sub-menus in one large tree.
+``NCONFIG_MODE``
+    This mode shows all sub-menus in one large tree.
 
-Example::
+    Example::
 
-	make NCONFIG_MODE=single_menu nconfig
+        make NCONFIG_MODE=single_menu nconfig
 
-----------------------------------------------------------------------
 
 xconfig
--------
+=======
 
 Searching in xconfig:
 
-	The Search function searches for kernel configuration symbol
-	names, so you have to know something close to what you are
-	looking for.
-
-	Example::
+    The Search function searches for kernel configuration symbol
+    names, so you have to know something close to what you are
+    looking for.
 
-		Ctrl-F hotplug
+    Example::
 
-	or::
+        Ctrl-F hotplug
 
-		Menu: File, Search, hotplug
+    or::
 
-	lists all config symbol entries that contain "hotplug" in
-	the symbol name.  In this Search dialog, you may change the
-	config setting for any of the entries that are not grayed out.
-	You can also enter a different search string without having
-	to return to the main menu.
+        Menu: File, Search, hotplug
 
+    lists all config symbol entries that contain "hotplug" in
+    the symbol name.  In this Search dialog, you may change the
+    config setting for any of the entries that are not grayed out.
+    You can also enter a different search string without having
+    to return to the main menu.
 
-----------------------------------------------------------------------
 
 gconfig
--------
+=======
 
 Searching in gconfig:
 
-	There is no search command in gconfig.  However, gconfig does
-	have several different viewing choices, modes, and options.
+    There is no search command in gconfig.  However, gconfig does
+    have several different viewing choices, modes, and options.

From f98b755f7bee20cb09814c106c1d40386ed4172e Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Sat, 13 Jan 2024 19:43:36 +0900
Subject: [PATCH 113/707] kbuild: deb-pkg: show verbose log for direct package
 builds

When the Debian package build is initiated by Kbuild ('make deb-pkg'
or 'make bindeb-pkg'), the log messages are displayed in the short
form, which is the Kbuild default.

Otherwise, let's show verbose messages (unless the 'terse' tag is set
in DEB_BUILD_OPTION), as suggested by Debian Policy: "The package build
should be as verbose as reasonably possible, except where the terse tag
is included in DEB_BUILD_OPTIONS." [1]

This is what the Debian kernel also does. [2]

[1]: https://www.debian.org/doc/debian-policy/ch-source.html#main-building-script-debian-rules
[2]: https://salsa.debian.org/kernel-team/linux/-/blob/debian/6.7-1_exp1/debian/rules.real#L36

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Reviewed-by: Nicolas Schier <n.schier@avm.de>
---
 scripts/package/debian/rules | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/scripts/package/debian/rules b/scripts/package/debian/rules
index 09830778006227..697fbfa7595f79 100755
--- a/scripts/package/debian/rules
+++ b/scripts/package/debian/rules
@@ -11,6 +11,14 @@ ifneq (,$(filter-out parallel=1,$(filter parallel=%,$(DEB_BUILD_OPTIONS))))
     MAKEFLAGS += -j$(NUMJOBS)
 endif
 
+# When KBUILD_VERBOSE is undefined (presumably you are directly working with
+# the debianized tree), show verbose logs unless DEB_BUILD_OPTION=terse is set.
+ifeq ($(origin KBUILD_VERBOSE),undefined)
+    ifeq (,$(filter terse,$(DEB_BUILD_OPTIONS)))
+        export KBUILD_VERBOSE := 1
+    endif
+endif
+
 revision = $(lastword $(subst -, ,$(shell dpkg-parsechangelog -S Version)))
 CROSS_COMPILE ?= $(filter-out $(DEB_BUILD_GNU_TYPE)-, $(DEB_HOST_GNU_TYPE)-)
 make-opts = ARCH=$(ARCH) KERNELRELEASE=$(KERNELRELEASE) KBUILD_BUILD_VERSION=$(revision) $(addprefix CROSS_COMPILE=,$(CROSS_COMPILE))

From 057b438ff43bc31546ef297855fb7c64ad0e81e9 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Sat, 13 Jan 2024 19:43:37 +0900
Subject: [PATCH 114/707] kbuild: deb-pkg: make debian/rules quiet for 'make
 deb-pkg'

Add $(Q) to the commands in debian/rules to make them quiet when the
package built is initiated by 'make deb-pkg' or when the 'terse' tag
is set to DEB_BUILD_OPTIONS.

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Reviewed-by: Nicolas Schier <n.schier@avm.de>
---
 scripts/package/debian/rules | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/scripts/package/debian/rules b/scripts/package/debian/rules
index 697fbfa7595f79..a183e95886e61d 100755
--- a/scripts/package/debian/rules
+++ b/scripts/package/debian/rules
@@ -16,6 +16,8 @@ endif
 ifeq ($(origin KBUILD_VERBOSE),undefined)
     ifeq (,$(filter terse,$(DEB_BUILD_OPTIONS)))
         export KBUILD_VERBOSE := 1
+    else
+        Q := @
     endif
 endif
 
@@ -27,20 +29,20 @@ make-opts = ARCH=$(ARCH) KERNELRELEASE=$(KERNELRELEASE) KBUILD_BUILD_VERSION=$(r
 binary: binary-arch binary-indep
 binary-indep: build-indep
 binary-arch: build-arch
-	$(MAKE) $(make-opts) \
+	$(Q)$(MAKE) $(make-opts) \
 	run-command KBUILD_RUN_COMMAND='+$$(srctree)/scripts/package/builddeb'
 
 .PHONY: build build-indep build-arch
 build: build-arch build-indep
 build-indep:
 build-arch:
-	$(MAKE) $(make-opts) olddefconfig
-	$(MAKE) $(make-opts) $(if $(filter um,$(ARCH)),,headers) all
+	$(Q)$(MAKE) $(make-opts) olddefconfig
+	$(Q)$(MAKE) $(make-opts) $(if $(filter um,$(ARCH)),,headers) all
 
 .PHONY: clean
 clean:
-	rm -rf debian/files debian/linux-* debian/deb-env.vars*
-	$(MAKE) ARCH=$(ARCH) clean
+	$(Q)rm -rf debian/files debian/linux-* debian/deb-env.vars*
+	$(Q)$(MAKE) ARCH=$(ARCH) clean
 
 # If DEB_HOST_ARCH is empty, it is likely that debian/rules was executed
 # directly. Run 'dpkg-architecture --print-set --print-format=make' to
@@ -49,6 +51,6 @@ ifndef DEB_HOST_ARCH
 include debian/deb-env.vars
 
 debian/deb-env.vars:
-	dpkg-architecture -a$$(cat debian/arch) --print-set --print-format=make > $@.tmp
-	mv $@.tmp $@
+	$(Q)dpkg-architecture -a$$(cat debian/arch) --print-set --print-format=make > $@.tmp
+	$(Q)mv $@.tmp $@
 endif

From 2c5251ac1ff06696881ff021e6d1b8a3e63d90e5 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Sat, 13 Jan 2024 19:43:38 +0900
Subject: [PATCH 115/707] kbuild: deb-pkg: build binary-arch in parallel

'make deb-pkg' builds build-arch in parallel, but binary-arch serially.

Given that all binary packages are independent of one another, they can
be built in parallel.

I am uncertain whether debian/files is robust against a race condition.
Just in case, make dh_gencontrol (dpkg-gencontrol) output to separate
debian/*.files, which are then concatenated into debian/files.

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Reviewed-by: Nicolas Schier <n.schier@avm.de>
---
 scripts/package/builddeb     | 42 +++++++++++-------------------------
 scripts/package/debian/rules | 39 +++++++++++++++++++++++++++++----
 2 files changed, 47 insertions(+), 34 deletions(-)

diff --git a/scripts/package/builddeb b/scripts/package/builddeb
index bf96a3c2460814..d31b16afe0db8c 100755
--- a/scripts/package/builddeb
+++ b/scripts/package/builddeb
@@ -24,18 +24,6 @@ if_enabled_echo() {
 	fi
 }
 
-create_package() {
-	export DH_OPTIONS="-p${1}"
-
-	dh_installdocs
-	dh_installchangelogs
-	dh_compress
-	dh_fixperms
-	dh_gencontrol
-	dh_md5sums
-	dh_builddeb -- ${KDEB_COMPRESS:+-Z$KDEB_COMPRESS}
-}
-
 install_linux_image () {
 	pname=$1
 	pdir=debian/$1
@@ -161,21 +149,15 @@ install_libc_headers () {
 	mv "$pdir/usr/include/asm" "$pdir/usr/include/${DEB_HOST_MULTIARCH}"
 }
 
-rm -f debian/files
-
-packages_enabled=$(dh_listpackages)
-
-for package in ${packages_enabled}
-do
-	case ${package} in
-	*-dbg)
-		install_linux_image_dbg "${package}";;
-	linux-image-*|user-mode-linux-*)
-		install_linux_image "${package}";;
-	linux-libc-dev)
-		install_libc_headers "${package}";;
-	linux-headers-*)
-		install_kernel_headers "${package}";;
-	esac
-	create_package "${package}"
-done
+package=$1
+
+case "${package}" in
+*-dbg)
+	install_linux_image_dbg "${package}";;
+linux-image-*|user-mode-linux-*)
+	install_linux_image "${package}";;
+linux-libc-dev)
+	install_libc_headers "${package}";;
+linux-headers-*)
+	install_kernel_headers "${package}";;
+esac
diff --git a/scripts/package/debian/rules b/scripts/package/debian/rules
index a183e95886e61d..57f1cf7c6b32f1 100755
--- a/scripts/package/debian/rules
+++ b/scripts/package/debian/rules
@@ -25,12 +25,43 @@ revision = $(lastword $(subst -, ,$(shell dpkg-parsechangelog -S Version)))
 CROSS_COMPILE ?= $(filter-out $(DEB_BUILD_GNU_TYPE)-, $(DEB_HOST_GNU_TYPE)-)
 make-opts = ARCH=$(ARCH) KERNELRELEASE=$(KERNELRELEASE) KBUILD_BUILD_VERSION=$(revision) $(addprefix CROSS_COMPILE=,$(CROSS_COMPILE))
 
+binary-targets := $(addprefix binary-, image image-dbg headers libc-dev)
+
+all-packages = $(shell dh_listpackages)
+image-package = $(filter linux-image-% user-%, $(filter-out %-dbg, $(all-packages)))
+image-dbg-package = $(filter %-dbg, $(all-packages))
+libc-dev-package = $(filter linux-libc-dev, $(all-packages))
+headers-package = $(filter linux-headers-%, $(all-packages))
+
+mk-files = $(patsubst binary-%,debian/%.files,$1)
+package = $($(@:binary-%=%-package))
+
+# DH_OPTION is an environment variable common for all debhelper commands.
+# We could 'export' it, but here it is passed from the command line to clarify
+# which package is being processed in the build log.
+DH_OPTIONS = -p$(package)
+
+define binary
+	$(Q)+$(MAKE) $(make-opts) run-command KBUILD_RUN_COMMAND='+$$(srctree)/scripts/package/builddeb $(package)'
+	$(Q)dh_installdocs $(DH_OPTIONS)
+	$(Q)dh_installchangelogs $(DH_OPTIONS)
+	$(Q)dh_compress $(DH_OPTIONS)
+	$(Q)dh_fixperms $(DH_OPTIONS)
+	$(Q)dh_gencontrol $(DH_OPTIONS) -- -f$(call mk-files,$@)
+	$(Q)dh_md5sums $(DH_OPTIONS)
+	$(Q)dh_builddeb $(DH_OPTIONS) -- $(addprefix -Z,$(KDEB_COMPRESS))
+endef
+
+.PHONY: $(binary-targets)
+$(binary-targets): build-arch
+	$(Q)truncate -s0 $(call mk-files,$@)
+	$(if $(package),$(binary))
+
 .PHONY: binary binary-indep binary-arch
 binary: binary-arch binary-indep
 binary-indep: build-indep
-binary-arch: build-arch
-	$(Q)$(MAKE) $(make-opts) \
-	run-command KBUILD_RUN_COMMAND='+$$(srctree)/scripts/package/builddeb'
+binary-arch: $(binary-targets)
+	$(Q)cat $(call mk-files,$^) > debian/files
 
 .PHONY: build build-indep build-arch
 build: build-arch build-indep
@@ -41,7 +72,7 @@ build-arch:
 
 .PHONY: clean
 clean:
-	$(Q)rm -rf debian/files debian/linux-* debian/deb-env.vars*
+	$(Q)rm -rf debian/files debian/linux-* debian/deb-env.vars* debian/*.files
 	$(Q)$(MAKE) ARCH=$(ARCH) clean
 
 # If DEB_HOST_ARCH is empty, it is likely that debian/rules was executed

From bd768db42ef6d27c3eca1d29ec8beb4474e4ba35 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Sat, 13 Jan 2024 19:43:39 +0900
Subject: [PATCH 116/707] kbuild: deb-pkg: call more misc debhelper commands

Use dh_prep instead of removing old build directories manually.

Use dh_clean instead of removing build directories and debian/files
manually.

Call dh_testdir and dh_testroot for preliminary checks.

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Reviewed-by: Nicolas Schier <n.schier@avm.de>
---
 scripts/package/builddeb     | 8 --------
 scripts/package/debian/rules | 6 +++++-
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/scripts/package/builddeb b/scripts/package/builddeb
index d31b16afe0db8c..e797ad360f7a5b 100755
--- a/scripts/package/builddeb
+++ b/scripts/package/builddeb
@@ -28,8 +28,6 @@ install_linux_image () {
 	pname=$1
 	pdir=debian/$1
 
-	rm -rf ${pdir}
-
 	# Only some architectures with OF support have this target
 	if is_enabled CONFIG_OF_EARLY_FLATTREE && [ -d "${srctree}/arch/${SRCARCH}/boot/dts" ]; then
 		${MAKE} -f ${srctree}/Makefile INSTALL_DTBS_PATH="${pdir}/usr/lib/linux-image-${KERNELRELEASE}" dtbs_install
@@ -97,8 +95,6 @@ install_linux_image () {
 install_linux_image_dbg () {
 	pdir=debian/$1
 
-	rm -rf ${pdir}
-
 	# Parse modules.order directly because 'make modules_install' may sign,
 	# compress modules, and then run unneeded depmod.
 	while read -r mod; do
@@ -128,8 +124,6 @@ install_kernel_headers () {
 	pdir=debian/$1
 	version=${1#linux-headers-}
 
-	rm -rf $pdir
-
 	"${srctree}/scripts/package/install-extmod-build" "${pdir}/usr/src/linux-headers-${version}"
 
 	mkdir -p $pdir/lib/modules/$version/
@@ -139,8 +133,6 @@ install_kernel_headers () {
 install_libc_headers () {
 	pdir=debian/$1
 
-	rm -rf $pdir
-
 	$MAKE -f $srctree/Makefile headers_install INSTALL_HDR_PATH=$pdir/usr
 
 	# move asm headers to /usr/include/<libc-machine>/asm to match the structure
diff --git a/scripts/package/debian/rules b/scripts/package/debian/rules
index 57f1cf7c6b32f1..ca07243bd5cdf6 100755
--- a/scripts/package/debian/rules
+++ b/scripts/package/debian/rules
@@ -42,6 +42,9 @@ package = $($(@:binary-%=%-package))
 DH_OPTIONS = -p$(package)
 
 define binary
+	$(Q)dh_testdir $(DH_OPTIONS)
+	$(Q)dh_testroot $(DH_OPTIONS)
+	$(Q)dh_prep $(DH_OPTIONS)
 	$(Q)+$(MAKE) $(make-opts) run-command KBUILD_RUN_COMMAND='+$$(srctree)/scripts/package/builddeb $(package)'
 	$(Q)dh_installdocs $(DH_OPTIONS)
 	$(Q)dh_installchangelogs $(DH_OPTIONS)
@@ -72,7 +75,8 @@ build-arch:
 
 .PHONY: clean
 clean:
-	$(Q)rm -rf debian/files debian/linux-* debian/deb-env.vars* debian/*.files
+	$(Q)dh_clean
+	$(Q)rm -rf debian/deb-env.vars* debian/*.files
 	$(Q)$(MAKE) ARCH=$(ARCH) clean
 
 # If DEB_HOST_ARCH is empty, it is likely that debian/rules was executed

From df38347f1934b29d4b3075a53a6404b0d58dcb2f Mon Sep 17 00:00:00 2001
From: "Borislav Petkov (AMD)" <bp@alien8.de>
Date: Fri, 5 Jan 2024 11:14:07 +0100
Subject: [PATCH 117/707] x86/sev: Harden #VC instruction emulation somewhat

Compare the opcode bytes at rIP for each #VC exit reason to verify the
instruction which raised the #VC exception is actually the right one.

Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Acked-by: Tom Lendacky <thomas.lendacky@amd.com>
Link: https://lore.kernel.org/r/20240105101407.11694-1-bp@alien8.de
---
 arch/x86/boot/compressed/sev.c |   4 ++
 arch/x86/kernel/sev-shared.c   | 102 ++++++++++++++++++++++++++++++++-
 arch/x86/kernel/sev.c          |   5 +-
 3 files changed, 108 insertions(+), 3 deletions(-)

diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index 454acd7a2dafff..073291832f44d2 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -304,6 +304,10 @@ void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code)
 	if (result != ES_OK)
 		goto finish;
 
+	result = vc_check_opcode_bytes(&ctxt, exit_code);
+	if (result != ES_OK)
+		goto finish;
+
 	switch (exit_code) {
 	case SVM_EXIT_RDTSC:
 	case SVM_EXIT_RDTSCP:
diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c
index 1d24ec6799157b..5db24d0fc557cd 100644
--- a/arch/x86/kernel/sev-shared.c
+++ b/arch/x86/kernel/sev-shared.c
@@ -10,11 +10,15 @@
  */
 
 #ifndef __BOOT_COMPRESSED
-#define error(v)	pr_err(v)
-#define has_cpuflag(f)	boot_cpu_has(f)
+#define error(v)			pr_err(v)
+#define has_cpuflag(f)			boot_cpu_has(f)
+#define sev_printk(fmt, ...)		printk(fmt, ##__VA_ARGS__)
+#define sev_printk_rtl(fmt, ...)	printk_ratelimited(fmt, ##__VA_ARGS__)
 #else
 #undef WARN
 #define WARN(condition, format...) (!!(condition))
+#define sev_printk(fmt, ...)
+#define sev_printk_rtl(fmt, ...)
 #endif
 
 /* I/O parameters for CPUID-related helpers */
@@ -574,6 +578,7 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
 {
 	unsigned int subfn = lower_bits(regs->cx, 32);
 	unsigned int fn = lower_bits(regs->ax, 32);
+	u16 opcode = *(unsigned short *)regs->ip;
 	struct cpuid_leaf leaf;
 	int ret;
 
@@ -581,6 +586,10 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
 	if (exit_code != SVM_EXIT_CPUID)
 		goto fail;
 
+	/* Is it really a CPUID insn? */
+	if (opcode != 0xa20f)
+		goto fail;
+
 	leaf.fn = fn;
 	leaf.subfn = subfn;
 
@@ -1170,3 +1179,92 @@ static int vmgexit_psc(struct ghcb *ghcb, struct snp_psc_desc *desc)
 out:
 	return ret;
 }
+
+static enum es_result vc_check_opcode_bytes(struct es_em_ctxt *ctxt,
+					    unsigned long exit_code)
+{
+	unsigned int opcode = (unsigned int)ctxt->insn.opcode.value;
+	u8 modrm = ctxt->insn.modrm.value;
+
+	switch (exit_code) {
+
+	case SVM_EXIT_IOIO:
+	case SVM_EXIT_NPF:
+		/* handled separately */
+		return ES_OK;
+
+	case SVM_EXIT_CPUID:
+		if (opcode == 0xa20f)
+			return ES_OK;
+		break;
+
+	case SVM_EXIT_INVD:
+		if (opcode == 0x080f)
+			return ES_OK;
+		break;
+
+	case SVM_EXIT_MONITOR:
+		if (opcode == 0x010f && modrm == 0xc8)
+			return ES_OK;
+		break;
+
+	case SVM_EXIT_MWAIT:
+		if (opcode == 0x010f && modrm == 0xc9)
+			return ES_OK;
+		break;
+
+	case SVM_EXIT_MSR:
+		/* RDMSR */
+		if (opcode == 0x320f ||
+		/* WRMSR */
+		    opcode == 0x300f)
+			return ES_OK;
+		break;
+
+	case SVM_EXIT_RDPMC:
+		if (opcode == 0x330f)
+			return ES_OK;
+		break;
+
+	case SVM_EXIT_RDTSC:
+		if (opcode == 0x310f)
+			return ES_OK;
+		break;
+
+	case SVM_EXIT_RDTSCP:
+		if (opcode == 0x010f && modrm == 0xf9)
+			return ES_OK;
+		break;
+
+	case SVM_EXIT_READ_DR7:
+		if (opcode == 0x210f &&
+		    X86_MODRM_REG(ctxt->insn.modrm.value) == 7)
+			return ES_OK;
+		break;
+
+	case SVM_EXIT_VMMCALL:
+		if (opcode == 0x010f && modrm == 0xd9)
+			return ES_OK;
+
+		break;
+
+	case SVM_EXIT_WRITE_DR7:
+		if (opcode == 0x230f &&
+		    X86_MODRM_REG(ctxt->insn.modrm.value) == 7)
+			return ES_OK;
+		break;
+
+	case SVM_EXIT_WBINVD:
+		if (opcode == 0x90f)
+			return ES_OK;
+		break;
+
+	default:
+		break;
+	}
+
+	sev_printk(KERN_ERR "Wrong/unhandled opcode bytes: 0x%x, exit_code: 0x%lx, rIP: 0x%lx\n",
+		   opcode, exit_code, ctxt->regs->ip);
+
+	return ES_UNSUPPORTED;
+}
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index c67285824e8267..1ec753331524ab 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -1752,7 +1752,10 @@ static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt,
 					 struct ghcb *ghcb,
 					 unsigned long exit_code)
 {
-	enum es_result result;
+	enum es_result result = vc_check_opcode_bytes(ctxt, exit_code);
+
+	if (result != ES_OK)
+		return result;
 
 	switch (exit_code) {
 	case SVM_EXIT_READ_DR7:

From 8b4e2d8976b6c93b3786ced79f1c55d5d7b38737 Mon Sep 17 00:00:00 2001
From: Stephan Gerhold <stephan@gerhold.net>
Date: Wed, 3 Jan 2024 22:10:05 +0100
Subject: [PATCH 118/707] pmdomain: core: Scale down parent/child performance
 states in reverse order

Power domains might have parent domains assigned that are automatically
managed by the PM domain core. In particular, parent domains are
automatically powered on/off and setting performance states on child
domains are propagated to parent domains (e.g. using an OPP table from the
device tree).

Currently the parent performance state is always adjusted before the
performance state of the child domain, which is a problem for some cases
when scaling down the performance state. More exactly, it may lead to that
the parent domain could run in a lower performance state, than what is
required by the child domain.

To fix the behaviour, let's differentiate between scaling up/down and
adjust the order of operations:

 - When scaling up, parent domains should be adjusted before the child
   domain. In case of an error, the rollback happens in reverse order.

 - When scaling down, parent domains should be adjusted after the child
   domain, in reverse order, just as if we would rollback scaling up.
   In case of an error, the rollback happens in normal order (just as
   if we would normally scale up).

Signed-off-by: Stephan Gerhold <stephan@gerhold.net>
Link: https://lore.kernel.org/r/20240103-genpd-perf-order-v2-1-eeecfc55624b@gerhold.net
Tested-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/pmdomain/core.c | 124 +++++++++++++++++++++++++---------------
 1 file changed, 77 insertions(+), 47 deletions(-)

diff --git a/drivers/pmdomain/core.c b/drivers/pmdomain/core.c
index a1f6cba3ae6c86..fec9dc6ab82863 100644
--- a/drivers/pmdomain/core.c
+++ b/drivers/pmdomain/core.c
@@ -311,72 +311,102 @@ static int genpd_xlate_performance_state(struct generic_pm_domain *genpd,
 }
 
 static int _genpd_set_performance_state(struct generic_pm_domain *genpd,
-					unsigned int state, int depth)
+					unsigned int state, int depth);
+
+static void _genpd_rollback_parent_state(struct gpd_link *link, int depth)
 {
-	struct generic_pm_domain *parent;
-	struct gpd_link *link;
-	int parent_state, ret;
+	struct generic_pm_domain *parent = link->parent;
+	int parent_state;
 
-	if (state == genpd->performance_state)
-		return 0;
+	genpd_lock_nested(parent, depth + 1);
 
-	/* Propagate to parents of genpd */
-	list_for_each_entry(link, &genpd->child_links, child_node) {
-		parent = link->parent;
+	parent_state = link->prev_performance_state;
+	link->performance_state = parent_state;
 
-		/* Find parent's performance state */
-		ret = genpd_xlate_performance_state(genpd, parent, state);
-		if (unlikely(ret < 0))
-			goto err;
+	parent_state = _genpd_reeval_performance_state(parent, parent_state);
+	if (_genpd_set_performance_state(parent, parent_state, depth + 1)) {
+		pr_err("%s: Failed to roll back to %d performance state\n",
+		       parent->name, parent_state);
+	}
 
-		parent_state = ret;
+	genpd_unlock(parent);
+}
 
-		genpd_lock_nested(parent, depth + 1);
+static int _genpd_set_parent_state(struct generic_pm_domain *genpd,
+				   struct gpd_link *link,
+				   unsigned int state, int depth)
+{
+	struct generic_pm_domain *parent = link->parent;
+	int parent_state, ret;
 
-		link->prev_performance_state = link->performance_state;
-		link->performance_state = parent_state;
-		parent_state = _genpd_reeval_performance_state(parent,
-						parent_state);
-		ret = _genpd_set_performance_state(parent, parent_state, depth + 1);
-		if (ret)
-			link->performance_state = link->prev_performance_state;
+	/* Find parent's performance state */
+	ret = genpd_xlate_performance_state(genpd, parent, state);
+	if (unlikely(ret < 0))
+		return ret;
 
-		genpd_unlock(parent);
+	parent_state = ret;
 
-		if (ret)
-			goto err;
-	}
+	genpd_lock_nested(parent, depth + 1);
 
-	if (genpd->set_performance_state) {
-		ret = genpd->set_performance_state(genpd, state);
-		if (ret)
-			goto err;
-	}
+	link->prev_performance_state = link->performance_state;
+	link->performance_state = parent_state;
 
-	genpd->performance_state = state;
-	return 0;
+	parent_state = _genpd_reeval_performance_state(parent, parent_state);
+	ret = _genpd_set_performance_state(parent, parent_state, depth + 1);
+	if (ret)
+		link->performance_state = link->prev_performance_state;
 
-err:
-	/* Encountered an error, lets rollback */
-	list_for_each_entry_continue_reverse(link, &genpd->child_links,
-					     child_node) {
-		parent = link->parent;
+	genpd_unlock(parent);
 
-		genpd_lock_nested(parent, depth + 1);
+	return ret;
+}
+
+static int _genpd_set_performance_state(struct generic_pm_domain *genpd,
+					unsigned int state, int depth)
+{
+	struct gpd_link *link = NULL;
+	int ret;
+
+	if (state == genpd->performance_state)
+		return 0;
 
-		parent_state = link->prev_performance_state;
-		link->performance_state = parent_state;
+	/* When scaling up, propagate to parents first in normal order */
+	if (state > genpd->performance_state) {
+		list_for_each_entry(link, &genpd->child_links, child_node) {
+			ret = _genpd_set_parent_state(genpd, link, state, depth);
+			if (ret)
+				goto rollback_parents_up;
+		}
+	}
 
-		parent_state = _genpd_reeval_performance_state(parent,
-						parent_state);
-		if (_genpd_set_performance_state(parent, parent_state, depth + 1)) {
-			pr_err("%s: Failed to roll back to %d performance state\n",
-			       parent->name, parent_state);
+	if (genpd->set_performance_state) {
+		ret = genpd->set_performance_state(genpd, state);
+		if (ret) {
+			if (link)
+				goto rollback_parents_up;
+			return ret;
 		}
+	}
 
-		genpd_unlock(parent);
+	/* When scaling down, propagate to parents last in reverse order */
+	if (state < genpd->performance_state) {
+		list_for_each_entry_reverse(link, &genpd->child_links, child_node) {
+			ret = _genpd_set_parent_state(genpd, link, state, depth);
+			if (ret)
+				goto rollback_parents_down;
+		}
 	}
 
+	genpd->performance_state = state;
+	return 0;
+
+rollback_parents_up:
+	list_for_each_entry_continue_reverse(link, &genpd->child_links, child_node)
+		_genpd_rollback_parent_state(link, depth);
+	return ret;
+rollback_parents_down:
+	list_for_each_entry_continue(link, &genpd->child_links, child_node)
+		_genpd_rollback_parent_state(link, depth);
 	return ret;
 }
 

From f55fcdb06f529c5031ada7edd5ede63dfdcc4a54 Mon Sep 17 00:00:00 2001
From: Jay <merqqcury@gmail.com>
Date: Tue, 9 Jan 2024 15:29:27 +0800
Subject: [PATCH 119/707] fs: fix a typo in attr.c

The word "filesytem" should be "filesystem"

Signed-off-by: Jay <merqqcury@gmail.com>
Link: https://lore.kernel.org/r/20240109072927.29626-1-merqqcury@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/attr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/attr.c b/fs/attr.c
index 5a13f0c8495fde..49d23b5dbab4b9 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -352,7 +352,7 @@ int may_setattr(struct mnt_idmap *idmap, struct inode *inode,
 EXPORT_SYMBOL(may_setattr);
 
 /**
- * notify_change - modify attributes of a filesytem object
+ * notify_change - modify attributes of a filesystem object
  * @idmap:	idmap of the mount the inode was found from
  * @dentry:	object affected
  * @attr:	new attributes

From a121e297aac51c3ddb3f1f9aea58d4289aea8bc8 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Mon, 8 Jan 2024 18:20:40 +0100
Subject: [PATCH 120/707] fs: Wrong function name in comment

This comment refers to function mark_buffer_inode_dirty(), but the
function is actually called mark_buffer_dirty_inode(), so fix the
comment.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Link: https://lore.kernel.org/r/20240108172040.178173-1-agruenba@redhat.com
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index d3bcf601d3e5a5..dcafee512089a2 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -464,7 +464,7 @@ EXPORT_SYMBOL(mark_buffer_async_write);
  * a successful fsync().  For example, ext2 indirect blocks need to be
  * written back and waited upon before fsync() returns.
  *
- * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
+ * The functions mark_buffer_dirty_inode(), fsync_inode_buffers(),
  * inode_has_buffers() and invalidate_inode_buffers() are provided for the
  * management of a list of dependent buffers at ->i_mapping->i_private_list.
  *

From 30c45816e23523cd4527a8d20bf20c7d6bfd5a16 Mon Sep 17 00:00:00 2001
From: David Disseldorp <ddiss@suse.de>
Date: Thu, 11 Jan 2024 17:22:40 +1100
Subject: [PATCH 121/707] initramfs: remove duplicate built-in
 __initramfs_start unpacking

If initrd_start cpio extraction fails, CONFIG_BLK_DEV_RAM triggers
fallback to initrd.image handling via populate_initrd_image().
The populate_initrd_image() call follows successful extraction of any
built-in cpio archive at __initramfs_start, but currently performs
built-in archive extraction a second time.

Prior to commit b2a74d5f9d446 ("initramfs: remove clean_rootfs"),
the second built-in initramfs unpack call was used to repopulate entries
removed by clean_rootfs(), but it's no longer necessary now the contents
of the previous extraction are retained.

Signed-off-by: David Disseldorp <ddiss@suse.de>
Link: https://lore.kernel.org/r/20240111062240.9362-1-ddiss@suse.de
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 init/initramfs.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/init/initramfs.c b/init/initramfs.c
index 76deb48c38cb16..d3c623dde01a88 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -679,8 +679,6 @@ static void __init populate_initrd_image(char *err)
 	struct file *file;
 	loff_t pos = 0;
 
-	unpack_to_rootfs(__initramfs_start, __initramfs_size);
-
 	printk(KERN_INFO "rootfs image is not initramfs (%s); looks like an initrd\n",
 			err);
 	file = filp_open("/initrd.image", O_WRONLY | O_CREAT, 0700);

From ce2128e96b51c78732010fc79763d7a7141259e2 Mon Sep 17 00:00:00 2001
From: Wen Yang <wenyang.linux@foxmail.com>
Date: Wed, 10 Jan 2024 23:47:40 +0800
Subject: [PATCH 122/707] eventfd: add a BUILD_BUG_ON() to ensure consistency
 between EFD_SEMAPHORE and the uapi

introduce a BUILD_BUG_ON to check that the EFD_SEMAPHORE is equal to its
definition in the uapi file, just like EFD_CLOEXEC and EFD_NONBLOCK.

Signed-off-by: Wen Yang <wenyang.linux@foxmail.com>
Link: https://lore.kernel.org/r/tencent_0BAA2DEAF9208D49987457E6583F9BE79507@qq.com
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Cc: <linux-fsdevel@vger.kernel.org>
Cc: <linux-kernel@vger.kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/eventfd.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/eventfd.c b/fs/eventfd.c
index ad8186d47ba760..0252b71099fbca 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -383,6 +383,7 @@ static int do_eventfd(unsigned int count, int flags)
 	/* Check the EFD_* constants for consistency.  */
 	BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
 	BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
+	BUILD_BUG_ON(EFD_SEMAPHORE != (1 << 0));
 
 	if (flags & ~EFD_FLAGS_SET)
 		return -EINVAL;

From 4148bf4c5e6dc0932e3d4649047b203e9d554d3c Mon Sep 17 00:00:00 2001
From: Kunwu Chan <chentao@kylinos.cn>
Date: Tue, 16 Jan 2024 17:11:37 +0800
Subject: [PATCH 123/707] buffer: Use KMEM_CACHE instead of kmem_cache_create()

Use the new KMEM_CACHE() macro instead of direct kmem_cache_create
to simplify the creation of SLAB caches.

Signed-off-by: Kunwu Chan <chentao@kylinos.cn>
Link: https://lore.kernel.org/r/20240116091137.92375-1-chentao@kylinos.cn
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/buffer.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index dcafee512089a2..b55dea034a5d83 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3121,12 +3121,8 @@ void __init buffer_init(void)
 	unsigned long nrpages;
 	int ret;
 
-	bh_cachep = kmem_cache_create("buffer_head",
-			sizeof(struct buffer_head), 0,
-				(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
-				SLAB_MEM_SPREAD),
-				NULL);
-
+	bh_cachep = KMEM_CACHE(buffer_head,
+				SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
 	/*
 	 * Limit the bh occupancy to 10% of ZONE_NORMAL
 	 */

From c2f1af4e033e17f9332e47e8fc3266afd95ad548 Mon Sep 17 00:00:00 2001
From: Baolin Wang <baolin.wang@linux.alibaba.com>
Date: Tue, 16 Jan 2024 15:53:35 +0800
Subject: [PATCH 124/707] fs: improve dump_mapping() robustness

We met a kernel crash issue when running stress-ng testing, and the
system crashes when printing the dentry name in dump_mapping().

Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000
pc : dentry_name+0xd8/0x224
lr : pointer+0x22c/0x370
sp : ffff800025f134c0
......
Call trace:
  dentry_name+0xd8/0x224
  pointer+0x22c/0x370
  vsnprintf+0x1ec/0x730
  vscnprintf+0x2c/0x60
  vprintk_store+0x70/0x234
  vprintk_emit+0xe0/0x24c
  vprintk_default+0x3c/0x44
  vprintk_func+0x84/0x2d0
  printk+0x64/0x88
  __dump_page+0x52c/0x530
  dump_page+0x14/0x20
  set_migratetype_isolate+0x110/0x224
  start_isolate_page_range+0xc4/0x20c
  offline_pages+0x124/0x474
  memory_block_offline+0x44/0xf4
  memory_subsys_offline+0x3c/0x70
  device_offline+0xf0/0x120
  ......

The root cause is that, one thread is doing page migration, and we will
use the target page's ->mapping field to save 'anon_vma' pointer between
page unmap and page move, and now the target page is locked and refcount
is 1.

Currently, there is another stress-ng thread performing memory hotplug,
attempting to offline the target page that is being migrated. It discovers
that the refcount of this target page is 1, preventing the offline operation,
thus proceeding to dump the page. However, page_mapping() of the target
page may return an incorrect file mapping to crash the system in dump_mapping(),
since the target page->mapping only saves 'anon_vma' pointer without setting
PAGE_MAPPING_ANON flag.

The page migration issue has been fixed by commit d1adb25df711 ("mm: migrate:
fix getting incorrect page mapping during page migration"). In addition,
Matthew suggested we should also improve dump_mapping()'s robustness to
resilient against the kernel crash [1].

With checking the 'dentry.parent' and 'dentry.d_name.name' used by
dentry_name(), I can see dump_mapping() will output the invalid dentry
instead of crashing the system when this issue is reproduced again.

[12211.189128] page:fffff7de047741c0 refcount:1 mapcount:0 mapping:ffff989117f55ea0 index:0x1 pfn:0x211dd07
[12211.189144] aops:0x0 ino:1 invalid dentry:74786574206e6870
[12211.189148] flags: 0x57ffffc0000001(locked|node=1|zone=2|lastcpupid=0x1fffff)
[12211.189150] page_type: 0xffffffff()
[12211.189153] raw: 0057ffffc0000001 0000000000000000 dead000000000122 ffff989117f55ea0
[12211.189154] raw: 0000000000000001 0000000000000001 00000001ffffffff 0000000000000000
[12211.189155] page dumped because: unmovable page

[1] https://lore.kernel.org/all/ZXxn%2F0oixJxxAnpF@casper.infradead.org/

Suggested-by: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Link: https://lore.kernel.org/r/937ab1f87328516821d39be672b6bc18861d9d3e.1705391420.git.baolin.wang@linux.alibaba.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/inode.c b/fs/inode.c
index 91048c4c9c9e7d..6d0d5423036380 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -588,7 +588,8 @@ void dump_mapping(const struct address_space *mapping)
 	}
 
 	dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias);
-	if (get_kernel_nofault(dentry, dentry_ptr)) {
+	if (get_kernel_nofault(dentry, dentry_ptr) ||
+	    !dentry.d_parent || !dentry.d_name.name) {
 		pr_warn("aops:%ps ino:%lx invalid dentry:%px\n",
 				a_ops, ino, dentry_ptr);
 		return;

From 352f0ba021364519f95aa378dcca18cf7600fba1 Mon Sep 17 00:00:00 2001
From: Hu Yadi <hu.yadi@h3c.com>
Date: Fri, 12 Jan 2024 15:40:59 +0800
Subject: [PATCH 125/707] selftests/filesystems:fix build error in overlayfs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

One build issue comes up due to both mount.h included dev_in_maps.c

In file included from dev_in_maps.c:10:
/usr/include/sys/mount.h:35:3: error: expected identifier before numeric constant
   35 |   MS_RDONLY = 1,  /* Mount read-only.  */
      |   ^~~~~~~~~
In file included from dev_in_maps.c:13:

Remove one of them to solve conflict, another error comes up:

dev_in_maps.c:170:6: error: implicit declaration of function ‘mount’ [-Werror=implicit-function-declaration]
  170 |  if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) == -1) {
      |      ^~~~~
cc1: all warnings being treated as errors

and then , add sys_mount definition to solve it
After both above, dev_in_maps.c can be built correctly on my mache(gcc 10.2,glibc-2.32,kernel-5.10)

Signed-off-by: Hu Yadi <hu.yadi@h3c.com>
Link: https://lore.kernel.org/r/20240112074059.29673-1-hu.yadi@h3c.com
Acked-by: Andrei Vagin <avagin@google.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 .../selftests/filesystems/overlayfs/dev_in_maps.c      | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c b/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c
index e19ab0e8570913..759f86e7d263e4 100644
--- a/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c
+++ b/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c
@@ -10,7 +10,6 @@
 #include <linux/mount.h>
 #include <sys/syscall.h>
 #include <sys/stat.h>
-#include <sys/mount.h>
 #include <sys/mman.h>
 #include <sched.h>
 #include <fcntl.h>
@@ -32,7 +31,11 @@ static int sys_fsmount(int fd, unsigned int flags, unsigned int attr_flags)
 {
 	return syscall(__NR_fsmount, fd, flags, attr_flags);
 }
-
+static int sys_mount(const char *src, const char *tgt, const char *fst,
+		unsigned long flags, const void *data)
+{
+	return syscall(__NR_mount, src, tgt, fst, flags, data);
+}
 static int sys_move_mount(int from_dfd, const char *from_pathname,
 			  int to_dfd, const char *to_pathname,
 			  unsigned int flags)
@@ -166,8 +169,7 @@ int main(int argc, char **argv)
 		ksft_test_result_skip("unable to create a new mount namespace\n");
 		return 1;
 	}
-
-	if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) == -1) {
+	if (sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) == -1) {
 		pr_perror("mount");
 		return 1;
 	}

From b872e2a5ac3456dd1a3229657e1ea46aff06d2f1 Mon Sep 17 00:00:00 2001
From: "Hu.Yadi" <hu.yadi@h3c.com>
Date: Thu, 11 Jan 2024 19:32:29 +0800
Subject: [PATCH 126/707] selftests/move_mount_set_group:Make tests build with
 old libc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace SYS_<syscall> with __NR_<syscall>.  Using the __NR_<syscall>
notation, provided by UAPI, is useful to build tests on systems without
the SYS_<syscall> definitions.

Replace SYS_move_mount with __NR_move_mount

Similar changes: commit 87129ef13603 ("selftests/landlock: Make tests build with old libc")

Acked-by: Mickaël Salaün <mic@digikod.net>
Signed-off-by: Hu.Yadi <hu.yadi@h3c.com>
Link: https://lore.kernel.org/r/20240111113229.10820-1-hu.yadi@h3c.com
Reviewed-by: Berlin <berlin@h3c.com>
Suggested-by: Jiao <jiaoxupo@h3c.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 .../move_mount_set_group/move_mount_set_group_test.c          | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/move_mount_set_group/move_mount_set_group_test.c b/tools/testing/selftests/move_mount_set_group/move_mount_set_group_test.c
index 50ed5d475dd131..bcf51d785a3712 100644
--- a/tools/testing/selftests/move_mount_set_group/move_mount_set_group_test.c
+++ b/tools/testing/selftests/move_mount_set_group/move_mount_set_group_test.c
@@ -218,7 +218,7 @@ static bool move_mount_set_group_supported(void)
 	if (mount(NULL, SET_GROUP_FROM, NULL, MS_SHARED, 0))
 		return -1;
 
-	ret = syscall(SYS_move_mount, AT_FDCWD, SET_GROUP_FROM,
+	ret = syscall(__NR_move_mount, AT_FDCWD, SET_GROUP_FROM,
 		      AT_FDCWD, SET_GROUP_TO, MOVE_MOUNT_SET_GROUP);
 	umount2("/tmp", MNT_DETACH);
 
@@ -363,7 +363,7 @@ TEST_F(move_mount_set_group, complex_sharing_copying)
 		       CLONE_VM | CLONE_FILES); ASSERT_GT(pid, 0);
 	ASSERT_EQ(wait_for_pid(pid), 0);
 
-	ASSERT_EQ(syscall(SYS_move_mount, ca_from.mntfd, "",
+	ASSERT_EQ(syscall(__NR_move_mount, ca_from.mntfd, "",
 			  ca_to.mntfd, "", MOVE_MOUNT_SET_GROUP
 			  | MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH),
 		  0);

From 74ad68a64b60029c9e5dc78f63ae3fffc9a5569b Mon Sep 17 00:00:00 2001
From: Rich Felker <dalias@libc.org>
Date: Mon, 31 Aug 2020 11:32:08 -0400
Subject: [PATCH 127/707] vfs: add RWF_NOAPPEND flag for pwritev2

The pwrite function, originally defined by POSIX (thus the "p"), is
defined to ignore O_APPEND and write at the offset passed as its
argument. However, historically Linux honored O_APPEND if set and
ignored the offset. This cannot be changed due to stability policy,
but is documented in the man page as a bug.

Now that there's a pwritev2 syscall providing a superset of the pwrite
functionality that has a flags argument, the conforming behavior can
be offered to userspace via a new flag. Since pwritev2 checks flag
validity (in kiocb_set_rw_flags) and reports unknown ones with
EOPNOTSUPP, callers will not get wrong behavior on old kernels that
don't support the new flag; the error is reported and the caller can
decide how to handle it.

Signed-off-by: Rich Felker <dalias@libc.org>
Link: https://lore.kernel.org/r/20200831153207.GO3265@brightrain.aerifal.cx
Reviewed-by: Jann Horn <jannh@google.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h      | 8 ++++++++
 include/uapi/linux/fs.h | 5 ++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index ed5966a7049512..4f7cfda29143e2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3335,6 +3335,8 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
 		return 0;
 	if (unlikely(flags & ~RWF_SUPPORTED))
 		return -EOPNOTSUPP;
+	if (unlikely((flags & RWF_APPEND) && (flags & RWF_NOAPPEND)))
+		return -EINVAL;
 
 	if (flags & RWF_NOWAIT) {
 		if (!(ki->ki_filp->f_mode & FMODE_NOWAIT))
@@ -3345,6 +3347,12 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
 	if (flags & RWF_SYNC)
 		kiocb_flags |= IOCB_DSYNC;
 
+	if ((flags & RWF_NOAPPEND) && (ki->ki_flags & IOCB_APPEND)) {
+		if (IS_APPEND(file_inode(ki->ki_filp)))
+			return -EPERM;
+		ki->ki_flags &= ~IOCB_APPEND;
+	}
+
 	ki->ki_flags |= kiocb_flags;
 	return 0;
 }
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 48ad69f7722e1a..2203d3194b91a7 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -301,9 +301,12 @@ typedef int __bitwise __kernel_rwf_t;
 /* per-IO O_APPEND */
 #define RWF_APPEND	((__force __kernel_rwf_t)0x00000010)
 
+/* per-IO negation of O_APPEND */
+#define RWF_NOAPPEND	((__force __kernel_rwf_t)0x00000020)
+
 /* mask of flags supported by the kernel */
 #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
-			 RWF_APPEND)
+			 RWF_APPEND | RWF_NOAPPEND)
 
 /* Pagemap ioctl */
 #define PAGEMAP_SCAN	_IOWR('f', 16, struct pm_scan_arg)

From 172827cc44e92f3416392a43e4d219cbcf4d012c Mon Sep 17 00:00:00 2001
From: Kemeng Shi <shikemeng@huaweicloud.com>
Date: Fri, 19 Jan 2024 04:33:39 +0800
Subject: [PATCH 128/707] writeback: move wb_wakeup_delayed defination to
 fs-writeback.c

The wb_wakeup_delayed is only used in fs-writeback.c. Move it to
fs-writeback.c after defination of wb_wakeup and make it static.

Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com>
Link: https://lore.kernel.org/r/20240118203339.764093-1-shikemeng@huaweicloud.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/fs-writeback.c           | 25 +++++++++++++++++++++++++
 include/linux/backing-dev.h |  1 -
 mm/backing-dev.c            | 25 -------------------------
 3 files changed, 25 insertions(+), 26 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3d84fcc471c600..e4f17c53ddfcf3 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -141,6 +141,31 @@ static void wb_wakeup(struct bdi_writeback *wb)
 	spin_unlock_irq(&wb->work_lock);
 }
 
+/*
+ * This function is used when the first inode for this wb is marked dirty. It
+ * wakes-up the corresponding bdi thread which should then take care of the
+ * periodic background write-out of dirty inodes. Since the write-out would
+ * starts only 'dirty_writeback_interval' centisecs from now anyway, we just
+ * set up a timer which wakes the bdi thread up later.
+ *
+ * Note, we wouldn't bother setting up the timer, but this function is on the
+ * fast-path (used by '__mark_inode_dirty()'), so we save few context switches
+ * by delaying the wake-up.
+ *
+ * We have to be careful not to postpone flush work if it is scheduled for
+ * earlier. Thus we use queue_delayed_work().
+ */
+static void wb_wakeup_delayed(struct bdi_writeback *wb)
+{
+	unsigned long timeout;
+
+	timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
+	spin_lock_irq(&wb->work_lock);
+	if (test_bit(WB_registered, &wb->state))
+		queue_delayed_work(bdi_wq, &wb->dwork, timeout);
+	spin_unlock_irq(&wb->work_lock);
+}
+
 static void finish_writeback_work(struct bdi_writeback *wb,
 				  struct wb_writeback_work *work)
 {
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 1a97277f99b1b8..8e7af9a03b41dd 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -38,7 +38,6 @@ struct backing_dev_info *bdi_alloc(int node_id);
 
 void wb_start_background_writeback(struct bdi_writeback *wb);
 void wb_workfn(struct work_struct *work);
-void wb_wakeup_delayed(struct bdi_writeback *wb);
 
 void wb_wait_for_completion(struct wb_completion *done);
 
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 1e3447bccdb14d..039dc74b505a85 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -372,31 +372,6 @@ static int __init default_bdi_init(void)
 }
 subsys_initcall(default_bdi_init);
 
-/*
- * This function is used when the first inode for this wb is marked dirty. It
- * wakes-up the corresponding bdi thread which should then take care of the
- * periodic background write-out of dirty inodes. Since the write-out would
- * starts only 'dirty_writeback_interval' centisecs from now anyway, we just
- * set up a timer which wakes the bdi thread up later.
- *
- * Note, we wouldn't bother setting up the timer, but this function is on the
- * fast-path (used by '__mark_inode_dirty()'), so we save few context switches
- * by delaying the wake-up.
- *
- * We have to be careful not to postpone flush work if it is scheduled for
- * earlier. Thus we use queue_delayed_work().
- */
-void wb_wakeup_delayed(struct bdi_writeback *wb)
-{
-	unsigned long timeout;
-
-	timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
-	spin_lock_irq(&wb->work_lock);
-	if (test_bit(WB_registered, &wb->state))
-		queue_delayed_work(bdi_wq, &wb->dwork, timeout);
-	spin_unlock_irq(&wb->work_lock);
-}
-
 static void wb_update_bandwidth_workfn(struct work_struct *work)
 {
 	struct bdi_writeback *wb = container_of(to_delayed_work(work),

From 19e062e48b33f9f52dbb6f87def79b709f407260 Mon Sep 17 00:00:00 2001
From: Nikita Zhandarovich <n.zhandarovich@fintech.ru>
Date: Fri, 19 Jan 2024 07:39:06 -0800
Subject: [PATCH 129/707] do_sys_name_to_handle(): use kzalloc() to fix
 kernel-infoleak

syzbot identified a kernel information leak vulnerability in
do_sys_name_to_handle() and issued the following report [1].

[1]
"BUG: KMSAN: kernel-infoleak in instrument_copy_to_user include/linux/instrumented.h:114 [inline]
BUG: KMSAN: kernel-infoleak in _copy_to_user+0xbc/0x100 lib/usercopy.c:40
 instrument_copy_to_user include/linux/instrumented.h:114 [inline]
 _copy_to_user+0xbc/0x100 lib/usercopy.c:40
 copy_to_user include/linux/uaccess.h:191 [inline]
 do_sys_name_to_handle fs/fhandle.c:73 [inline]
 __do_sys_name_to_handle_at fs/fhandle.c:112 [inline]
 __se_sys_name_to_handle_at+0x949/0xb10 fs/fhandle.c:94
 __x64_sys_name_to_handle_at+0xe4/0x140 fs/fhandle.c:94
 ...

Uninit was created at:
 slab_post_alloc_hook+0x129/0xa70 mm/slab.h:768
 slab_alloc_node mm/slub.c:3478 [inline]
 __kmem_cache_alloc_node+0x5c9/0x970 mm/slub.c:3517
 __do_kmalloc_node mm/slab_common.c:1006 [inline]
 __kmalloc+0x121/0x3c0 mm/slab_common.c:1020
 kmalloc include/linux/slab.h:604 [inline]
 do_sys_name_to_handle fs/fhandle.c:39 [inline]
 __do_sys_name_to_handle_at fs/fhandle.c:112 [inline]
 __se_sys_name_to_handle_at+0x441/0xb10 fs/fhandle.c:94
 __x64_sys_name_to_handle_at+0xe4/0x140 fs/fhandle.c:94
 ...

Bytes 18-19 of 20 are uninitialized
Memory access of size 20 starts at ffff888128a46380
Data copied to user address 0000000020000240"

Per Chuck Lever's suggestion, use kzalloc() instead of kmalloc() to
solve the problem.

Fixes: 990d6c2d7aee ("vfs: Add name to file handle conversion support")
Suggested-by: Chuck Lever III <chuck.lever@oracle.com>
Reported-and-tested-by: <syzbot+09b349b3066c2e0b1e96@syzkaller.appspotmail.com>
Signed-off-by: Nikita Zhandarovich <n.zhandarovich@fintech.ru>
Link: https://lore.kernel.org/r/20240119153906.4367-1-n.zhandarovich@fintech.ru
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/fhandle.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/fhandle.c b/fs/fhandle.c
index 18b3ba8dc8ead7..57a12614addfd4 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -36,7 +36,7 @@ static long do_sys_name_to_handle(const struct path *path,
 	if (f_handle.handle_bytes > MAX_HANDLE_SZ)
 		return -EINVAL;
 
-	handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
+	handle = kzalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
 			 GFP_KERNEL);
 	if (!handle)
 		return -ENOMEM;

From aafef05b6399a2c4ce1c457d7fadf19443cdda2c Mon Sep 17 00:00:00 2001
From: Konrad Dybcio <konrad.dybcio@linaro.org>
Date: Wed, 27 Dec 2023 02:20:32 +0100
Subject: [PATCH 130/707] pmdomain: qcom: rpmpd: Keep one RPM handle for all
 RPMPDs

For no apparent reason (as there's just one RPM per SoC), all RPMPDs
currently store a copy of a pointer to smd_rpm. Introduce a single,
global one to save up on space in each definition.

bloat-o-meter reports:

Total: Before=92010, After=91062, chg -1.03%

Signed-off-by: Konrad Dybcio <konrad.dybcio@linaro.org>
Link: https://lore.kernel.org/r/20231227-topic-rpmpd_cleanup-v1-1-860ab141b076@linaro.org
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/pmdomain/qcom/rpmpd.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/drivers/pmdomain/qcom/rpmpd.c b/drivers/pmdomain/qcom/rpmpd.c
index 7796d65f96e8cf..90b62767f9d028 100644
--- a/drivers/pmdomain/qcom/rpmpd.c
+++ b/drivers/pmdomain/qcom/rpmpd.c
@@ -16,6 +16,8 @@
 
 #define domain_to_rpmpd(domain) container_of(domain, struct rpmpd, pd)
 
+static struct qcom_smd_rpm *rpmpd_smd_rpm;
+
 /* Resource types:
  * RPMPD_X is X encoded as a little-endian, lower-case, ASCII string */
 #define RPMPD_SMPA 0x61706d73
@@ -54,7 +56,6 @@ struct rpmpd {
 	bool enabled;
 	const int res_type;
 	const int res_id;
-	struct qcom_smd_rpm *rpm;
 	unsigned int max_state;
 	__le32 key;
 	bool state_synced;
@@ -879,7 +880,7 @@ static int rpmpd_send_enable(struct rpmpd *pd, bool enable)
 		.value = cpu_to_le32(enable),
 	};
 
-	return qcom_rpm_smd_write(pd->rpm, QCOM_SMD_RPM_ACTIVE_STATE,
+	return qcom_rpm_smd_write(rpmpd_smd_rpm, QCOM_SMD_RPM_ACTIVE_STATE,
 				  pd->res_type, pd->res_id, &req, sizeof(req));
 }
 
@@ -891,7 +892,7 @@ static int rpmpd_send_corner(struct rpmpd *pd, int state, unsigned int corner)
 		.value = cpu_to_le32(corner),
 	};
 
-	return qcom_rpm_smd_write(pd->rpm, state, pd->res_type, pd->res_id,
+	return qcom_rpm_smd_write(rpmpd_smd_rpm, state, pd->res_type, pd->res_id,
 				  &req, sizeof(req));
 };
 
@@ -1004,12 +1005,11 @@ static int rpmpd_probe(struct platform_device *pdev)
 	int i;
 	size_t num;
 	struct genpd_onecell_data *data;
-	struct qcom_smd_rpm *rpm;
 	struct rpmpd **rpmpds;
 	const struct rpmpd_desc *desc;
 
-	rpm = dev_get_drvdata(pdev->dev.parent);
-	if (!rpm) {
+	rpmpd_smd_rpm = dev_get_drvdata(pdev->dev.parent);
+	if (!rpmpd_smd_rpm) {
 		dev_err(&pdev->dev, "Unable to retrieve handle to RPM\n");
 		return -ENODEV;
 	}
@@ -1039,7 +1039,6 @@ static int rpmpd_probe(struct platform_device *pdev)
 			continue;
 		}
 
-		rpmpds[i]->rpm = rpm;
 		rpmpds[i]->max_state = desc->max_state;
 		rpmpds[i]->pd.power_off = rpmpd_power_off;
 		rpmpds[i]->pd.power_on = rpmpd_power_on;

From 2afe7095791a48696940c8921c7b67c442882317 Mon Sep 17 00:00:00 2001
From: Konrad Dybcio <konrad.dybcio@linaro.org>
Date: Wed, 27 Dec 2023 16:18:54 +0100
Subject: [PATCH 131/707] pmdomain: core: Print a message when unused power
 domains are disabled

In a similar spirit to commit 12ca59b91d04 ("clk: Print an info line
before disabling unused clocks"), print the message in both ignore AND
cleanup cases to better inform the user (and more importantly, the
developer) when it happens.

Signed-off-by: Konrad Dybcio <konrad.dybcio@linaro.org>
Link: https://lore.kernel.org/r/20231227-topic-pmdomain_spam-v1-1-ff0410086b36@linaro.org
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/pmdomain/core.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/pmdomain/core.c b/drivers/pmdomain/core.c
index fec9dc6ab82863..30b9be61bab18d 100644
--- a/drivers/pmdomain/core.c
+++ b/drivers/pmdomain/core.c
@@ -1130,6 +1130,7 @@ static int __init genpd_power_off_unused(void)
 		return 0;
 	}
 
+	pr_info("genpd: Disabling unused power domains\n");
 	mutex_lock(&gpd_list_lock);
 
 	list_for_each_entry(genpd, &gpd_list, gpd_list_node)

From d901fa88145178db7bfedfbdc13924b564c94d50 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Mon, 22 Jan 2024 14:44:41 +0100
Subject: [PATCH 132/707] pmdomain: renesas: rcar-gen4-sysc: Remove unneeded
 includes

The R-Car V3U System Controller (SYSC) driver no longer needs these
includes since the factoring out of the common R-Car Gen4 SYSC driver in
commit e62906d6315f652b ("soc: renesas: rcar-gen4-sysc: Introduce R-Car
Gen4 SYSC driver").

The R-Car S4-8 and V4H SYSC drivers never needed these includes, as
these drivers always used the common R-Car Gen4 SYSC driver.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Link: https://lore.kernel.org/r/5b440f84ab8b52499ab307c84154dcbc0f41d1d7.1705931035.git.geert+renesas@glider.be
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/pmdomain/renesas/r8a779a0-sysc.c | 12 ------------
 drivers/pmdomain/renesas/r8a779f0-sysc.c | 12 ------------
 drivers/pmdomain/renesas/r8a779g0-sysc.c | 12 ------------
 3 files changed, 36 deletions(-)

diff --git a/drivers/pmdomain/renesas/r8a779a0-sysc.c b/drivers/pmdomain/renesas/r8a779a0-sysc.c
index 04f1bc322ae7b6..54cdf250f7c2d1 100644
--- a/drivers/pmdomain/renesas/r8a779a0-sysc.c
+++ b/drivers/pmdomain/renesas/r8a779a0-sysc.c
@@ -5,19 +5,7 @@
  * Copyright (C) 2020 Renesas Electronics Corp.
  */
 
-#include <linux/bits.h>
-#include <linux/clk/renesas.h>
-#include <linux/delay.h>
-#include <linux/err.h>
-#include <linux/io.h>
-#include <linux/iopoll.h>
 #include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/of_address.h>
-#include <linux/pm_domain.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/types.h>
 
 #include <dt-bindings/power/r8a779a0-sysc.h>
 
diff --git a/drivers/pmdomain/renesas/r8a779f0-sysc.c b/drivers/pmdomain/renesas/r8a779f0-sysc.c
index 5602aa6bd7ed15..6ed13cd1cb249d 100644
--- a/drivers/pmdomain/renesas/r8a779f0-sysc.c
+++ b/drivers/pmdomain/renesas/r8a779f0-sysc.c
@@ -5,19 +5,7 @@
  * Copyright (C) 2021 Renesas Electronics Corp.
  */
 
-#include <linux/bits.h>
-#include <linux/clk/renesas.h>
-#include <linux/delay.h>
-#include <linux/err.h>
-#include <linux/io.h>
-#include <linux/iopoll.h>
 #include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/of_address.h>
-#include <linux/pm_domain.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/types.h>
 
 #include <dt-bindings/power/r8a779f0-sysc.h>
 
diff --git a/drivers/pmdomain/renesas/r8a779g0-sysc.c b/drivers/pmdomain/renesas/r8a779g0-sysc.c
index b932eba1b8042d..249cf43af45b64 100644
--- a/drivers/pmdomain/renesas/r8a779g0-sysc.c
+++ b/drivers/pmdomain/renesas/r8a779g0-sysc.c
@@ -5,19 +5,7 @@
  * Copyright (C) 2022 Renesas Electronics Corp.
  */
 
-#include <linux/bits.h>
-#include <linux/clk/renesas.h>
-#include <linux/delay.h>
-#include <linux/err.h>
-#include <linux/io.h>
-#include <linux/iopoll.h>
 #include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/of_address.h>
-#include <linux/pm_domain.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/types.h>
 
 #include <dt-bindings/power/r8a779g0-sysc.h>
 

From 05170c8598f1edccc5948fe797a40125d55c4f65 Mon Sep 17 00:00:00 2001
From: Kunwu Chan <chentao@kylinos.cn>
Date: Thu, 18 Jan 2024 13:42:57 +0800
Subject: [PATCH 133/707] pmdomain: ti: Add a null pointer check to the
 omap_prm_domain_init

devm_kasprintf() returns a pointer to dynamically allocated memory
which can be NULL upon failure. Ensure the allocation was successful
by checking the pointer validity.

Signed-off-by: Kunwu Chan <chentao@kylinos.cn>
Link: https://lore.kernel.org/r/20240118054257.200814-1-chentao@kylinos.cn
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/pmdomain/ti/omap_prm.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/pmdomain/ti/omap_prm.c b/drivers/pmdomain/ti/omap_prm.c
index c2feae3a634caf..b8ceb3c2b81c25 100644
--- a/drivers/pmdomain/ti/omap_prm.c
+++ b/drivers/pmdomain/ti/omap_prm.c
@@ -695,6 +695,8 @@ static int omap_prm_domain_init(struct device *dev, struct omap_prm *prm)
 	data = prm->data;
 	name = devm_kasprintf(dev, GFP_KERNEL, "prm_%s",
 			      data->name);
+	if (!name)
+		return -ENOMEM;
 
 	prmd->dev = dev;
 	prmd->prm = prm;

From 768c04ccb9013c0eab51dd31f63474ef8a7456db Mon Sep 17 00:00:00 2001
From: Marek Vasut <marex@denx.de>
Date: Fri, 19 Jan 2024 02:47:41 +0100
Subject: [PATCH 134/707] pmdomain: imx8mp-blk-ctrl: Error out if domains are
 missing in DT

This driver assumes that domain->power_dev is non-NULL in its
suspend/resume path. The assumption is valid, since all the devices that
are being looked up here should be described in DT. In case they are not
described in DT, because the DT is faulty, suspend/resume attempt would
trigger NULL pointer dereference.

To avoid this failure, check whether the power_dev assignment is not NULL
right away in probe callback and fail early if it is.

Signed-off-by: Marek Vasut <marex@denx.de>
Reviewed-by: Peng Fan <peng.fan@nxp.com>
Link: https://lore.kernel.org/r/20240119014807.268694-1-marex@denx.de
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/pmdomain/imx/imx8m-blk-ctrl.c  | 9 ++++++---
 drivers/pmdomain/imx/imx8mp-blk-ctrl.c | 9 ++++++---
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/pmdomain/imx/imx8m-blk-ctrl.c b/drivers/pmdomain/imx/imx8m-blk-ctrl.c
index 1341a707f61bcb..ca942d7929c2ba 100644
--- a/drivers/pmdomain/imx/imx8m-blk-ctrl.c
+++ b/drivers/pmdomain/imx/imx8m-blk-ctrl.c
@@ -258,11 +258,14 @@ static int imx8m_blk_ctrl_probe(struct platform_device *pdev)
 
 		domain->power_dev =
 			dev_pm_domain_attach_by_name(dev, data->gpc_name);
-		if (IS_ERR(domain->power_dev)) {
-			dev_err_probe(dev, PTR_ERR(domain->power_dev),
+		if (IS_ERR_OR_NULL(domain->power_dev)) {
+			if (!domain->power_dev)
+				ret = -ENODEV;
+			else
+				ret = PTR_ERR(domain->power_dev);
+			dev_err_probe(dev, ret,
 				      "failed to attach power domain \"%s\"\n",
 				      data->gpc_name);
-			ret = PTR_ERR(domain->power_dev);
 			goto cleanup_pds;
 		}
 
diff --git a/drivers/pmdomain/imx/imx8mp-blk-ctrl.c b/drivers/pmdomain/imx/imx8mp-blk-ctrl.c
index e3203eb6a02293..e488cf79b80070 100644
--- a/drivers/pmdomain/imx/imx8mp-blk-ctrl.c
+++ b/drivers/pmdomain/imx/imx8mp-blk-ctrl.c
@@ -687,11 +687,14 @@ static int imx8mp_blk_ctrl_probe(struct platform_device *pdev)
 
 		domain->power_dev =
 			dev_pm_domain_attach_by_name(dev, data->gpc_name);
-		if (IS_ERR(domain->power_dev)) {
-			dev_err_probe(dev, PTR_ERR(domain->power_dev),
+		if (IS_ERR_OR_NULL(domain->power_dev)) {
+			if (!domain->power_dev)
+				ret = -ENODEV;
+			else
+				ret = PTR_ERR(domain->power_dev);
+			dev_err_probe(dev, ret,
 				      "failed to attach power domain %s\n",
 				      data->gpc_name);
-			ret = PTR_ERR(domain->power_dev);
 			goto cleanup_pds;
 		}
 

From 460fad3643ba3a2b4332f3f21c149ad35749d87c Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Mon, 15 Jan 2024 13:18:20 +0100
Subject: [PATCH 135/707] gfs2: Fix gfs2_drevalidate NULL pointer dereference

Commit dd00aaeb3432 added an RCU-safe way of computing d_inode(parent)
to gfs2_drevalidate() to support the LOOKUP_RCU flag, but then failed to
convert one of the instances of d_inode(parent) to its RCU-safe
replacement.  This manifested as a NULL pointer dereference.  Fix that.

Reported-by: Al Viro <viro@zeniv.linux.org.uk>
Fixes: dd00aaeb3432 ("gfs2: Use GL_NOBLOCK flag for non-blocking lookups")
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Reviewed-by: Andrew Price <anprice@redhat.com>
---
 fs/gfs2/dentry.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 177f1f41f22545..c6483fb9862450 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -72,7 +72,7 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags)
 			goto out;
 	}
 
-	error = gfs2_dir_check(d_inode(parent), &dentry->d_name, ip);
+	error = gfs2_dir_check(dinode, &dentry->d_name, ip);
 	valid = inode ? !error : (error == -ENOENT);
 
 	if (!had_lock)

From 04b945e4cf81a12365f8207a4d34dbc81ba17413 Mon Sep 17 00:00:00 2001
From: Jeff Johnson <quic_jjohnson@quicinc.com>
Date: Thu, 21 Dec 2023 07:16:04 -0800
Subject: [PATCH 136/707] slimbus: qcom-ngd-ctrl: Make QMI message rules const

Commit ff6d365898d4 ("soc: qcom: qmi: use const for struct
qmi_elem_info") allows QMI message encoding/decoding rules
to be const, so do that for qcom-ngd-ctrl.c.

Signed-off-by: Jeff Johnson <quic_jjohnson@quicinc.com>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
---
 drivers/slimbus/qcom-ngd-ctrl.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/slimbus/qcom-ngd-ctrl.c b/drivers/slimbus/qcom-ngd-ctrl.c
index 77aa6d26476cd2..efeba8275a6691 100644
--- a/drivers/slimbus/qcom-ngd-ctrl.c
+++ b/drivers/slimbus/qcom-ngd-ctrl.c
@@ -220,7 +220,7 @@ struct slimbus_power_resp_msg_v01 {
 	struct qmi_response_type_v01 resp;
 };
 
-static struct qmi_elem_info slimbus_select_inst_req_msg_v01_ei[] = {
+static const struct qmi_elem_info slimbus_select_inst_req_msg_v01_ei[] = {
 	{
 		.data_type  = QMI_UNSIGNED_4_BYTE,
 		.elem_len   = 1,
@@ -262,7 +262,7 @@ static struct qmi_elem_info slimbus_select_inst_req_msg_v01_ei[] = {
 	},
 };
 
-static struct qmi_elem_info slimbus_select_inst_resp_msg_v01_ei[] = {
+static const struct qmi_elem_info slimbus_select_inst_resp_msg_v01_ei[] = {
 	{
 		.data_type  = QMI_STRUCT,
 		.elem_len   = 1,
@@ -284,7 +284,7 @@ static struct qmi_elem_info slimbus_select_inst_resp_msg_v01_ei[] = {
 	},
 };
 
-static struct qmi_elem_info slimbus_power_req_msg_v01_ei[] = {
+static const struct qmi_elem_info slimbus_power_req_msg_v01_ei[] = {
 	{
 		.data_type  = QMI_UNSIGNED_4_BYTE,
 		.elem_len   = 1,
@@ -324,7 +324,7 @@ static struct qmi_elem_info slimbus_power_req_msg_v01_ei[] = {
 	},
 };
 
-static struct qmi_elem_info slimbus_power_resp_msg_v01_ei[] = {
+static const struct qmi_elem_info slimbus_power_resp_msg_v01_ei[] = {
 	{
 		.data_type  = QMI_STRUCT,
 		.elem_len   = 1,

From 53ddef135d3a80064b74964a08b0e0f3aed7c952 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Danis?= <frederic.danis@collabora.com>
Date: Mon, 22 Jan 2024 17:59:55 +0100
Subject: [PATCH 137/707] Bluetooth: mgmt: Fix limited discoverable off timeout
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

LIMITED_DISCOVERABLE flag is not reset from Class of Device and
advertisement on limited discoverable timeout. This prevents to pass PTS
test GAP/DISC/LIMM/BV-02-C

Calling set_discoverable_sync as when the limited discovery is set
correctly update the Class of Device and advertisement.

Signed-off-by: Frédéric Danis <frederic.danis@collabora.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/mgmt.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 173986f3405f7a..8c4493255f92ab 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -1045,6 +1045,8 @@ static void rpa_expired(struct work_struct *work)
 	hci_cmd_sync_queue(hdev, rpa_expired_sync, NULL, NULL);
 }
 
+static int set_discoverable_sync(struct hci_dev *hdev, void *data);
+
 static void discov_off(struct work_struct *work)
 {
 	struct hci_dev *hdev = container_of(work, struct hci_dev,
@@ -1063,7 +1065,7 @@ static void discov_off(struct work_struct *work)
 	hci_dev_clear_flag(hdev, HCI_DISCOVERABLE);
 	hdev->discov_timeout = 0;
 
-	hci_update_discoverable(hdev);
+	hci_cmd_sync_queue(hdev, set_discoverable_sync, NULL, NULL);
 
 	mgmt_new_settings(hdev);
 

From 819faf7a243ef2771ba650dfc1b8a40f5a3d94ab Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Mon, 15 Jan 2024 22:10:44 +0100
Subject: [PATCH 138/707] gfs2: Pass FGP flags to gfs2_getbuf

Replace gfs2_getbuf()'s create argument with a fgp_flags argument.  Use
the FGP_CREAT flag instead of the CREATE flag to indicate that new
buffers should be created.

In addition, when the FGP_NOWAIT flag is set and gfs2_getbuf() would
sleep, -EAGAIN is returned instead.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Reviewed-by: Andrew Price <anprice@redhat.com>
---
 fs/gfs2/bmap.c    |  2 +-
 fs/gfs2/dir.c     |  2 +-
 fs/gfs2/meta_io.c | 36 +++++++++++++++++++++++-------------
 fs/gfs2/meta_io.h |  2 +-
 4 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index d9ccfd27e4f11f..92945e5b764348 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -301,7 +301,7 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end)
 		if (!*t)
 			continue;
 
-		rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE);
+		rabh = gfs2_getbuf(gl, be64_to_cpu(*t), FGP_CREAT);
 		if (trylock_buffer(rabh)) {
 			if (!buffer_uptodate(rabh)) {
 				rabh->b_end_io = end_buffer_read_sync;
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 560e4624c09f2d..518a7fb42df044 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1500,7 +1500,7 @@ static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index,
 		if (blocknr == last)
 			continue;
 
-		bh = gfs2_getbuf(gl, blocknr, 1);
+		bh = gfs2_getbuf(gl, blocknr, FGP_CREAT);
 		if (trylock_buffer(bh)) {
 			if (buffer_uptodate(bh)) {
 				unlock_buffer(bh);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index f814054c8cd08a..785d9ef6f24a21 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -106,12 +106,15 @@ const struct address_space_operations gfs2_rgrp_aops = {
  * gfs2_getbuf - Get a buffer with a given address space
  * @gl: the glock
  * @blkno: the block number (filesystem scope)
- * @create: 1 if the buffer should be created
+ * @fgp_flags: Flags like FGP_CREAT and FGP_NOWAIT
+ *
+ * Returns ERR_PTR(-EAGAIN) if the FGP_NOWAIT flag is set and the function
+ * would sleep.
  *
  * Returns: the buffer
  */
-
-struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
+struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno,
+				fgf_t fgp_flags)
 {
 	struct address_space *mapping = gfs2_glock2aspace(gl);
 	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
@@ -128,17 +131,24 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
 	index = blkno >> shift;             /* convert block to page */
 	bufnum = blkno - (index << shift);  /* block buf index within page */
 
-	if (create) {
+	if (fgp_flags & FGP_CREAT) {
 		folio = __filemap_get_folio(mapping, index,
-				FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+				FGP_LOCK | FGP_ACCESSED | fgp_flags,
 				mapping_gfp_mask(mapping) | __GFP_NOFAIL);
+		if (IS_ERR(folio))
+			return ERR_CAST(folio);
 		bh = folio_buffers(folio);
-		if (!bh)
+		if (!bh) {
+			if (fgp_flags & FGP_NOWAIT) {
+				bh = ERR_PTR(-EAGAIN);
+				goto out_unlock;
+			}
 			bh = create_empty_buffers(folio,
 				sdp->sd_sb.sb_bsize, 0);
+		}
 	} else {
 		folio = __filemap_get_folio(mapping, index,
-				FGP_LOCK | FGP_ACCESSED, 0);
+				FGP_LOCK | FGP_ACCESSED | fgp_flags, 0);
 		if (IS_ERR(folio))
 			return NULL;
 		bh = folio_buffers(folio);
@@ -181,7 +191,7 @@ static void meta_prep_new(struct buffer_head *bh)
 struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
 {
 	struct buffer_head *bh;
-	bh = gfs2_getbuf(gl, blkno, CREATE);
+	bh = gfs2_getbuf(gl, blkno, FGP_CREAT);
 	meta_prep_new(bh);
 	return bh;
 }
@@ -258,7 +268,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
 		return -EIO;
 	}
 
-	*bhp = bh = gfs2_getbuf(gl, blkno, CREATE);
+	*bhp = bh = gfs2_getbuf(gl, blkno, FGP_CREAT);
 
 	lock_buffer(bh);
 	if (buffer_uptodate(bh)) {
@@ -271,7 +281,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
 	}
 
 	if (rahead) {
-		bh = gfs2_getbuf(gl, blkno + 1, CREATE);
+		bh = gfs2_getbuf(gl, blkno + 1, FGP_CREAT);
 
 		lock_buffer(bh);
 		if (buffer_uptodate(bh)) {
@@ -443,7 +453,7 @@ void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
 	gfs2_ail1_wipe(sdp, bstart, blen);
 	while (blen) {
 		ty = REMOVE_META;
-		bh = gfs2_getbuf(ip->i_gl, bstart, NO_CREATE);
+		bh = gfs2_getbuf(ip->i_gl, bstart, 0);
 		if (!bh && gfs2_is_jdata(ip)) {
 			bh = gfs2_getjdatabuf(ip, bstart);
 			ty = REMOVE_JDATA;
@@ -519,7 +529,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
 	if (extlen > max_ra)
 		extlen = max_ra;
 
-	first_bh = gfs2_getbuf(gl, dblock, CREATE);
+	first_bh = gfs2_getbuf(gl, dblock, FGP_CREAT);
 
 	if (buffer_uptodate(first_bh))
 		goto out;
@@ -529,7 +539,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
 	extlen--;
 
 	while (extlen) {
-		bh = gfs2_getbuf(gl, dblock, CREATE);
+		bh = gfs2_getbuf(gl, dblock, FGP_CREAT);
 
 		bh_readahead(bh, REQ_RAHEAD | REQ_META | REQ_PRIO);
 		brelse(bh);
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index 831d988c2ceb74..e239e410881c1b 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -55,7 +55,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
 		   int rahead, struct buffer_head **bhp);
 int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
 struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno,
-			        int create);
+				fgf_t fgp_flags);
 enum {
 	REMOVE_JDATA = 0,
 	REMOVE_META = 1,

From 80e9596fe900336e83ef06f6eb79e214a358ee5d Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Tue, 16 Jan 2024 00:09:44 +0100
Subject: [PATCH 139/707] gfs2: Split gfs2_meta_read_async off from
 gfs2_meta_read

Split gfs2_meta_read_async() off from gfs2_meta_read() and implement
gfs2_meta_read() in terms of gfs2_meta_read_async() and
gfs2_meta_wait().

The initial check for filesystem withdrawal in gfs2_meta_wait() is
unnecessary and can be removed because gfs2_meta_wait() always follows
gfs2_meta_read_async() in the code and gfs2_meta_read_async() already
performs that check.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Reviewed-by: Andrew Price <anprice@redhat.com>
---
 fs/gfs2/dir.c     |  6 +++---
 fs/gfs2/incore.h  |  1 -
 fs/gfs2/meta_io.c | 46 ++++++++++++++++++++++------------------------
 fs/gfs2/meta_io.h |  6 ++++--
 fs/gfs2/quota.c   |  2 +-
 fs/gfs2/rgrp.c    |  3 ++-
 fs/gfs2/xattr.c   | 13 ++++++-------
 7 files changed, 38 insertions(+), 39 deletions(-)

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 518a7fb42df044..446e374a8d78ca 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -105,7 +105,7 @@ static int gfs2_dir_get_existing_buffer(struct gfs2_inode *ip, u64 block,
 	struct buffer_head *bh;
 	int error;
 
-	error = gfs2_meta_read(ip->i_gl, block, DIO_WAIT, 0, &bh);
+	error = gfs2_meta_read(ip->i_gl, block, 0, &bh);
 	if (error)
 		return error;
 	if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_JD)) {
@@ -300,7 +300,7 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, __be64 *buf,
 			BUG_ON(extlen < 1);
 			bh = gfs2_meta_ra(ip->i_gl, dblock, extlen);
 		} else {
-			error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, 0, &bh);
+			error = gfs2_meta_read(ip->i_gl, dblock, 0, &bh);
 			if (error)
 				goto fail;
 		}
@@ -757,7 +757,7 @@ static int get_leaf(struct gfs2_inode *dip, u64 leaf_no,
 {
 	int error;
 
-	error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, 0, bhp);
+	error = gfs2_meta_read(dip->i_gl, leaf_no, 0, bhp);
 	if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) {
 		/* pr_info("block num=%llu\n", leaf_no); */
 		error = -EIO;
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 95a334d64da2a3..e1343fd0a5b108 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -22,7 +22,6 @@
 #include <linux/rhashtable.h>
 #include <linux/mutex.h>
 
-#define DIO_WAIT	0x00000010
 #define DIO_METADATA	0x00000020
 
 struct gfs2_log_operations;
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 785d9ef6f24a21..faa2bd7771060a 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -245,18 +245,17 @@ static void gfs2_submit_bhs(blk_opf_t opf, struct buffer_head *bhs[], int num)
 }
 
 /**
- * gfs2_meta_read - Read a block from disk
+ * gfs2_meta_read_async - Read a block from disk
  * @gl: The glock covering the block
  * @blkno: The block number
- * @flags: flags
  * @rahead: Do read-ahead
  * @bhp: the place where the buffer is returned (NULL on failure)
  *
  * Returns: errno
  */
 
-int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
-		   int rahead, struct buffer_head **bhp)
+int gfs2_meta_read_async(struct gfs2_glock *gl, u64 blkno,
+			 int rahead, struct buffer_head **bhp)
 {
 	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
 	struct buffer_head *bh, *bhs[2];
@@ -273,7 +272,6 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
 	lock_buffer(bh);
 	if (buffer_uptodate(bh)) {
 		unlock_buffer(bh);
-		flags &= ~DIO_WAIT;
 	} else {
 		bh->b_end_io = end_buffer_read_sync;
 		get_bh(bh);
@@ -294,20 +292,6 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
 	}
 
 	gfs2_submit_bhs(REQ_OP_READ | REQ_META | REQ_PRIO, bhs, num);
-	if (!(flags & DIO_WAIT))
-		return 0;
-
-	bh = *bhp;
-	wait_on_buffer(bh);
-	if (unlikely(!buffer_uptodate(bh))) {
-		struct gfs2_trans *tr = current->journal_info;
-		if (tr && test_bit(TR_TOUCHED, &tr->tr_flags))
-			gfs2_io_error_bh_wd(sdp, bh);
-		brelse(bh);
-		*bhp = NULL;
-		return -EIO;
-	}
-
 	return 0;
 }
 
@@ -321,10 +305,6 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
 
 int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh)
 {
-	if (gfs2_withdrawing_or_withdrawn(sdp) &&
-	    !gfs2_withdraw_in_prog(sdp))
-		return -EIO;
-
 	wait_on_buffer(bh);
 
 	if (!buffer_uptodate(bh)) {
@@ -340,6 +320,24 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh)
 	return 0;
 }
 
+int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno,
+		   int rahead, struct buffer_head **bhp)
+{
+	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
+	int ret;
+
+	ret = gfs2_meta_read_async(gl, blkno, rahead, bhp);
+	if (ret)
+		return ret;
+
+	ret = gfs2_meta_wait(sdp, *bhp);
+	if (ret) {
+		brelse(*bhp);
+		*bhp = NULL;
+	}
+	return ret;
+}
+
 void gfs2_remove_from_journal(struct buffer_head *bh, int meta)
 {
 	struct address_space *mapping = bh->b_folio->mapping;
@@ -496,7 +494,7 @@ int gfs2_meta_buffer(struct gfs2_inode *ip, u32 mtype, u64 num,
 	if (num == ip->i_no_addr)
 		rahead = ip->i_rahead;
 
-	ret = gfs2_meta_read(gl, num, DIO_WAIT, rahead, &bh);
+	ret = gfs2_meta_read(gl, num, rahead, &bh);
 	if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) {
 		brelse(bh);
 		ret = -EIO;
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index e239e410881c1b..f04c91eadfb4ae 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -51,9 +51,11 @@ static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
 }
 
 struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
-int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
-		   int rahead, struct buffer_head **bhp);
+int gfs2_meta_read_async(struct gfs2_glock *gl, u64 blkno,
+			 int rahead, struct buffer_head **bhp);
 int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
+int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno,
+		   int rahead, struct buffer_head **bhp);
 struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno,
 				fgf_t fgp_flags);
 enum {
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index aa9cf010284897..a68af8bdf7de44 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -424,7 +424,7 @@ static int bh_get(struct gfs2_quota_data *qd)
 		goto fail;
 
 	error = gfs2_meta_read(ip->i_gl, iomap.addr >> inode->i_blkbits,
-			       DIO_WAIT, 0, &bh);
+			       0, &bh);
 	if (error)
 		goto fail;
 	error = -EIO;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 26d6c1eea55919..fcef82c767e346 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1210,7 +1210,8 @@ int gfs2_rgrp_go_instantiate(struct gfs2_glock *gl)
 
 	for (x = 0; x < length; x++) {
 		bi = rgd->rd_bits + x;
-		error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, 0, &bi->bi_bh);
+		error = gfs2_meta_read_async(gl, rgd->rd_addr + x, 0,
+					     &bi->bi_bh);
 		if (error)
 			goto fail;
 	}
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 8c96ba6230d1b9..759347ec67af5c 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -128,7 +128,7 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
 	__be64 *eablk, *end;
 	int error;
 
-	error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0, &bh);
+	error = gfs2_meta_read(ip->i_gl, ip->i_eattr, 0, &bh);
 	if (error)
 		return error;
 
@@ -152,7 +152,7 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
 			break;
 		bn = be64_to_cpu(*eablk);
 
-		error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, 0, &eabh);
+		error = gfs2_meta_read(ip->i_gl, bn, 0, &eabh);
 		if (error)
 			break;
 		error = ea_foreach_i(ip, eabh, ea_call, data);
@@ -468,8 +468,8 @@ static int gfs2_iter_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
 		return -ENOMEM;
 
 	for (x = 0; x < nptrs; x++) {
-		error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0, 0,
-				       bh + x);
+		error = gfs2_meta_read_async(ip->i_gl, be64_to_cpu(*dataptrs),
+					     0, bh + x);
 		if (error) {
 			while (x--)
 				brelse(bh[x]);
@@ -976,8 +976,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 	if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
 		__be64 *end;
 
-		error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0,
-				       &indbh);
+		error = gfs2_meta_read(ip->i_gl, ip->i_eattr, 0, &indbh);
 		if (error)
 			return error;
 
@@ -1279,7 +1278,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
 
 	memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
 
-	error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0, &indbh);
+	error = gfs2_meta_read(ip->i_gl, ip->i_eattr, 0, &indbh);
 	if (error)
 		return error;
 

From 224e701e74ec6cd44f6f80352bbfef03e406720b Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Tue, 16 Jan 2024 14:22:21 +0100
Subject: [PATCH 140/707] gfs2: Add FGP_NOWAIT support to gfs2_meta_read_async

Add an fgp_flags argument to gfs2_meta_read_async() and gfs2_meta_read()
that is passed through to gfs2_getbuf().  When the FGP_NOWAIT flag is
set and gfs2_meta_read_async() would sleep, -EAGAIN is returned instead.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Reviewed-by: Andrew Price <anprice@redhat.com>
---
 fs/gfs2/dir.c     |  6 +++---
 fs/gfs2/meta_io.c | 41 ++++++++++++++++++++++++++++++++++-------
 fs/gfs2/meta_io.h |  4 ++--
 fs/gfs2/quota.c   |  2 +-
 fs/gfs2/rgrp.c    |  2 +-
 fs/gfs2/xattr.c   | 10 +++++-----
 6 files changed, 46 insertions(+), 19 deletions(-)

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 446e374a8d78ca..a1f47696a2ecfb 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -105,7 +105,7 @@ static int gfs2_dir_get_existing_buffer(struct gfs2_inode *ip, u64 block,
 	struct buffer_head *bh;
 	int error;
 
-	error = gfs2_meta_read(ip->i_gl, block, 0, &bh);
+	error = gfs2_meta_read(ip->i_gl, block, 0, 0, &bh);
 	if (error)
 		return error;
 	if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_JD)) {
@@ -300,7 +300,7 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, __be64 *buf,
 			BUG_ON(extlen < 1);
 			bh = gfs2_meta_ra(ip->i_gl, dblock, extlen);
 		} else {
-			error = gfs2_meta_read(ip->i_gl, dblock, 0, &bh);
+			error = gfs2_meta_read(ip->i_gl, dblock, 0, 0, &bh);
 			if (error)
 				goto fail;
 		}
@@ -757,7 +757,7 @@ static int get_leaf(struct gfs2_inode *dip, u64 leaf_no,
 {
 	int error;
 
-	error = gfs2_meta_read(dip->i_gl, leaf_no, 0, bhp);
+	error = gfs2_meta_read(dip->i_gl, leaf_no, 0, 0, bhp);
 	if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) {
 		/* pr_info("block num=%llu\n", leaf_no); */
 		error = -EIO;
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index faa2bd7771060a..0827e1f58a5a03 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -248,13 +248,16 @@ static void gfs2_submit_bhs(blk_opf_t opf, struct buffer_head *bhs[], int num)
  * gfs2_meta_read_async - Read a block from disk
  * @gl: The glock covering the block
  * @blkno: The block number
+ * @fgp_flags: FGP_NOWAIT if sleeping is prohibited
  * @rahead: Do read-ahead
  * @bhp: the place where the buffer is returned (NULL on failure)
  *
+ * Returns -EAGAIN if the FGP_NOWAIT flag is set and the function would sleep.
+ *
  * Returns: errno
  */
 
-int gfs2_meta_read_async(struct gfs2_glock *gl, u64 blkno,
+int gfs2_meta_read_async(struct gfs2_glock *gl, u64 blkno, fgf_t fgp_flags,
 			 int rahead, struct buffer_head **bhp)
 {
 	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
@@ -267,7 +270,28 @@ int gfs2_meta_read_async(struct gfs2_glock *gl, u64 blkno,
 		return -EIO;
 	}
 
-	*bhp = bh = gfs2_getbuf(gl, blkno, FGP_CREAT);
+	*bhp = bh = gfs2_getbuf(gl, blkno, FGP_CREAT | fgp_flags);
+	if (IS_ERR(bh)) {
+		*bhp = NULL;
+		return PTR_ERR(bh);
+	}
+
+	if (fgp_flags & FGP_NOWAIT) {
+		bool uptodate = false;
+
+		/* skips readahead entirely. */
+
+		if (trylock_buffer(bh)) {
+			uptodate = buffer_uptodate(bh);
+			unlock_buffer(bh);
+		}
+		if (!uptodate) {
+			brelse(bh);
+			*bhp = NULL;
+			return -EAGAIN;
+		}
+		return 0;
+	}
 
 	lock_buffer(bh);
 	if (buffer_uptodate(bh)) {
@@ -279,7 +303,9 @@ int gfs2_meta_read_async(struct gfs2_glock *gl, u64 blkno,
 	}
 
 	if (rahead) {
-		bh = gfs2_getbuf(gl, blkno + 1, FGP_CREAT);
+		bh = gfs2_getbuf(gl, blkno + 1, FGP_CREAT | fgp_flags);
+		if (IS_ERR(bh))
+			goto out;
 
 		lock_buffer(bh);
 		if (buffer_uptodate(bh)) {
@@ -291,6 +317,7 @@ int gfs2_meta_read_async(struct gfs2_glock *gl, u64 blkno,
 		}
 	}
 
+out:
 	gfs2_submit_bhs(REQ_OP_READ | REQ_META | REQ_PRIO, bhs, num);
 	return 0;
 }
@@ -320,14 +347,14 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh)
 	return 0;
 }
 
-int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno,
+int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, fgf_t fgp_flags,
 		   int rahead, struct buffer_head **bhp)
 {
 	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
 	int ret;
 
-	ret = gfs2_meta_read_async(gl, blkno, rahead, bhp);
-	if (ret)
+	ret = gfs2_meta_read_async(gl, blkno, fgp_flags, rahead, bhp);
+	if (ret || (fgp_flags & FGP_NOWAIT))
 		return ret;
 
 	ret = gfs2_meta_wait(sdp, *bhp);
@@ -494,7 +521,7 @@ int gfs2_meta_buffer(struct gfs2_inode *ip, u32 mtype, u64 num,
 	if (num == ip->i_no_addr)
 		rahead = ip->i_rahead;
 
-	ret = gfs2_meta_read(gl, num, rahead, &bh);
+	ret = gfs2_meta_read(gl, num, 0, rahead, &bh);
 	if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) {
 		brelse(bh);
 		ret = -EIO;
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index f04c91eadfb4ae..6ca37e9f1c955e 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -51,10 +51,10 @@ static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
 }
 
 struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
-int gfs2_meta_read_async(struct gfs2_glock *gl, u64 blkno,
+int gfs2_meta_read_async(struct gfs2_glock *gl, u64 blkno, fgf_t fgp_flags,
 			 int rahead, struct buffer_head **bhp);
 int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
-int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno,
+int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, fgf_t fgp_flags,
 		   int rahead, struct buffer_head **bhp);
 struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno,
 				fgf_t fgp_flags);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index a68af8bdf7de44..8b47306816ab74 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -424,7 +424,7 @@ static int bh_get(struct gfs2_quota_data *qd)
 		goto fail;
 
 	error = gfs2_meta_read(ip->i_gl, iomap.addr >> inode->i_blkbits,
-			       0, &bh);
+			       0, 0, &bh);
 	if (error)
 		goto fail;
 	error = -EIO;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index fcef82c767e346..9fa4fc7f4bcfe9 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1210,7 +1210,7 @@ int gfs2_rgrp_go_instantiate(struct gfs2_glock *gl)
 
 	for (x = 0; x < length; x++) {
 		bi = rgd->rd_bits + x;
-		error = gfs2_meta_read_async(gl, rgd->rd_addr + x, 0,
+		error = gfs2_meta_read_async(gl, rgd->rd_addr + x, 0, 0,
 					     &bi->bi_bh);
 		if (error)
 			goto fail;
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 759347ec67af5c..745c7cf7851918 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -128,7 +128,7 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
 	__be64 *eablk, *end;
 	int error;
 
-	error = gfs2_meta_read(ip->i_gl, ip->i_eattr, 0, &bh);
+	error = gfs2_meta_read(ip->i_gl, ip->i_eattr, 0, 0, &bh);
 	if (error)
 		return error;
 
@@ -152,7 +152,7 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
 			break;
 		bn = be64_to_cpu(*eablk);
 
-		error = gfs2_meta_read(ip->i_gl, bn, 0, &eabh);
+		error = gfs2_meta_read(ip->i_gl, bn, 0, 0, &eabh);
 		if (error)
 			break;
 		error = ea_foreach_i(ip, eabh, ea_call, data);
@@ -469,7 +469,7 @@ static int gfs2_iter_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
 
 	for (x = 0; x < nptrs; x++) {
 		error = gfs2_meta_read_async(ip->i_gl, be64_to_cpu(*dataptrs),
-					     0, bh + x);
+					     0, 0, bh + x);
 		if (error) {
 			while (x--)
 				brelse(bh[x]);
@@ -976,7 +976,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 	if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
 		__be64 *end;
 
-		error = gfs2_meta_read(ip->i_gl, ip->i_eattr, 0, &indbh);
+		error = gfs2_meta_read(ip->i_gl, ip->i_eattr, 0, 0, &indbh);
 		if (error)
 			return error;
 
@@ -1278,7 +1278,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
 
 	memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
 
-	error = gfs2_meta_read(ip->i_gl, ip->i_eattr, 0, &indbh);
+	error = gfs2_meta_read(ip->i_gl, ip->i_eattr, 0, 0, &indbh);
 	if (error)
 		return error;
 

From add350d5501e99ef7229d08e47f2790d578ead1d Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Tue, 16 Jan 2024 15:00:16 +0100
Subject: [PATCH 141/707] gfs2: Pass FGP flags to gfs2_meta_{,inode_}buffer

Pass a fgp_flags argument to gfs2_meta_buffer() and
gfs2_meta_inode_buffer().  If the FGP_NOWAIT flag is set and one of
these functions would sleep, -EAGAIN is returned instead.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Reviewed-by: Andrew Price <anprice@redhat.com>
---
 fs/gfs2/aops.c       |  4 ++--
 fs/gfs2/bmap.c       | 19 ++++++++++---------
 fs/gfs2/dir.c        | 20 ++++++++++----------
 fs/gfs2/file.c       |  4 ++--
 fs/gfs2/glops.c      |  2 +-
 fs/gfs2/inode.c      |  4 ++--
 fs/gfs2/meta_io.c    |  7 +++++--
 fs/gfs2/meta_io.h    |  7 ++++---
 fs/gfs2/ops_fstype.c |  2 +-
 fs/gfs2/recovery.c   |  2 +-
 fs/gfs2/rgrp.c       |  2 +-
 fs/gfs2/super.c      |  6 +++---
 fs/gfs2/xattr.c      |  4 ++--
 13 files changed, 44 insertions(+), 39 deletions(-)

diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 974aca9c8ea84f..31da7f6926c966 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -424,7 +424,7 @@ static int stuffed_read_folio(struct gfs2_inode *ip, struct folio *folio)
 	if (unlikely(folio->index)) {
 		dsize = 0;
 	} else {
-		error = gfs2_meta_inode_buffer(ip, &dibh);
+		error = gfs2_meta_inode_buffer(ip, 0, &dibh);
 		if (error)
 			goto out;
 		from = dibh->b_data + sizeof(struct gfs2_dinode);
@@ -548,7 +548,7 @@ void adjust_fs_space(struct inode *inode)
 
 	/* Total up the file system space, according to the latest rindex. */
 	fs_total = gfs2_ri_total(sdp);
-	if (gfs2_meta_inode_buffer(m_ip, &m_bh) != 0)
+	if (gfs2_meta_inode_buffer(m_ip, 0, &m_bh) != 0)
 		goto out;
 
 	spin_lock(&sdp->sd_statfs_spin);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 92945e5b764348..ffe19d3012f5f0 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -95,7 +95,7 @@ static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct folio *folio)
 	int isdir = gfs2_is_dir(ip);
 	int error;
 
-	error = gfs2_meta_inode_buffer(ip, &dibh);
+	error = gfs2_meta_inode_buffer(ip, 0, &dibh);
 	if (error)
 		return error;
 
@@ -331,7 +331,8 @@ static int __fillup_metapath(struct gfs2_inode *ip, struct metapath *mp,
 
 		if (!dblock)
 			break;
-		ret = gfs2_meta_buffer(ip, GFS2_METATYPE_IN, dblock, &mp->mp_bh[x + 1]);
+		ret = gfs2_meta_buffer(ip, GFS2_METATYPE_IN, dblock, 0,
+				       &mp->mp_bh[x + 1]);
 		if (ret)
 			return ret;
 	}
@@ -858,7 +859,7 @@ static int __gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
 
 	down_read(&ip->i_rw_mutex);
 
-	ret = gfs2_meta_inode_buffer(ip, &dibh);
+	ret = gfs2_meta_inode_buffer(ip, 0, &dibh);
 	if (ret)
 		goto unlock;
 	mp->mp_bh[0] = dibh;
@@ -1377,7 +1378,7 @@ static int trunc_start(struct inode *inode, u64 newsize)
 	if (error)
 		return error;
 
-	error = gfs2_meta_inode_buffer(ip, &dibh);
+	error = gfs2_meta_inode_buffer(ip, 0, &dibh);
 	if (error)
 		goto out;
 
@@ -1580,7 +1581,7 @@ static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh,
 		if (current->journal_info) {
 			struct buffer_head *dibh;
 
-			ret = gfs2_meta_inode_buffer(ip, &dibh);
+			ret = gfs2_meta_inode_buffer(ip, 0, &dibh);
 			if (ret)
 				goto out;
 
@@ -1785,7 +1786,7 @@ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
 	}
 	start_aligned = mp_h;
 
-	ret = gfs2_meta_inode_buffer(ip, &dibh);
+	ret = gfs2_meta_inode_buffer(ip, 0, &dibh);
 	if (ret)
 		return ret;
 
@@ -1985,7 +1986,7 @@ static int trunc_end(struct gfs2_inode *ip)
 
 	down_write(&ip->i_rw_mutex);
 
-	error = gfs2_meta_inode_buffer(ip, &dibh);
+	error = gfs2_meta_inode_buffer(ip, 0, &dibh);
 	if (error)
 		goto out;
 
@@ -2091,7 +2092,7 @@ static int do_grow(struct inode *inode, u64 size)
 			goto do_end_trans;
 	}
 
-	error = gfs2_meta_inode_buffer(ip, &dibh);
+	error = gfs2_meta_inode_buffer(ip, 0, &dibh);
 	if (error)
 		goto do_end_trans;
 
@@ -2345,7 +2346,7 @@ static int stuffed_zero_range(struct inode *inode, loff_t offset, loff_t length)
 	if (offset + length > inode->i_size)
 		length = inode->i_size - offset;
 
-	error = gfs2_meta_inode_buffer(ip, &dibh);
+	error = gfs2_meta_inode_buffer(ip, 0, &dibh);
 	if (error)
 		return error;
 	gfs2_trans_add_meta(ip->i_gl, dibh);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index a1f47696a2ecfb..8719c8316e2801 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -122,7 +122,7 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
 	struct buffer_head *dibh;
 	int error;
 
-	error = gfs2_meta_inode_buffer(ip, &dibh);
+	error = gfs2_meta_inode_buffer(ip, 0, &dibh);
 	if (error)
 		return error;
 
@@ -221,7 +221,7 @@ static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf,
 	}
 
 out:
-	error = gfs2_meta_inode_buffer(ip, &dibh);
+	error = gfs2_meta_inode_buffer(ip, 0, &dibh);
 	if (error)
 		return error;
 
@@ -246,7 +246,7 @@ static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, __be64 *buf,
 	struct buffer_head *dibh;
 	int error;
 
-	error = gfs2_meta_inode_buffer(ip, &dibh);
+	error = gfs2_meta_inode_buffer(ip, 0, &dibh);
 	if (!error) {
 		memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), size);
 		brelse(dibh);
@@ -844,7 +844,7 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
 	}
 
 
-	error = gfs2_meta_inode_buffer(ip, &bh);
+	error = gfs2_meta_inode_buffer(ip, 0, &bh);
 	if (error)
 		return ERR_PTR(error);
 	dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, scan, name, NULL);
@@ -915,7 +915,7 @@ static int dir_make_exhash(struct inode *inode)
 	u64 bn;
 	int error;
 
-	error = gfs2_meta_inode_buffer(dip, &dibh);
+	error = gfs2_meta_inode_buffer(dip, 0, &dibh);
 	if (error)
 		return error;
 
@@ -1115,7 +1115,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
 
 	oleaf->lf_depth = nleaf->lf_depth;
 
-	error = gfs2_meta_inode_buffer(dip, &dibh);
+	error = gfs2_meta_inode_buffer(dip, 0, &dibh);
 	if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) {
 		gfs2_trans_add_meta(dip->i_gl, dibh);
 		gfs2_add_inode_blocks(&dip->i_inode, 1);
@@ -1169,7 +1169,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
 		return -ENOMEM;
 
 	h = hc2;
-	error = gfs2_meta_inode_buffer(dip, &dibh);
+	error = gfs2_meta_inode_buffer(dip, 0, &dibh);
 	if (error)
 		goto out_kfree;
 
@@ -1586,7 +1586,7 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
 		return -EIO;
 	}
 
-	error = gfs2_meta_inode_buffer(dip, &dibh);
+	error = gfs2_meta_inode_buffer(dip, 0, &dibh);
 	if (error)
 		return error;
 
@@ -1758,7 +1758,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
 	brelse(bh);
 	brelse(obh);
 
-	error = gfs2_meta_inode_buffer(ip, &bh);
+	error = gfs2_meta_inode_buffer(ip, 0, &bh);
 	if (error)
 		return error;
 	gfs2_trans_add_meta(ip->i_gl, bh);
@@ -2063,7 +2063,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
 		goto out_end_trans;
 	}
 
-	error = gfs2_meta_inode_buffer(dip, &dibh);
+	error = gfs2_meta_inode_buffer(dip, 0, &dibh);
 	if (error)
 		goto out_end_trans;
 
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 992ca4effb505e..ac4ed6abfc6c24 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -257,7 +257,7 @@ static int do_gfs2_set_flags(struct inode *inode, u32 reqflags, u32 mask)
 	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
 	if (error)
 		goto out;
-	error = gfs2_meta_inode_buffer(ip, &bh);
+	error = gfs2_meta_inode_buffer(ip, 0, &bh);
 	if (error)
 		goto out_trans_end;
 	inode_set_ctime_current(inode);
@@ -1179,7 +1179,7 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
 	struct buffer_head *dibh;
 	int error;
 
-	error = gfs2_meta_inode_buffer(ip, &dibh);
+	error = gfs2_meta_inode_buffer(ip, 0, &dibh);
 	if (unlikely(error))
 		return error;
 
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 45653cbc8a87d1..cc38aa81d2467d 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -483,7 +483,7 @@ int gfs2_inode_refresh(struct gfs2_inode *ip)
 	struct buffer_head *dibh;
 	int error;
 
-	error = gfs2_meta_inode_buffer(ip, &dibh);
+	error = gfs2_meta_inode_buffer(ip, 0, &dibh);
 	if (error)
 		return error;
 
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 6bfc9383b7b8ec..a417650e378f96 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1027,7 +1027,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
 			goto out_ipres;
 	}
 
-	error = gfs2_meta_inode_buffer(ip, &dibh);
+	error = gfs2_meta_inode_buffer(ip, 0, &dibh);
 	if (error)
 		goto out_end_trans;
 
@@ -1833,7 +1833,7 @@ static const char *gfs2_get_link(struct dentry *dentry,
 		goto out;
 	}
 
-	error = gfs2_meta_inode_buffer(ip, &dibh);
+	error = gfs2_meta_inode_buffer(ip, 0, &dibh);
 	if (error) {
 		buf = ERR_PTR(error);
 		goto out;
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 0827e1f58a5a03..c3c04898971f9c 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -504,13 +504,16 @@ void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
  * @ip: The GFS2 inode
  * @mtype: The block type (GFS2_METATYPE_*)
  * @num: The block number (device relative) of the buffer
+ * @fgp_flags: FGP_NOWAIT if sleeping is prohibited
  * @bhp: the buffer is returned here
  *
+ * Returns -EAGAIN if the FGP_NOWAIT flag is set and the function would sleep.
+ *
  * Returns: errno
  */
 
 int gfs2_meta_buffer(struct gfs2_inode *ip, u32 mtype, u64 num,
-		     struct buffer_head **bhp)
+		     fgf_t fgp_flags, struct buffer_head **bhp)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_glock *gl = ip->i_gl;
@@ -521,7 +524,7 @@ int gfs2_meta_buffer(struct gfs2_inode *ip, u32 mtype, u64 num,
 	if (num == ip->i_no_addr)
 		rahead = ip->i_rahead;
 
-	ret = gfs2_meta_read(gl, num, 0, rahead, &bh);
+	ret = gfs2_meta_read(gl, num, fgp_flags, rahead, &bh);
 	if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) {
 		brelse(bh);
 		ret = -EIO;
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index 6ca37e9f1c955e..7b51220781138e 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -66,12 +66,13 @@ enum {
 void gfs2_remove_from_journal(struct buffer_head *bh, int meta);
 void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen);
 int gfs2_meta_buffer(struct gfs2_inode *ip, u32 mtype, u64 num,
-		     struct buffer_head **bhp);
+		     fgf_t fgp_flags, struct buffer_head **bhp);
 
-static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip,
+static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip, fgf_t fgp_flags,
 					 struct buffer_head **bhp)
 {
-	return gfs2_meta_buffer(ip, GFS2_METATYPE_DI, ip->i_no_addr, bhp);
+	return gfs2_meta_buffer(ip, GFS2_METATYPE_DI, ip->i_no_addr, fgp_flags,
+				bhp);
 }
 
 struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 1281e60be63900..1a01adb48f5bbb 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -698,7 +698,7 @@ static int init_statfs(struct gfs2_sbd *sdp)
 		goto free_local;
 	}
 	/* read in the local statfs buffer - other nodes don't change it. */
-	error = gfs2_meta_inode_buffer(ip, &sdp->sd_sc_bh);
+	error = gfs2_meta_inode_buffer(ip, 0, &sdp->sd_sc_bh);
 	if (error) {
 		fs_err(sdp, "Cannot read in local statfs: %d\n", error);
 		goto unlock_sd_gh;
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index f4fe7039f725b0..4db3ca9c3b0270 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -317,7 +317,7 @@ static int update_statfs_inode(struct gfs2_jdesc *jd,
 	BUG_ON(!inode);
 	ip = GFS2_I(inode);
 
-	error = gfs2_meta_inode_buffer(ip, &bh);
+	error = gfs2_meta_inode_buffer(ip, 0, &bh);
 	if (error)
 		goto out;
 
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 9fa4fc7f4bcfe9..c9aec3dba80c6d 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -2452,7 +2452,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
 	rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0;
 	if (!dinode) {
 		ip->i_goal = block + *nblocks - 1;
-		error = gfs2_meta_inode_buffer(ip, &dibh);
+		error = gfs2_meta_inode_buffer(ip, 0, &dibh);
 		if (error == 0) {
 			struct gfs2_dinode *di =
 				(struct gfs2_dinode *)dibh->b_data;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index e5f79466340d2a..7bfdd5924a5457 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -192,7 +192,7 @@ int gfs2_statfs_init(struct gfs2_sbd *sdp)
 	if (error)
 		return error;
 
-	error = gfs2_meta_inode_buffer(m_ip, &m_bh);
+	error = gfs2_meta_inode_buffer(m_ip, 0, &m_bh);
 	if (error)
 		goto out;
 
@@ -282,7 +282,7 @@ int gfs2_statfs_sync(struct super_block *sb, int type)
 	if (error)
 		goto out;
 
-	error = gfs2_meta_inode_buffer(m_ip, &m_bh);
+	error = gfs2_meta_inode_buffer(m_ip, 0, &m_bh);
 	if (error)
 		goto out_unlock;
 
@@ -521,7 +521,7 @@ static void gfs2_dirty_inode(struct inode *inode, int flags)
 		need_endtrans = 1;
 	}
 
-	ret = gfs2_meta_inode_buffer(ip, &bh);
+	ret = gfs2_meta_inode_buffer(ip, 0, &bh);
 	if (ret == 0) {
 		gfs2_trans_add_meta(ip->i_gl, bh);
 		gfs2_dinode_out(ip, bh->b_data);
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 745c7cf7851918..e2ff0fc8e4459a 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1360,7 +1360,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
 
 	ip->i_diskflags &= ~GFS2_DIF_EA_INDIRECT;
 
-	error = gfs2_meta_inode_buffer(ip, &dibh);
+	error = gfs2_meta_inode_buffer(ip, 0, &dibh);
 	if (!error) {
 		gfs2_trans_add_meta(ip->i_gl, dibh);
 		gfs2_dinode_out(ip, dibh->b_data);
@@ -1412,7 +1412,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip)
 	gfs2_add_inode_blocks(&ip->i_inode, -1);
 
 	if (likely(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags))) {
-		error = gfs2_meta_inode_buffer(ip, &dibh);
+		error = gfs2_meta_inode_buffer(ip, 0, &dibh);
 		if (!error) {
 			gfs2_trans_add_meta(ip->i_gl, dibh);
 			gfs2_dinode_out(ip, dibh->b_data);

From 2afca0b0afc4a300fc5556c6d75c6e641493342f Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Tue, 16 Jan 2024 21:44:31 +0100
Subject: [PATCH 142/707] gfs2: Pass FGP flags to gfs2_dirent_search

Add an fgp_flags argument to gfs2_dirent_search().  When the FGP_NOWAIT
flag is set and gfs2_dirent_search() would sleep, -EAGAIN is returned
instead.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Reviewed-by: Andrew Price <anprice@redhat.com>
---
 fs/gfs2/dir.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 8719c8316e2801..81d1ea61f44f3e 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -805,6 +805,7 @@ static int get_first_leaf(struct gfs2_inode *dip, u32 index,
 static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
 					      const struct qstr *name,
 					      gfs2_dscan_t scan,
+					      fgf_t fgp_flags,
 					      struct buffer_head **pbh)
 {
 	struct buffer_head *bh;
@@ -817,6 +818,11 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
 		unsigned int hsize = BIT(ip->i_depth);
 		unsigned int index;
 		u64 ln;
+
+		/* no lockless lookup inside ExHash directories */
+		if (fgp_flags & FGP_NOWAIT)
+			return ERR_PTR(-EAGAIN);
+
 		if (hsize * sizeof(u64) != i_size_read(inode)) {
 			gfs2_consist_inode(ip);
 			return ERR_PTR(-EIO);
@@ -843,8 +849,7 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
 		return error ? ERR_PTR(error) : NULL;
 	}
 
-
-	error = gfs2_meta_inode_buffer(ip, 0, &bh);
+	error = gfs2_meta_inode_buffer(ip, fgp_flags, &bh);
 	if (error)
 		return ERR_PTR(error);
 	dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, scan, name, NULL);
@@ -1647,7 +1652,7 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name,
 	u64 addr, formal_ino;
 	u16 dtype;
 
-	dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
+	dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, 0, &bh);
 	if (dent) {
 		struct inode *inode;
 		u16 rahead;
@@ -1677,7 +1682,7 @@ int gfs2_dir_check(struct inode *dir, const struct qstr *name,
 	struct gfs2_dirent *dent;
 	int ret = -ENOENT;
 
-	dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
+	dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, 0, &bh);
 	if (dent) {
 		if (IS_ERR(dent))
 			return PTR_ERR(dent);
@@ -1805,7 +1810,8 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
 	while(1) {
 		if (da->bh == NULL) {
 			dent = gfs2_dirent_search(inode, name,
-						  gfs2_dirent_find_space, &bh);
+						  gfs2_dirent_find_space, 0,
+						  &bh);
 		}
 		if (dent) {
 			if (IS_ERR(dent))
@@ -1880,7 +1886,8 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
 
 	/* Returns _either_ the entry (if its first in block) or the
 	   previous entry otherwise */
-	dent = gfs2_dirent_search(&dip->i_inode, name, gfs2_dirent_prev, &bh);
+	dent = gfs2_dirent_search(&dip->i_inode, name, gfs2_dirent_prev, 0,
+				  &bh);
 	if (!dent) {
 		gfs2_consist_inode(dip);
 		return -EIO;
@@ -1939,7 +1946,8 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
 	struct buffer_head *bh;
 	struct gfs2_dirent *dent;
 
-	dent = gfs2_dirent_search(&dip->i_inode, filename, gfs2_dirent_find, &bh);
+	dent = gfs2_dirent_search(&dip->i_inode, filename, gfs2_dirent_find, 0,
+				  &bh);
 	if (!dent) {
 		gfs2_consist_inode(dip);
 		return -EIO;
@@ -2166,7 +2174,7 @@ int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name,
 	da->bh = NULL;
 	da->dent = NULL;
 
-	dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh);
+	dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, 0, &bh);
 	if (!dent) {
 		da->nr_blocks = sdp->sd_max_dirres;
 		if (!(ip->i_diskflags & GFS2_DIF_EXHASH) &&

From 77c59cfc8ac1d696cd48e8ac438f65a117399bd4 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Tue, 16 Jan 2024 23:10:08 +0100
Subject: [PATCH 143/707] gfs2: Pass FGP flags to gfs2_dir_check

Pass a fgp_flags argument to gfs2_dir_check().  If the FGP_NOWAIT flag
is set and gfs2_dir_check() would sleep, -EAGAIN is returned instead.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Reviewed-by: Andrew Price <anprice@redhat.com>
---
 fs/gfs2/dentry.c     | 2 +-
 fs/gfs2/dir.c        | 4 ++--
 fs/gfs2/dir.h        | 2 +-
 fs/gfs2/inode.c      | 6 +++---
 fs/gfs2/ops_fstype.c | 2 +-
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index c6483fb9862450..f179a6d94cbefe 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -72,7 +72,7 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags)
 			goto out;
 	}
 
-	error = gfs2_dir_check(dinode, &dentry->d_name, ip);
+	error = gfs2_dir_check(dinode, &dentry->d_name, ip, 0);
 	valid = inode ? !error : (error == -ENOENT);
 
 	if (!had_lock)
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 81d1ea61f44f3e..d3a9c9094db085 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1676,13 +1676,13 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name,
 }
 
 int gfs2_dir_check(struct inode *dir, const struct qstr *name,
-		   const struct gfs2_inode *ip)
+		   const struct gfs2_inode *ip, fgf_t fgp_flags)
 {
 	struct buffer_head *bh;
 	struct gfs2_dirent *dent;
 	int ret = -ENOENT;
 
-	dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, 0, &bh);
+	dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, fgp_flags, &bh);
 	if (dent) {
 		if (IS_ERR(dent))
 			return PTR_ERR(dent);
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 25a857c78b538b..cfec869a11c8cc 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -27,7 +27,7 @@ struct inode *gfs2_dir_search(struct inode *dir,
 			      const struct qstr *filename,
 			      bool fail_on_exist);
 int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
-		   const struct gfs2_inode *ip);
+		   const struct gfs2_inode *ip, fgf_t fgp_flags);
 int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
 		 const struct gfs2_inode *ip, struct gfs2_diradd *da);
 static inline void gfs2_dir_no_add(struct gfs2_diradd *da)
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index a417650e378f96..948f5c203a3860 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -980,7 +980,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
 	if (error)
 		goto out_gunlock;
 
-	error = gfs2_dir_check(dir, &dentry->d_name, NULL);
+	error = gfs2_dir_check(dir, &dentry->d_name, NULL, 0);
 	switch (error) {
 	case -ENOENT:
 		break;
@@ -1096,7 +1096,7 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
 	if (error)
 		return error;
 
-	return gfs2_dir_check(&dip->i_inode, name, ip);
+	return gfs2_dir_check(&dip->i_inode, name, ip, 0);
 }
 
 /**
@@ -1522,7 +1522,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 		if (error)
 			goto out_gunlock;
 
-		error = gfs2_dir_check(ndir, &ndentry->d_name, NULL);
+		error = gfs2_dir_check(ndir, &ndentry->d_name, NULL, 0);
 		switch (error) {
 		case -ENOENT:
 			error = 0;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 1a01adb48f5bbb..f6c80ff618cc5d 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -584,7 +584,7 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
 		name.len = sprintf(buf, "journal%u", sdp->sd_journals);
 		name.hash = gfs2_disk_hash(name.name, name.len);
 
-		error = gfs2_dir_check(sdp->sd_jindex, &name, NULL);
+		error = gfs2_dir_check(sdp->sd_jindex, &name, NULL, 0);
 		if (error == -ENOENT) {
 			error = 0;
 			break;

From 949eda30775ddda2704ad72c999e6d4112957fb3 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Wed, 17 Jan 2024 13:49:19 +0100
Subject: [PATCH 144/707] gfs2: Minor gfs2_drevalidate cleanup

Get rid of the had_lock and valid variables in gfs2_drevalidate().

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Reviewed-by: Andrew Price <anprice@redhat.com>
---
 fs/gfs2/dentry.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index f179a6d94cbefe..99239d80982b58 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -38,8 +38,9 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags)
 	struct inode *dinode, *inode;
 	struct gfs2_holder d_gh;
 	struct gfs2_inode *ip = NULL;
-	int error, valid = 0;
-	int had_lock = 0;
+	int error;
+
+	gfs2_holder_mark_uninitialized(&d_gh);
 
 	if (flags & LOOKUP_RCU) {
 		dinode = d_inode_rcu(READ_ONCE(dentry->d_parent));
@@ -54,18 +55,19 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags)
 	inode = d_inode(dentry);
 
 	if (inode) {
-		if (is_bad_inode(inode))
+		if (is_bad_inode(inode)) {
+			error = 0;
 			goto out;
+		}
 		ip = GFS2_I(inode);
 	}
 
 	if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) {
-		valid = 1;
+		error = 1;
 		goto out;
 	}
 
-	had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL);
-	if (!had_lock) {
+	if (!gfs2_glock_is_locked_by_me(dip->i_gl)) {
 		error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED,
 					   flags & LOOKUP_RCU ? GL_NOBLOCK : 0, &d_gh);
 		if (error)
@@ -73,13 +75,13 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags)
 	}
 
 	error = gfs2_dir_check(dinode, &dentry->d_name, ip, 0);
-	valid = inode ? !error : (error == -ENOENT);
+	error = inode ? !error : (error == -ENOENT);
 
-	if (!had_lock)
+	if (gfs2_holder_initialized(&d_gh))
 		gfs2_glock_dq_uninit(&d_gh);
 out:
 	dput(parent);
-	return valid;
+	return error;
 }
 
 static int gfs2_dhash(const struct dentry *dentry, struct qstr *str)

From acd2d246f4b26460d0499bc4e0042f63380e526b Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Tue, 16 Jan 2024 23:15:18 +0100
Subject: [PATCH 145/707] gfs2: Fix LOOKUP_RCU support in gfs2_drevalidate

Fix non-sleeping lookups in gfs2_drevalidate() by passing the FGP_NOWAIT
flag down to gfs2_dir_check().  This will cause gfs2_dir_check() to
return -EAGAIN when it would otherwise sleep.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Reviewed-by: Andrew Price <anprice@redhat.com>
---
 fs/gfs2/dentry.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 99239d80982b58..b14956cdab0ec2 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -74,12 +74,20 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags)
 			goto out;
 	}
 
-	error = gfs2_dir_check(dinode, &dentry->d_name, ip, 0);
+	if (flags & LOOKUP_RCU) {
+		error = gfs2_dir_check(dinode, &dentry->d_name, ip, FGP_NOWAIT);
+		if (error == -EAGAIN) {
+			error = -ECHILD;
+			goto out;
+		}
+	} else {
+		error = gfs2_dir_check(dinode, &dentry->d_name, ip, 0);
+	}
 	error = inode ? !error : (error == -ENOENT);
 
+out:
 	if (gfs2_holder_initialized(&d_gh))
 		gfs2_glock_dq_uninit(&d_gh);
-out:
 	dput(parent);
 	return error;
 }

From 189a4edb774b9ee5daf345e703e53fffaf509285 Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Mon, 22 Jan 2024 16:49:34 -0800
Subject: [PATCH 146/707] lkdtm: Make lkdtm_do_action() return to avoid tail
 call optimization

The comments for lkdtm_do_action() explicitly call out that it
shouldn't be inlined because we want it to show up in stack
crawls. However, at least with some compilers / options it's still
vanishing due to tail call optimization. Let's add a return value to
the function to make it harder for the compiler to do tail call
optimization here.

Now that we have a return value, we can actually use it in the
callers, which is a minor improvement in the code.

Signed-off-by: Douglas Anderson <dianders@chromium.org>
Link: https://lore.kernel.org/r/20240122164935.1.I345e485f36babad76370c59659a706723750d950@changeid
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 drivers/misc/lkdtm/core.c | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/drivers/misc/lkdtm/core.c b/drivers/misc/lkdtm/core.c
index 0772e4a4757e9f..5732fd59a227d0 100644
--- a/drivers/misc/lkdtm/core.c
+++ b/drivers/misc/lkdtm/core.c
@@ -153,12 +153,17 @@ static const struct crashtype *find_crashtype(const char *name)
 /*
  * This is forced noinline just so it distinctly shows up in the stackdump
  * which makes validation of expected lkdtm crashes easier.
+ *
+ * NOTE: having a valid return value helps prevent the compiler from doing
+ * tail call optimizations and taking this out of the stack trace.
  */
-static noinline void lkdtm_do_action(const struct crashtype *crashtype)
+static noinline int lkdtm_do_action(const struct crashtype *crashtype)
 {
 	if (WARN_ON(!crashtype || !crashtype->func))
-		return;
+		return -EINVAL;
 	crashtype->func();
+
+	return 0;
 }
 
 static int lkdtm_register_cpoint(struct crashpoint *crashpoint,
@@ -167,10 +172,8 @@ static int lkdtm_register_cpoint(struct crashpoint *crashpoint,
 	int ret;
 
 	/* If this doesn't have a symbol, just call immediately. */
-	if (!crashpoint->kprobe.symbol_name) {
-		lkdtm_do_action(crashtype);
-		return 0;
-	}
+	if (!crashpoint->kprobe.symbol_name)
+		return lkdtm_do_action(crashtype);
 
 	if (lkdtm_kprobe != NULL)
 		unregister_kprobe(lkdtm_kprobe);
@@ -216,7 +219,7 @@ static int lkdtm_kprobe_handler(struct kprobe *kp, struct pt_regs *regs)
 	spin_unlock_irqrestore(&crash_count_lock, flags);
 
 	if (do_it)
-		lkdtm_do_action(lkdtm_crashtype);
+		return lkdtm_do_action(lkdtm_crashtype);
 
 	return 0;
 }
@@ -303,6 +306,7 @@ static ssize_t direct_entry(struct file *f, const char __user *user_buf,
 {
 	const struct crashtype *crashtype;
 	char *buf;
+	int err;
 
 	if (count >= PAGE_SIZE)
 		return -EINVAL;
@@ -326,9 +330,11 @@ static ssize_t direct_entry(struct file *f, const char __user *user_buf,
 		return -EINVAL;
 
 	pr_info("Performing direct entry %s\n", crashtype->name);
-	lkdtm_do_action(crashtype);
+	err = lkdtm_do_action(crashtype);
 	*off += count;
 
+	if (err)
+		return err;
 	return count;
 }
 

From edb6538da3df83806fffcfc1b873d0895c81b9e8 Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Mon, 22 Jan 2024 16:49:35 -0800
Subject: [PATCH 147/707] lkdtm/bugs: Adjust lkdtm_HUNG_TASK() to avoid tail
 call optimization

When testing with lkdtm_HUNG_TASK() and looking at the output, I
expected to see lkdtm_HUNG_TASK() in the stack crawl but it wasn't
there. Instead, the top function on at least some devices was
schedule() due to tail call optimization.

Let's do two things to help here:
1. We'll mark this as "__noreturn". On GCC at least this is documented
   to prevent tail call optimization. The docs [1] say "In order to
   preserve backtraces, GCC will never turn calls to noreturn
   functions into tail calls."
2. We'll add a BUG_ON(1) at the end which means that schedule() is no
   longer a tail call. Note that this is potentially important because
   if we _did_ end up returning from schedule() due to some weird
   issue then we'd potentially be violating the "noreturn" that we
   told the compiler about. BUG is the right thing to do here.

[1] https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html

Signed-off-by: Douglas Anderson <dianders@chromium.org>
Link: https://lore.kernel.org/r/20240122164935.2.I26e8f68c312824fcc80c19d4e91de2d2bef958f0@changeid
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 drivers/misc/lkdtm/bugs.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/lkdtm/bugs.c b/drivers/misc/lkdtm/bugs.c
index b080eb2335eba8..d1222d3eda2f19 100644
--- a/drivers/misc/lkdtm/bugs.c
+++ b/drivers/misc/lkdtm/bugs.c
@@ -294,10 +294,11 @@ static void lkdtm_SPINLOCKUP(void)
 	__release(&lock_me_up);
 }
 
-static void lkdtm_HUNG_TASK(void)
+static void __noreturn lkdtm_HUNG_TASK(void)
 {
 	set_current_state(TASK_UNINTERRUPTIBLE);
 	schedule();
+	BUG_ON(1);
 }
 
 static volatile unsigned int huge = INT_MAX - 2;

From ace4b31b297dfd7b8c969ff5046c8128c3e025be Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 18 Jan 2024 16:19:13 +0530
Subject: [PATCH 148/707] cpufreq: Move dev_pm_opp_{init|free}_cpufreq_table()
 to pm_opp.h

Move the declaration of functions defined in the OPP core to pm_opp.h.
These were added to cpufreq.h as it was the only user of the APIs, but
that was a mistake perhaps. Fix it.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 include/linux/cpufreq.h | 20 --------------------
 include/linux/pm_opp.h  | 16 ++++++++++++++++
 2 files changed, 16 insertions(+), 20 deletions(-)

diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index afda5f24d3ddc6..8ff3e79727d80c 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -694,26 +694,6 @@ struct cpufreq_frequency_table {
 				    * order */
 };
 
-#if defined(CONFIG_CPU_FREQ) && defined(CONFIG_PM_OPP)
-int dev_pm_opp_init_cpufreq_table(struct device *dev,
-				  struct cpufreq_frequency_table **table);
-void dev_pm_opp_free_cpufreq_table(struct device *dev,
-				   struct cpufreq_frequency_table **table);
-#else
-static inline int dev_pm_opp_init_cpufreq_table(struct device *dev,
-						struct cpufreq_frequency_table
-						**table)
-{
-	return -EINVAL;
-}
-
-static inline void dev_pm_opp_free_cpufreq_table(struct device *dev,
-						 struct cpufreq_frequency_table
-						 **table)
-{
-}
-#endif
-
 /*
  * cpufreq_for_each_entry -	iterate over a cpufreq_frequency_table
  * @pos:	the cpufreq_frequency_table * to use as a loop cursor.
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 76dcb7f37bcdff..f1ac8bde09cb56 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -16,6 +16,7 @@
 #include <linux/notifier.h>
 
 struct clk;
+struct cpufreq_frequency_table;
 struct regulator;
 struct dev_pm_opp;
 struct device;
@@ -444,6 +445,21 @@ static inline int dev_pm_opp_sync_regulators(struct device *dev)
 
 #endif		/* CONFIG_PM_OPP */
 
+#if defined(CONFIG_CPU_FREQ) && defined(CONFIG_PM_OPP)
+int dev_pm_opp_init_cpufreq_table(struct device *dev, struct cpufreq_frequency_table **table);
+void dev_pm_opp_free_cpufreq_table(struct device *dev, struct cpufreq_frequency_table **table);
+#else
+static inline int dev_pm_opp_init_cpufreq_table(struct device *dev, struct cpufreq_frequency_table **table)
+{
+	return -EINVAL;
+}
+
+static inline void dev_pm_opp_free_cpufreq_table(struct device *dev, struct cpufreq_frequency_table **table)
+{
+}
+#endif
+
+
 #if defined(CONFIG_PM_OPP) && defined(CONFIG_OF)
 int dev_pm_opp_of_add_table(struct device *dev);
 int dev_pm_opp_of_add_table_indexed(struct device *dev, int index);

From 52501486483e1646852f78f3f5af89ab573d2caf Mon Sep 17 00:00:00 2001
From: Wen Yang <wenyang.linux@foxmail.com>
Date: Mon, 15 Jan 2024 23:27:00 +0800
Subject: [PATCH 149/707] eventfd: move 'eventfd-count' printing out of
 spinlock

When printing eventfd->count, interrupts will be disabled and a spinlock
will be obtained, competing with eventfd_write(). By moving the
"eventfd-count" print out of the spinlock and merging multiple
seq_printf() into one, it could improve a bit, just like timerfd_show().

Signed-off-by: Wen Yang <wenyang.linux@foxmail.com>
Link: https://lore.kernel.org/r/tencent_B0B3D2BD9861FD009E03AB18A81783322709@qq.com
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dylan Yudaken <dylany@fb.com>
Cc: David Woodhouse <dwmw@amazon.co.uk>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Eric Biggers <ebiggers@google.com>
Cc: <linux-fsdevel@vger.kernel.org>
Cc: <linux-kernel@vger.kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/eventfd.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/fs/eventfd.c b/fs/eventfd.c
index 0252b71099fbca..fc4d8109076392 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -283,13 +283,18 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
 static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
 {
 	struct eventfd_ctx *ctx = f->private_data;
+	__u64 cnt;
 
 	spin_lock_irq(&ctx->wqh.lock);
-	seq_printf(m, "eventfd-count: %16llx\n",
-		   (unsigned long long)ctx->count);
+	cnt = ctx->count;
 	spin_unlock_irq(&ctx->wqh.lock);
-	seq_printf(m, "eventfd-id: %d\n", ctx->id);
-	seq_printf(m, "eventfd-semaphore: %d\n",
+
+	seq_printf(m,
+		   "eventfd-count: %16llx\n"
+		   "eventfd-id: %d\n"
+		   "eventfd-semaphore: %d\n",
+		   cnt,
+		   ctx->id,
 		   !!(ctx->flags & EFD_SEMAPHORE));
 }
 #endif

From 5b3d743da951aeaedda401304d88b7b6ec1969d7 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Date: Tue, 23 Jan 2024 09:34:50 +0100
Subject: [PATCH 150/707] dt-bindings: sram: narrow regex for unit address to
 hex numbers

Regular expression used to match the unit address part should not allow
non-hex numbers.

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Acked-by: Jernej Skrabec <jernej.skrabec@gmail.com>
Link: https://lore.kernel.org/r/20240123083450.20996-1-krzysztof.kozlowski@linaro.org
Signed-off-by: Jernej Skrabec <jernej.skrabec@gmail.com>
---
 .../bindings/sram/allwinner,sun4i-a10-system-control.yaml       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/devicetree/bindings/sram/allwinner,sun4i-a10-system-control.yaml b/Documentation/devicetree/bindings/sram/allwinner,sun4i-a10-system-control.yaml
index a1c96985951ff2..cf07b8f787a6ed 100644
--- a/Documentation/devicetree/bindings/sram/allwinner,sun4i-a10-system-control.yaml
+++ b/Documentation/devicetree/bindings/sram/allwinner,sun4i-a10-system-control.yaml
@@ -56,7 +56,7 @@ properties:
   ranges: true
 
 patternProperties:
-  "^sram@[a-z0-9]+":
+  "^sram@[a-f0-9]+":
     $ref: /schemas/sram/sram.yaml#
     unevaluatedProperties: false
 

From 26ca757780d1ba9a982f8b79aef8e4cf5d171182 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sat, 20 Jan 2024 21:18:57 -0800
Subject: [PATCH 151/707] clk: sunxi: usb: fix kernel-doc warnings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move the function description comment to immediately above the
function implementation, the add function parameter descriptions to
prevent kernel-doc warnings:

clk-usb.c:80: warning: expecting prototype for sunxi_usb_clk_setup(). Prototype was for SUNXI_USB_MAX_SIZE() instead
clk-usb.c:91: warning: Function parameter or struct member 'node' not described in 'sunxi_usb_clk_setup'
clk-usb.c:91: warning: Function parameter or struct member 'data' not described in 'sunxi_usb_clk_setup'
clk-usb.c:91: warning: Function parameter or struct member 'lock' not described in 'sunxi_usb_clk_setup'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Emilio López <emilio@elopez.com.ar>
Cc: Michael Turquette <mturquette@baylibre.com>
Cc: Stephen Boyd <sboyd@kernel.org>
Cc:  <linux-clk@vger.kernel.org>
Cc: Chen-Yu Tsai <wens@csie.org>
Cc: Jernej Skrabec <jernej.skrabec@gmail.com>
Cc: Samuel Holland <samuel@sholland.org>
Cc:  <linux-arm-kernel@lists.infradead.org>
Cc:  <linux-sunxi@lists.linux.dev>
Reviewed-by: Andre Przywara <andre.przywara@arm.com>
Acked-by: Jernej Skrabec <jernej.skrabec@gmail.com>
Link: https://lore.kernel.org/r/20240121051858.17647-1-rdunlap@infradead.org
Signed-off-by: Jernej Skrabec <jernej.skrabec@gmail.com>
---
 drivers/clk/sunxi/clk-usb.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/clk/sunxi/clk-usb.c b/drivers/clk/sunxi/clk-usb.c
index 5460218f3467ab..3c53f65002a285 100644
--- a/drivers/clk/sunxi/clk-usb.c
+++ b/drivers/clk/sunxi/clk-usb.c
@@ -73,9 +73,6 @@ static const struct reset_control_ops sunxi_usb_reset_ops = {
 	.deassert	= sunxi_usb_reset_deassert,
 };
 
-/**
- * sunxi_usb_clk_setup() - Setup function for usb gate clocks
- */
 
 #define SUNXI_USB_MAX_SIZE 32
 
@@ -85,6 +82,12 @@ struct usb_clk_data {
 	bool reset_needs_clk;
 };
 
+/**
+ * sunxi_usb_clk_setup() - Setup function for usb gate clocks
+ * @node: &struct device_node for the clock
+ * @data: &struct usb_clk_data for the clock
+ * @lock: spinlock for the clock
+ */
 static void __init sunxi_usb_clk_setup(struct device_node *node,
 				       const struct usb_clk_data *data,
 				       spinlock_t *lock)

From 64b4ea871b86e0177c3938360ff83d528ac10018 Mon Sep 17 00:00:00 2001
From: David Lechner <dlechner@baylibre.com>
Date: Wed, 10 Jan 2024 14:28:40 -0600
Subject: [PATCH 152/707] dt-bindings: iio: adc: Add binding for AD7380 ADCs

This adds a binding specification for the Analog Devices Inc. AD7380
family of ADCs.

Signed-off-by: David Lechner <dlechner@baylibre.com>
Reviewed-by: Conor Dooley <conor.dooley@microchip.com>
Link: https://lore.kernel.org/r/20240110-ad7380-mainline-v4-1-93a1d96b50fa@baylibre.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 .../bindings/iio/adc/adi,ad7380.yaml          | 86 +++++++++++++++++++
 MAINTAINERS                                   |  9 ++
 2 files changed, 95 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/iio/adc/adi,ad7380.yaml

diff --git a/Documentation/devicetree/bindings/iio/adc/adi,ad7380.yaml b/Documentation/devicetree/bindings/iio/adc/adi,ad7380.yaml
new file mode 100644
index 00000000000000..5a70d1ee768b5a
--- /dev/null
+++ b/Documentation/devicetree/bindings/iio/adc/adi,ad7380.yaml
@@ -0,0 +1,86 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/iio/adc/adi,ad7380.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Analog Devices Simultaneous Sampling Analog to Digital Converters
+
+maintainers:
+  - Michael Hennerich <Michael.Hennerich@analog.com>
+  - Nuno Sá <nuno.sa@analog.com>
+
+description: |
+  * https://www.analog.com/en/products/ad7380.html
+  * https://www.analog.com/en/products/ad7381.html
+  * https://www.analog.com/en/products/ad7383.html
+  * https://www.analog.com/en/products/ad7384.html
+
+$ref: /schemas/spi/spi-peripheral-props.yaml#
+
+properties:
+  compatible:
+    enum:
+      - adi,ad7380
+      - adi,ad7381
+      - adi,ad7383
+      - adi,ad7384
+
+  reg:
+    maxItems: 1
+
+  spi-max-frequency:
+    maximum: 80000000
+  spi-cpol: true
+  spi-cpha: true
+
+  vcc-supply:
+    description: A 3V to 3.6V supply that powers the chip.
+
+  vlogic-supply:
+    description:
+      A 1.65V to 3.6V supply for the logic pins.
+
+  refio-supply:
+    description:
+      A 2.5V to 3.3V supply for the external reference voltage. When omitted,
+      the internal 2.5V reference is used.
+
+  interrupts:
+    description:
+      When the device is using 1-wire mode, this property is used to optionally
+      specify the ALERT interrupt.
+    maxItems: 1
+
+required:
+  - compatible
+  - reg
+  - vcc-supply
+  - vlogic-supply
+
+unevaluatedProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/interrupt-controller/irq.h>
+
+    spi {
+        #address-cells = <1>;
+        #size-cells = <0>;
+
+        adc@0 {
+            compatible = "adi,ad7380";
+            reg = <0>;
+
+            spi-cpol;
+            spi-cpha;
+            spi-max-frequency = <80000000>;
+
+            interrupts = <27 IRQ_TYPE_EDGE_FALLING>;
+            interrupt-parent = <&gpio0>;
+
+            vcc-supply = <&supply_3_3V>;
+            vlogic-supply = <&supply_3_3V>;
+            refio-supply = <&supply_2_5V>;
+        };
+    };
diff --git a/MAINTAINERS b/MAINTAINERS
index dcf99f9f5b8402..da83fdae811b79 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -427,6 +427,15 @@ W:	http://wiki.analog.com/AD7142
 W:	https://ez.analog.com/linux-software-drivers
 F:	drivers/input/misc/ad714x.c
 
+AD738X ADC DRIVER (AD7380/1/2/4)
+M:	Michael Hennerich <michael.hennerich@analog.com>
+M:	Nuno Sá <nuno.sa@analog.com>
+R:	David Lechner <dlechner@baylibre.com>
+S:	Supported
+W:	https://wiki.analog.com/resources/tools-software/linux-drivers/iio-adc/ad738x
+W:	https://ez.analog.com/linux-software-drivers
+F:	Documentation/devicetree/bindings/iio/adc/adi,ad7380.yaml
+
 AD7877 TOUCHSCREEN DRIVER
 M:	Michael Hennerich <michael.hennerich@analog.com>
 S:	Supported

From 6f34271e6342c50974fb4676b05d648d4560f2e1 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 19 Jan 2024 08:04:42 -0700
Subject: [PATCH 153/707] block/mq-deadline: pass in queue directly to
 dd_insert_request()

The hardware queue isn't relevant, deadline only operates on the queue
itself. Pass in the queue directly rather than the hardware queue, as
that more clearly explains what is being operated on.

Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/mq-deadline.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index f958e79277b8bc..9b7563e9d638ed 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -792,10 +792,9 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio,
 /*
  * add rq to rbtree and fifo
  */
-static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
+static void dd_insert_request(struct request_queue *q, struct request *rq,
 			      blk_insert_t flags, struct list_head *free)
 {
-	struct request_queue *q = hctx->queue;
 	struct deadline_data *dd = q->elevator->elevator_data;
 	const enum dd_data_dir data_dir = rq_data_dir(rq);
 	u16 ioprio = req_get_ioprio(rq);
@@ -875,7 +874,7 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
 
 		rq = list_first_entry(list, struct request, queuelist);
 		list_del_init(&rq->queuelist);
-		dd_insert_request(hctx, rq, flags, &free);
+		dd_insert_request(q, rq, flags, &free);
 	}
 	spin_unlock(&dd->lock);
 

From b0f6732db3959da8595586b4145cc35af3c5cb9a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 18 Jan 2024 10:46:52 -0700
Subject: [PATCH 154/707] block/mq-deadline: serialize request dispatching

If we're entering request dispatch but someone else is already
dispatching, then just skip this dispatch. We know IO is inflight and
this will trigger another dispatch event for any completion. This will
potentially cause slightly lower queue depth for contended cases, but
those are slowed down anyway and this should not cause an issue.

By itself, this patch doesn't help a whole lot, as the dispatch
lock contention reduction is just eating up by the same dd->lock now
seeing increased insertion contention. But it's required work to be
able to reduce the lock contention in general.

Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/mq-deadline.c | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 9b7563e9d638ed..79bc3b6784b373 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -79,10 +79,20 @@ struct dd_per_prio {
 	struct io_stats_per_prio stats;
 };
 
+enum {
+	DD_DISPATCHING	= 0,
+};
+
 struct deadline_data {
 	/*
 	 * run time data
 	 */
+	struct {
+		spinlock_t lock;
+		spinlock_t zone_lock;
+	} ____cacheline_aligned_in_smp;
+
+	unsigned long run_state;
 
 	struct dd_per_prio per_prio[DD_PRIO_COUNT];
 
@@ -100,9 +110,6 @@ struct deadline_data {
 	int front_merges;
 	u32 async_depth;
 	int prio_aging_expire;
-
-	spinlock_t lock;
-	spinlock_t zone_lock;
 };
 
 /* Maps an I/O priority class to a deadline scheduler priority. */
@@ -600,6 +607,18 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
 	struct request *rq;
 	enum dd_prio prio;
 
+	/*
+	 * If someone else is already dispatching, skip this one. This will
+	 * defer the next dispatch event to when something completes, and could
+	 * potentially lower the queue depth for contended cases.
+	 *
+	 * See the logic in blk_mq_do_dispatch_sched(), which loops and
+	 * retries if nothing is dispatched.
+	 */
+	if (test_bit(DD_DISPATCHING, &dd->run_state) ||
+	    test_and_set_bit_lock(DD_DISPATCHING, &dd->run_state))
+		return NULL;
+
 	spin_lock(&dd->lock);
 	rq = dd_dispatch_prio_aged_requests(dd, now);
 	if (rq)
@@ -616,6 +635,7 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
 	}
 
 unlock:
+	clear_bit_unlock(DD_DISPATCHING, &dd->run_state);
 	spin_unlock(&dd->lock);
 
 	return rq;
@@ -706,6 +726,9 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
 
 	eq->elevator_data = dd;
 
+	spin_lock_init(&dd->lock);
+	spin_lock_init(&dd->zone_lock);
+
 	for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
 		struct dd_per_prio *per_prio = &dd->per_prio[prio];
 
@@ -722,8 +745,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
 	dd->last_dir = DD_WRITE;
 	dd->fifo_batch = fifo_batch;
 	dd->prio_aging_expire = prio_aging_expire;
-	spin_lock_init(&dd->lock);
-	spin_lock_init(&dd->zone_lock);
 
 	/* We dispatch from request queue wide instead of hw queue */
 	blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);

From 8f764b91fdf2965956b036be6f9e79f77bd6c903 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 19 Jan 2024 08:16:36 -0700
Subject: [PATCH 155/707] block/mq-deadline: skip expensive merge lookups if
 contended

We do several stages of merging in the block layer - the most likely one
to work is also the cheap one, merging direct in the per-task plug when
IO is submitted. Getting merges outside of that is a lot less likely,
but IO schedulers may still maintain internal data structures to
facilitate merge lookups outside of the plug.

Make mq-deadline skip expensive merge lookups if the queue lock is
already contended. The likelihood of getting a merge here is not very
high, hence it should not be a problem skipping the attempt in the also
unlikely event that the queue is already contended.

Perf diff shows the difference between a random read/write workload
with 4 threads doing IO, with expensive merges turned on and off:

    25.00%    +61.94%  [kernel.kallsyms]  [k] queued_spin_lock_slowpath

where we almost quadruple the lock contention by attempting these
expensive merges.

Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/mq-deadline.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 79bc3b6784b373..740b94f36cacf7 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -800,7 +800,19 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio,
 	struct request *free = NULL;
 	bool ret;
 
-	spin_lock(&dd->lock);
+	/*
+	 * bio merging is called for every bio queued, and it's very easy
+	 * to run into contention because of that. If we fail getting
+	 * the dd lock, just skip this merge attempt. For related IO, the
+	 * plug will be the successful merging point. If we get here, we
+	 * already failed doing the obvious merge. Chances of actually
+	 * getting a merge off this path is a lot slimmer, so skipping an
+	 * occassional lookup that will most likely not succeed anyway should
+	 * not be a problem.
+	 */
+	if (!spin_trylock(&dd->lock))
+		return false;
+
 	ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free);
 	spin_unlock(&dd->lock);
 

From 574e7779cf583171acb5bf6365047bb0941b387c Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Thu, 18 Jan 2024 10:53:34 -0700
Subject: [PATCH 156/707] block/mq-deadline: use separate insertion lists

Reduce lock contention on dd->lock by calling dd_insert_request() from
inside the dispatch callback instead of from the insert callback. This
patch is inspired by a patch from Jens.

With the previous dispatch and merge optimization, this drastically
reduces contention for a sample cases of 32 threads doing IO to devices.
The test case looks as follows:

fio --bs=512 --group_reporting=1 --gtod_reduce=1 --invalidate=1 \
	--ioengine=io_uring --norandommap --runtime=60 --rw=randread \
	--thread --time_based=1 --buffered=0 --fixedbufs=1 --numjobs=32 \
	--iodepth=4 --iodepth_batch_submit=4 --iodepth_batch_complete=4 \
	--name=scaletest --filename=/dev/$DEV

Before:

Device		IOPS	sys	contention	diff
====================================================
null_blk	879K	89%	93.6%
nvme0n1		901K	86%	94.5%

and after this and the previous two patches:

Device		IOPS	sys	contention	diff
====================================================
null_blk	2867K	11.1%	~6.0%		+226%
nvme0n1		3162K	 9.9%	~5.0%		+250%

which basically eliminates all of the lock contention, it's down to
more normal levels. The throughput increases show that nicely, with more
than a 300% improvement for both cases.

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
[axboe: expand commit message with more details and perf results]
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/mq-deadline.c | 66 ++++++++++++++++++++++++++++++++++++---------
 1 file changed, 53 insertions(+), 13 deletions(-)

diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 740b94f36cacf7..1b0de4fc39582b 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -89,11 +89,15 @@ struct deadline_data {
 	 */
 	struct {
 		spinlock_t lock;
+		spinlock_t insert_lock;
 		spinlock_t zone_lock;
 	} ____cacheline_aligned_in_smp;
 
 	unsigned long run_state;
 
+	struct list_head at_head;
+	struct list_head at_tail;
+
 	struct dd_per_prio per_prio[DD_PRIO_COUNT];
 
 	/* Data direction of latest dispatched request. */
@@ -120,6 +124,9 @@ static const enum dd_prio ioprio_class_to_prio[] = {
 	[IOPRIO_CLASS_IDLE]	= DD_IDLE_PRIO,
 };
 
+static void dd_insert_request(struct request_queue *q, struct request *rq,
+			      blk_insert_t flags, struct list_head *free);
+
 static inline struct rb_root *
 deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq)
 {
@@ -592,6 +599,33 @@ static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd,
 	return NULL;
 }
 
+static void __dd_do_insert(struct request_queue *q, blk_insert_t flags,
+			   struct list_head *list, struct list_head *free)
+{
+	while (!list_empty(list)) {
+		struct request *rq;
+
+		rq = list_first_entry(list, struct request, queuelist);
+		list_del_init(&rq->queuelist);
+		dd_insert_request(q, rq, flags, free);
+	}
+}
+
+static void dd_do_insert(struct request_queue *q, struct list_head *free)
+{
+	struct deadline_data *dd = q->elevator->elevator_data;
+	LIST_HEAD(at_head);
+	LIST_HEAD(at_tail);
+
+	spin_lock(&dd->insert_lock);
+	list_splice_init(&dd->at_head, &at_head);
+	list_splice_init(&dd->at_tail, &at_tail);
+	spin_unlock(&dd->insert_lock);
+
+	__dd_do_insert(q, BLK_MQ_INSERT_AT_HEAD, &at_head, free);
+	__dd_do_insert(q, 0, &at_tail, free);
+}
+
 /*
  * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests().
  *
@@ -602,10 +636,12 @@ static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd,
  */
 static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
 {
-	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+	struct request_queue *q = hctx->queue;
+	struct deadline_data *dd = q->elevator->elevator_data;
 	const unsigned long now = jiffies;
 	struct request *rq;
 	enum dd_prio prio;
+	LIST_HEAD(free);
 
 	/*
 	 * If someone else is already dispatching, skip this one. This will
@@ -620,6 +656,7 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
 		return NULL;
 
 	spin_lock(&dd->lock);
+	dd_do_insert(q, &free);
 	rq = dd_dispatch_prio_aged_requests(dd, now);
 	if (rq)
 		goto unlock;
@@ -638,6 +675,7 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
 	clear_bit_unlock(DD_DISPATCHING, &dd->run_state);
 	spin_unlock(&dd->lock);
 
+	blk_mq_free_requests(&free);
 	return rq;
 }
 
@@ -727,8 +765,12 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
 	eq->elevator_data = dd;
 
 	spin_lock_init(&dd->lock);
+	spin_lock_init(&dd->insert_lock);
 	spin_lock_init(&dd->zone_lock);
 
+	INIT_LIST_HEAD(&dd->at_head);
+	INIT_LIST_HEAD(&dd->at_tail);
+
 	for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
 		struct dd_per_prio *per_prio = &dd->per_prio[prio];
 
@@ -899,19 +941,13 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
 {
 	struct request_queue *q = hctx->queue;
 	struct deadline_data *dd = q->elevator->elevator_data;
-	LIST_HEAD(free);
-
-	spin_lock(&dd->lock);
-	while (!list_empty(list)) {
-		struct request *rq;
 
-		rq = list_first_entry(list, struct request, queuelist);
-		list_del_init(&rq->queuelist);
-		dd_insert_request(q, rq, flags, &free);
-	}
-	spin_unlock(&dd->lock);
-
-	blk_mq_free_requests(&free);
+	spin_lock(&dd->insert_lock);
+	if (flags & BLK_MQ_INSERT_AT_HEAD)
+		list_splice_init(list, &dd->at_head);
+	else
+		list_splice_init(list, &dd->at_tail);
+	spin_unlock(&dd->insert_lock);
 }
 
 /* Callback from inside blk_mq_rq_ctx_init(). */
@@ -990,6 +1026,10 @@ static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
 	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
 	enum dd_prio prio;
 
+	if (!list_empty_careful(&dd->at_head) ||
+	    !list_empty_careful(&dd->at_tail))
+		return true;
+
 	for (prio = 0; prio <= DD_PRIO_MAX; prio++)
 		if (dd_has_work_for_prio(&dd->per_prio[prio]))
 			return true;

From 88b61090083299c3c88162c357cabd30c1c61698 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 20 Jan 2024 14:50:58 -0700
Subject: [PATCH 157/707] block/bfq: pass in queue directly to
 bfq_insert_request()

The hardware queue isn't relevant, bfq only operates on the queue
itself. Pass in the queue directly rather than the hardware queue, as
that more clearly explains what is being operated on.

Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bfq-iosched.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 3cce6de464a7b7..7d08442474ec31 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -6236,10 +6236,9 @@ static inline void bfq_update_insert_stats(struct request_queue *q,
 
 static struct bfq_queue *bfq_init_rq(struct request *rq);
 
-static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
+static void bfq_insert_request(struct request_queue *q, struct request *rq,
 			       blk_insert_t flags)
 {
-	struct request_queue *q = hctx->queue;
 	struct bfq_data *bfqd = q->elevator->elevator_data;
 	struct bfq_queue *bfqq;
 	bool idle_timer_disabled = false;
@@ -6301,7 +6300,7 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
 
 		rq = list_first_entry(list, struct request, queuelist);
 		list_del_init(&rq->queuelist);
-		bfq_insert_request(hctx, rq, flags);
+		bfq_insert_request(hctx->queue, rq, flags);
 	}
 }
 

From b1daebfc62466586bd256c98def34a21461ed128 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 20 Jan 2024 14:54:37 -0700
Subject: [PATCH 158/707] block/bfq: serialize request dispatching

If we're entering request dispatch but someone else is already
dispatching, then just skip this dispatch. We know IO is inflight and
this will trigger another dispatch event for any completion. This will
potentially cause slightly lower queue depth for contended cases, but
those are slowed down anyway and this should not cause an issue.

By itself, this patch doesn't help a whole lot, as the dispatch
lock contention reduction is just eaten up by the same bfqd->lock now
seeing increased insertion contention. But it's required work to be
able to reduce the lock contention in general.

Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bfq-iosched.c | 17 +++++++++++++++--
 block/bfq-iosched.h | 12 ++++++++++--
 2 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 7d08442474ec31..5ef4a4eba572c8 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -5304,6 +5304,18 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
 	struct bfq_queue *in_serv_queue;
 	bool waiting_rq, idle_timer_disabled = false;
 
+	/*
+	 * If someone else is already dispatching, skip this one. This will
+	 * defer the next dispatch event to when something completes, and could
+	 * potentially lower the queue depth for contended cases.
+	 *
+	 * See the logic in blk_mq_do_dispatch_sched(), which loops and
+	 * retries if nothing is dispatched.
+	 */
+	if (test_bit(BFQ_DISPATCHING, &bfqd->run_state) ||
+	    test_and_set_bit_lock(BFQ_DISPATCHING, &bfqd->run_state))
+		return NULL;
+
 	spin_lock_irq(&bfqd->lock);
 
 	in_serv_queue = bfqd->in_service_queue;
@@ -5315,6 +5327,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
 			waiting_rq && !bfq_bfqq_wait_request(in_serv_queue);
 	}
 
+	clear_bit_unlock(BFQ_DISPATCHING, &bfqd->run_state);
 	spin_unlock_irq(&bfqd->lock);
 	bfq_update_dispatch_stats(hctx->queue, rq,
 			idle_timer_disabled ? in_serv_queue : NULL,
@@ -7210,6 +7223,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 	q->elevator = eq;
 	spin_unlock_irq(&q->queue_lock);
 
+	spin_lock_init(&bfqd->lock);
+
 	/*
 	 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
 	 * Grab a permanent reference to it, so that the normal code flow
@@ -7328,8 +7343,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 	/* see comments on the definition of next field inside bfq_data */
 	bfqd->actuator_load_threshold = 4;
 
-	spin_lock_init(&bfqd->lock);
-
 	/*
 	 * The invocation of the next bfq_create_group_hierarchy
 	 * function is the head of a chain of function calls
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 467e8cfc41a249..56ff69f2216327 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -504,12 +504,22 @@ struct bfq_io_cq {
 	unsigned int requests;	/* Number of requests this process has in flight */
 };
 
+enum {
+	BFQ_DISPATCHING	= 0,
+};
+
 /**
  * struct bfq_data - per-device data structure.
  *
  * All the fields are protected by @lock.
  */
 struct bfq_data {
+	struct {
+		spinlock_t lock;
+	} ____cacheline_aligned_in_smp;
+
+	unsigned long run_state;
+
 	/* device request queue */
 	struct request_queue *queue;
 	/* dispatch queue */
@@ -795,8 +805,6 @@ struct bfq_data {
 	/* fallback dummy bfqq for extreme OOM conditions */
 	struct bfq_queue oom_bfqq;
 
-	spinlock_t lock;
-
 	/*
 	 * bic associated with the task issuing current bio for
 	 * merging. This and the next field are used as a support to

From 58ea8280b7a56f541ceba62efef77babaf45a680 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 20 Jan 2024 14:58:37 -0700
Subject: [PATCH 159/707] block/bfq: skip expensive merge lookups if contended

We do several stages of merging in the block layer - the most likely one
to work is also the cheap one, merging direct in the per-task plug when
IO is submitted. Getting merges outside of that is a lot less likely,
but IO schedulers may still maintain internal data structures to
facilitate merge lookups outside of the plug.

Make BFQ skip expensive merge lookups if the queue lock or bfqd lock is
already contended. The likelihood of getting a merge here is not very
high, hence it should not be a problem skipping the attempt in the also
unlikely event that either the queue or bfqd are already contended.

Perf diff shows the difference between a random read/write workload
with 4 threads doing IO, with expensive merges turned on and off:

    31.70%    +54.80%  [kernel.kallsyms]  [k] queued_spin_lock_slowpath

where we almost triple the lock contention (~32% -> ~87%) by attempting
these expensive merges, and performance drops from 1630K to 1050K IOPS.
At the same time, sys time drops from 37% to 14%.

Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bfq-iosched.c | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 5ef4a4eba572c8..ea16a0c5308263 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -467,6 +467,21 @@ static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q)
 	return icq;
 }
 
+static struct bfq_io_cq *bfq_bic_try_lookup(struct request_queue *q)
+{
+	if (!current->io_context)
+		return NULL;
+	if (spin_trylock_irq(&q->queue_lock)) {
+		struct bfq_io_cq *icq;
+
+		icq = icq_to_bic(ioc_lookup_icq(q));
+		spin_unlock_irq(&q->queue_lock);
+		return icq;
+	}
+
+	return NULL;
+}
+
 /*
  * Scheduler run of queue, if there are requests pending and no one in the
  * driver that will restart queueing.
@@ -2454,10 +2469,21 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio,
 	 * returned by bfq_bic_lookup does not go away before
 	 * bfqd->lock is taken.
 	 */
-	struct bfq_io_cq *bic = bfq_bic_lookup(q);
+	struct bfq_io_cq *bic = bfq_bic_try_lookup(q);
 	bool ret;
 
-	spin_lock_irq(&bfqd->lock);
+	/*
+	 * bio merging is called for every bio queued, and it's very easy
+	 * to run into contention because of that. If we fail getting
+	 * the dd lock, just skip this merge attempt. For related IO, the
+	 * plug will be the successful merging point. If we get here, we
+	 * already failed doing the obvious merge. Chances of actually
+	 * getting a merge off this path is a lot slimmer, so skipping an
+	 * occassional lookup that will most likely not succeed anyway should
+	 * not be a problem.
+	 */
+	if (!spin_trylock_irq(&bfqd->lock))
+		return false;
 
 	if (bic) {
 		/*

From 1ee906ccc23b027b28505b84a37eb5c9e75db9ae Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 20 Jan 2024 15:04:51 -0700
Subject: [PATCH 160/707] block/bfq: use separate insertion lists

Based on the similar patch for mq-deadline, this uses separate
insertion lists so we can defer touching dd->lock until dispatch
time.

This improves the following fio workload:

fio --bs=512 --group_reporting=1 --gtod_reduce=1 --invalidate=1 \
        --ioengine=io_uring --norandommap --runtime=60 --rw=randread \
        --thread --time_based=1 --buffered=0 --fixedbufs=1 --numjobs=32 \
        --iodepth=4 --iodepth_batch_submit=4 --iodepth_batch_complete=4 \
        --name=/dev/nvme0n1 --filename=/dev/nvme0n1

from:

/dev/nvme0n1: (groupid=0, jobs=32): err= 0: pid=1113: Fri Jan 19 20:59:26 2024
  read: IOPS=567k, BW=277MiB/s (290MB/s)(1820MiB/6575msec)
   bw (  KiB/s): min=274824, max=291156, per=100.00%, avg=283930.08, stdev=143.01, samples=416
   iops        : min=549648, max=582312, avg=567860.31, stdev=286.01, samples=416
  cpu          : usr=0.18%, sys=86.04%, ctx=866079, majf=0, minf=0
  IO depths    : 1=0.0%, 2=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     issued rwts: total=3728344,0,0,0 short=0,0,0,0 dropped=0,0,0,0
     latency   : target=0, window=0, percentile=100.00%, depth=4

with 96% lock contention and 86% sys time, to:

/dev/nvme0n1: (groupid=0, jobs=32): err= 0: pid=8922: Sat Jan 20 11:16:20 2024
  read: IOPS=1550k, BW=757MiB/s (794MB/s)(19.6GiB/26471msec)
   bw (  KiB/s): min=754668, max=848896, per=100.00%, avg=775459.33, stdev=624.43, samples=1664
   iops        : min=1509336, max=1697793, avg=1550918.83, stdev=1248.87, samples=1664
  cpu          : usr=1.34%, sys=14.49%, ctx=9950560, majf=0, minf=0
  IO depths    : 1=0.0%, 2=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     issued rwts: total=41042924,0,0,0 short=0,0,0,0 dropped=0,0,0,0
     latency   : target=0, window=0, percentile=100.00%, depth=4

with ~30% lock contention and 14.5% sys time, by applying the lessons
learnt with scaling mq-deadline.

Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bfq-iosched.c | 70 ++++++++++++++++++++++++++++++++++-----------
 block/bfq-iosched.h |  4 +++
 2 files changed, 58 insertions(+), 16 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index ea16a0c5308263..9bd57baa4b0b93 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -5174,6 +5174,10 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
 {
 	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
 
+	if (!list_empty_careful(&bfqd->at_head) ||
+	    !list_empty_careful(&bfqd->at_tail))
+		return true;
+
 	/*
 	 * Avoiding lock: a race on bfqd->queued should cause at
 	 * most a call to dispatch for nothing
@@ -5323,12 +5327,44 @@ static inline void bfq_update_dispatch_stats(struct request_queue *q,
 					     bool idle_timer_disabled) {}
 #endif /* CONFIG_BFQ_CGROUP_DEBUG */
 
+static void bfq_insert_request(struct request_queue *q, struct request *rq,
+			       blk_insert_t flags, struct list_head *free);
+
+static void __bfq_do_insert(struct request_queue *q, blk_insert_t flags,
+			    struct list_head *list, struct list_head *free)
+{
+	while (!list_empty(list)) {
+		struct request *rq;
+
+		rq = list_first_entry(list, struct request, queuelist);
+		list_del_init(&rq->queuelist);
+		bfq_insert_request(q, rq, flags, free);
+	}
+}
+
+static void bfq_do_insert(struct request_queue *q, struct list_head *free)
+{
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+	LIST_HEAD(at_head);
+	LIST_HEAD(at_tail);
+
+	spin_lock(&bfqd->insert_lock);
+	list_splice_init(&bfqd->at_head, &at_head);
+	list_splice_init(&bfqd->at_tail, &at_tail);
+	spin_unlock(&bfqd->insert_lock);
+
+	__bfq_do_insert(q, BLK_MQ_INSERT_AT_HEAD, &at_head, free);
+	__bfq_do_insert(q, 0, &at_tail, free);
+}
+
 static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
 {
-	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
+	struct request_queue *q = hctx->queue;
+	struct bfq_data *bfqd = q->elevator->elevator_data;
 	struct request *rq;
 	struct bfq_queue *in_serv_queue;
 	bool waiting_rq, idle_timer_disabled = false;
+	LIST_HEAD(free);
 
 	/*
 	 * If someone else is already dispatching, skip this one. This will
@@ -5344,6 +5380,8 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
 
 	spin_lock_irq(&bfqd->lock);
 
+	bfq_do_insert(hctx->queue, &free);
+
 	in_serv_queue = bfqd->in_service_queue;
 	waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue);
 
@@ -5355,6 +5393,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
 
 	clear_bit_unlock(BFQ_DISPATCHING, &bfqd->run_state);
 	spin_unlock_irq(&bfqd->lock);
+	blk_mq_free_requests(&free);
 	bfq_update_dispatch_stats(hctx->queue, rq,
 			idle_timer_disabled ? in_serv_queue : NULL,
 				idle_timer_disabled);
@@ -6276,25 +6315,20 @@ static inline void bfq_update_insert_stats(struct request_queue *q,
 static struct bfq_queue *bfq_init_rq(struct request *rq);
 
 static void bfq_insert_request(struct request_queue *q, struct request *rq,
-			       blk_insert_t flags)
+			       blk_insert_t flags, struct list_head *free)
 {
 	struct bfq_data *bfqd = q->elevator->elevator_data;
 	struct bfq_queue *bfqq;
 	bool idle_timer_disabled = false;
 	blk_opf_t cmd_flags;
-	LIST_HEAD(free);
 
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
 	if (!cgroup_subsys_on_dfl(io_cgrp_subsys) && rq->bio)
 		bfqg_stats_update_legacy_io(q, rq);
 #endif
-	spin_lock_irq(&bfqd->lock);
 	bfqq = bfq_init_rq(rq);
-	if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
-		spin_unlock_irq(&bfqd->lock);
-		blk_mq_free_requests(&free);
+	if (blk_mq_sched_try_insert_merge(q, rq, free))
 		return;
-	}
 
 	trace_block_rq_insert(rq);
 
@@ -6324,8 +6358,6 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq,
 	 * merge).
 	 */
 	cmd_flags = rq->cmd_flags;
-	spin_unlock_irq(&bfqd->lock);
-
 	bfq_update_insert_stats(q, bfqq, idle_timer_disabled,
 				cmd_flags);
 }
@@ -6334,13 +6366,15 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
 				struct list_head *list,
 				blk_insert_t flags)
 {
-	while (!list_empty(list)) {
-		struct request *rq;
+	struct request_queue *q = hctx->queue;
+	struct bfq_data *bfqd = q->elevator->elevator_data;
 
-		rq = list_first_entry(list, struct request, queuelist);
-		list_del_init(&rq->queuelist);
-		bfq_insert_request(hctx->queue, rq, flags);
-	}
+	spin_lock_irq(&bfqd->insert_lock);
+	if (flags & BLK_MQ_INSERT_AT_HEAD)
+		list_splice_init(list, &bfqd->at_head);
+	else
+		list_splice_init(list, &bfqd->at_tail);
+	spin_unlock_irq(&bfqd->insert_lock);
 }
 
 static void bfq_update_hw_tag(struct bfq_data *bfqd)
@@ -7250,6 +7284,10 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 	spin_unlock_irq(&q->queue_lock);
 
 	spin_lock_init(&bfqd->lock);
+	spin_lock_init(&bfqd->insert_lock);
+
+	INIT_LIST_HEAD(&bfqd->at_head);
+	INIT_LIST_HEAD(&bfqd->at_tail);
 
 	/*
 	 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 56ff69f2216327..f44f5d4ec2f4a2 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -516,10 +516,14 @@ enum {
 struct bfq_data {
 	struct {
 		spinlock_t lock;
+		spinlock_t insert_lock;
 	} ____cacheline_aligned_in_smp;
 
 	unsigned long run_state;
 
+	struct list_head at_head;
+	struct list_head at_tail;
+
 	/* device request queue */
 	struct request_queue *queue;
 	/* dispatch queue */

From 18975ce057997fdee8b3d7df2b36f3144856fe24 Mon Sep 17 00:00:00 2001
From: Terry Tritton <terry.tritton@linaro.org>
Date: Wed, 24 Jan 2024 14:13:55 +0000
Subject: [PATCH 161/707] selftests/seccomp: Handle EINVAL on
 unshare(CLONE_NEWPID)

unshare(CLONE_NEWPID) can return EINVAL if the kernel does not have the
CONFIG_PID_NS option enabled.

Add a check on these calls to skip the test if we receive EINVAL.

Signed-off-by: Terry Tritton <terry.tritton@linaro.org>
Link: https://lore.kernel.org/r/20240124141357.1243457-2-terry.tritton@linaro.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 tools/testing/selftests/seccomp/seccomp_bpf.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 38f6514699682b..5e705674b70670 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -3709,7 +3709,12 @@ TEST(user_notification_sibling_pid_ns)
 	ASSERT_GE(pid, 0);
 
 	if (pid == 0) {
-		ASSERT_EQ(unshare(CLONE_NEWPID), 0);
+		ASSERT_EQ(unshare(CLONE_NEWPID), 0) {
+			if (errno == EPERM)
+				SKIP(return, "CLONE_NEWPID requires CAP_SYS_ADMIN");
+			else if (errno == EINVAL)
+				SKIP(return, "CLONE_NEWPID is invalid (missing CONFIG_PID_NS?)");
+		}
 
 		pid2 = fork();
 		ASSERT_GE(pid2, 0);
@@ -3727,6 +3732,8 @@ TEST(user_notification_sibling_pid_ns)
 	ASSERT_EQ(unshare(CLONE_NEWPID), 0) {
 		if (errno == EPERM)
 			SKIP(return, "CLONE_NEWPID requires CAP_SYS_ADMIN");
+		else if (errno == EINVAL)
+			SKIP(return, "CLONE_NEWPID is invalid (missing CONFIG_PID_NS?)");
 	}
 	ASSERT_EQ(errno, 0);
 

From fbcdf41167fea3d7d17a9ae6ffa79e2afb41881a Mon Sep 17 00:00:00 2001
From: Terry Tritton <terry.tritton@linaro.org>
Date: Wed, 24 Jan 2024 14:13:56 +0000
Subject: [PATCH 162/707] selftests/seccomp: Change the syscall used in
 KILL_THREAD test

The Bionic version of pthread_create used on Android calls the prctl
function to give the stack and thread local storage a useful name. This
will cause the KILL_THREAD test to fail as it will kill the thread as
soon as it is created.

change the test to use getpid instead of prctl.

Signed-off-by: Terry Tritton <terry.tritton@linaro.org>
Link: https://lore.kernel.org/r/20240124141357.1243457-3-terry.tritton@linaro.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 tools/testing/selftests/seccomp/seccomp_bpf.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 5e705674b70670..da11b95b88721b 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -784,7 +784,7 @@ void *kill_thread(void *data)
 	bool die = (bool)data;
 
 	if (die) {
-		prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
+		syscall(__NR_getpid);
 		return (void *)SIBLING_EXIT_FAILURE;
 	}
 
@@ -803,11 +803,11 @@ void kill_thread_or_group(struct __test_metadata *_metadata,
 {
 	pthread_t thread;
 	void *status;
-	/* Kill only when calling __NR_prctl. */
+	/* Kill only when calling __NR_getpid. */
 	struct sock_filter filter_thread[] = {
 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
 			offsetof(struct seccomp_data, nr)),
-		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
+		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL_THREAD),
 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
 	};
@@ -819,7 +819,7 @@ void kill_thread_or_group(struct __test_metadata *_metadata,
 	struct sock_filter filter_process[] = {
 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
 			offsetof(struct seccomp_data, nr)),
-		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
+		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
 		BPF_STMT(BPF_RET|BPF_K, kill),
 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
 	};

From 0c6f28a844311b8f81b4b117f586189c704d3f33 Mon Sep 17 00:00:00 2001
From: Terry Tritton <terry.tritton@linaro.org>
Date: Wed, 24 Jan 2024 14:13:57 +0000
Subject: [PATCH 163/707] selftests/seccomp: user_notification_addfd check
 nextfd is available

Currently the user_notification_addfd test checks what the next expected
file descriptor will be by incrementing a variable nextfd. This does not
account for file descriptors that may already be open before the test is
started and will cause the test to fail if any exist.

Replace nextfd++ with a function get_next_fd which will check and return
the next available file descriptor.

Signed-off-by: Terry Tritton <terry.tritton@linaro.org>
Link: https://lore.kernel.org/r/20240124141357.1243457-4-terry.tritton@linaro.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 tools/testing/selftests/seccomp/seccomp_bpf.c | 24 +++++++++++++++----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index da11b95b88721b..cacf6507f69055 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -4044,6 +4044,16 @@ TEST(user_notification_filter_empty_threaded)
 	EXPECT_GT((pollfd.revents & POLLHUP) ?: 0, 0);
 }
 
+
+int get_next_fd(int prev_fd)
+{
+	for (int i = prev_fd + 1; i < FD_SETSIZE; ++i) {
+		if (fcntl(i, F_GETFD) == -1)
+			return i;
+	}
+	_exit(EXIT_FAILURE);
+}
+
 TEST(user_notification_addfd)
 {
 	pid_t pid;
@@ -4060,7 +4070,7 @@ TEST(user_notification_addfd)
 	/* There may be arbitrary already-open fds at test start. */
 	memfd = memfd_create("test", 0);
 	ASSERT_GE(memfd, 0);
-	nextfd = memfd + 1;
+	nextfd = get_next_fd(memfd);
 
 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 	ASSERT_EQ(0, ret) {
@@ -4071,7 +4081,8 @@ TEST(user_notification_addfd)
 	/* Check that the basic notification machinery works */
 	listener = user_notif_syscall(__NR_getppid,
 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
-	ASSERT_EQ(listener, nextfd++);
+	ASSERT_EQ(listener, nextfd);
+	nextfd = get_next_fd(nextfd);
 
 	pid = fork();
 	ASSERT_GE(pid, 0);
@@ -4126,14 +4137,16 @@ TEST(user_notification_addfd)
 
 	/* Verify we can set an arbitrary remote fd */
 	fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
-	EXPECT_EQ(fd, nextfd++);
+	EXPECT_EQ(fd, nextfd);
+	nextfd = get_next_fd(nextfd);
 	EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0);
 
 	/* Verify we can set an arbitrary remote fd with large size */
 	memset(&big, 0x0, sizeof(big));
 	big.addfd = addfd;
 	fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_BIG, &big);
-	EXPECT_EQ(fd, nextfd++);
+	EXPECT_EQ(fd, nextfd);
+	nextfd = get_next_fd(nextfd);
 
 	/* Verify we can set a specific remote fd */
 	addfd.newfd = 42;
@@ -4171,7 +4184,8 @@ TEST(user_notification_addfd)
 	 * Child has earlier "low" fds and now 42, so we expect the next
 	 * lowest available fd to be assigned here.
 	 */
-	EXPECT_EQ(fd, nextfd++);
+	EXPECT_EQ(fd, nextfd);
+	nextfd = get_next_fd(nextfd);
 	ASSERT_EQ(filecmp(getpid(), pid, memfd, fd), 0);
 
 	/*

From 367122c529f35b4655acbe33c0cc4d6d3b32ba71 Mon Sep 17 00:00:00 2001
From: Gabriel Krisman Bertazi <krisman@suse.de>
Date: Wed, 24 Jan 2024 15:13:40 -0300
Subject: [PATCH 164/707] libfs: Attempt exact-match comparison first during
 casefolded lookup

Casefolded comparisons are (obviously) way more costly than a simple
memcmp.  Try the case-sensitive comparison first, falling-back to the
case-insensitive lookup only when needed.  This allows any exact-match
lookup to complete without having to walk the utf8 trie.

Note that, for strict mode, generic_ci_d_compare used to reject an
invalid UTF-8 string, which would now be considered valid if it
exact-matches the disk-name.  But, if that is the case, the filesystem
is corrupt.  More than that, it really doesn't matter in practice,
because the name-under-lookup will have already been rejected by
generic_ci_d_hash and we won't even get here.

The memcmp is safe under RCU because we are operating on str/len instead
of dentry->d_name directly, and the caller guarantees their consistency
between each other in __d_lookup_rcu_op_compare.

Link: https://lore.kernel.org/r/87ttn2sip7.fsf_-_@mailhost.krisman.be
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Gabriel Krisman Bertazi <krisman@suse.de>
---
 fs/libfs.c | 40 +++++++++++++++++++++++-----------------
 1 file changed, 23 insertions(+), 17 deletions(-)

diff --git a/fs/libfs.c b/fs/libfs.c
index eec6031b015544..306a0510b7dc25 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1704,16 +1704,28 @@ bool is_empty_dir_inode(struct inode *inode)
 static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
 				const char *str, const struct qstr *name)
 {
-	const struct dentry *parent = READ_ONCE(dentry->d_parent);
-	const struct inode *dir = READ_ONCE(parent->d_inode);
-	const struct super_block *sb = dentry->d_sb;
-	const struct unicode_map *um = sb->s_encoding;
-	struct qstr qstr = QSTR_INIT(str, len);
+	const struct dentry *parent;
+	const struct inode *dir;
 	char strbuf[DNAME_INLINE_LEN];
-	int ret;
+	struct qstr qstr;
+
+	/*
+	 * Attempt a case-sensitive match first. It is cheaper and
+	 * should cover most lookups, including all the sane
+	 * applications that expect a case-sensitive filesystem.
+	 *
+	 * This comparison is safe under RCU because the caller
+	 * guarantees the consistency between str and len. See
+	 * __d_lookup_rcu_op_compare() for details.
+	 */
+	if (len == name->len && !memcmp(str, name->name, len))
+		return 0;
 
+	parent = READ_ONCE(dentry->d_parent);
+	dir = READ_ONCE(parent->d_inode);
 	if (!dir || !IS_CASEFOLDED(dir))
-		goto fallback;
+		return 1;
+
 	/*
 	 * If the dentry name is stored in-line, then it may be concurrently
 	 * modified by a rename.  If this happens, the VFS will eventually retry
@@ -1724,20 +1736,14 @@ static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
 	if (len <= DNAME_INLINE_LEN - 1) {
 		memcpy(strbuf, str, len);
 		strbuf[len] = 0;
-		qstr.name = strbuf;
+		str = strbuf;
 		/* prevent compiler from optimizing out the temporary buffer */
 		barrier();
 	}
-	ret = utf8_strncasecmp(um, name, &qstr);
-	if (ret >= 0)
-		return ret;
+	qstr.len = len;
+	qstr.name = str;
 
-	if (sb_has_strict_encoding(sb))
-		return -EINVAL;
-fallback:
-	if (len != name->len)
-		return 1;
-	return !!memcmp(str, name->name, len);
+	return utf8_strncasecmp(dentry->d_sb->s_encoding, name, &qstr);
 }
 
 /**

From 0b3bbd8f9baf245ec77d86f6f5bc902105b4bfa9 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Fri, 22 Dec 2023 21:05:11 -0800
Subject: [PATCH 165/707] counter: linux/counter.h: fix Excess kernel-doc
 description warning

Remove the @priv: line to prevent the kernel-doc warning:

include/linux/counter.h:400: warning: Excess struct member 'priv' description in 'counter_device'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Fixes: f2ee4759fb70 ("counter: remove old and now unused registration API")
Link: https://lore.kernel.org/r/20231223050511.13849-1-rdunlap@infradead.org
Signed-off-by: William Breathitt Gray <william.gray@linaro.org>
---
 include/linux/counter.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/linux/counter.h b/include/linux/counter.h
index 702e9108bbb44e..b767b5c821f58e 100644
--- a/include/linux/counter.h
+++ b/include/linux/counter.h
@@ -359,7 +359,6 @@ struct counter_ops {
  * @num_counts:		number of Counts specified in @counts
  * @ext:		optional array of Counter device extensions
  * @num_ext:		number of Counter device extensions specified in @ext
- * @priv:		optional private data supplied by driver
  * @dev:		internal device structure
  * @chrdev:		internal character device structure
  * @events_list:	list of current watching Counter events

From 0410516aaef4bf08030845547fb94f4ff989fac0 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Wed, 24 Jan 2024 16:02:16 +0200
Subject: [PATCH 166/707] x86/mm: Fix memory encryption features advertisement

When memory encryption is enabled, the kernel prints the encryption
flavor that the system supports.

The check assumes that everything is AMD SME/SEV if it doesn't have
the TDX CPU feature set.

Hyper-V vTOM sets cc_vendor to CC_VENDOR_INTEL when it runs as L2 guest
on top of TDX, but not X86_FEATURE_TDX_GUEST. Hyper-V only needs memory
encryption enabled for I/O without the rest of CoCo enabling.

To avoid confusion, check the cc_vendor directly.

  [ bp: Massage commit message. ]

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Jeremi Piotrowski <jpiotrowski@linux.microsoft.com>
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Acked-by: Tom Lendacky <thomas.lendacky@amd.com>
Acked-by: Kai Huang <kai.huang@intel.com>
Link: https://lore.kernel.org/r/20240124140217.533748-1-kirill.shutemov@linux.intel.com
---
 arch/x86/mm/mem_encrypt.c | 56 +++++++++++++++++++++------------------
 1 file changed, 30 insertions(+), 26 deletions(-)

diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index c290c55b632bd7..d035bce3a2b020 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -42,38 +42,42 @@ bool force_dma_unencrypted(struct device *dev)
 
 static void print_mem_encrypt_feature_info(void)
 {
-	pr_info("Memory Encryption Features active:");
+	pr_info("Memory Encryption Features active: ");
 
-	if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
-		pr_cont(" Intel TDX\n");
-		return;
-	}
-
-	pr_cont(" AMD");
+	switch (cc_vendor) {
+	case CC_VENDOR_INTEL:
+		pr_cont("Intel TDX\n");
+		break;
+	case CC_VENDOR_AMD:
+		pr_cont("AMD");
 
-	/* Secure Memory Encryption */
-	if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) {
+		/* Secure Memory Encryption */
+		if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) {
 		/*
 		 * SME is mutually exclusive with any of the SEV
 		 * features below.
-		 */
-		pr_cont(" SME\n");
-		return;
+		*/
+			pr_cont(" SME\n");
+			return;
+		}
+
+		/* Secure Encrypted Virtualization */
+		if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
+			pr_cont(" SEV");
+
+		/* Encrypted Register State */
+		if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
+			pr_cont(" SEV-ES");
+
+		/* Secure Nested Paging */
+		if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+			pr_cont(" SEV-SNP");
+
+		pr_cont("\n");
+		break;
+	default:
+		pr_cont("Unknown\n");
 	}
-
-	/* Secure Encrypted Virtualization */
-	if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
-		pr_cont(" SEV");
-
-	/* Encrypted Register State */
-	if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
-		pr_cont(" SEV-ES");
-
-	/* Secure Nested Paging */
-	if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
-		pr_cont(" SEV-SNP");
-
-	pr_cont("\n");
 }
 
 /* Architecture __weak replacement functions */

From 653b325d96f0f71b5baad0f192b0bc7f08f6243d Mon Sep 17 00:00:00 2001
From: Dani Liberman <dliberman@habana.ai>
Date: Thu, 21 Sep 2023 17:02:33 +0300
Subject: [PATCH 167/707] accel/habanalabs/gaudi2: add interrupt affinity for
 user interrupts

User interrupts are MSIx interrupts coming from Gaudi2, that have
specific range of IDs and are assigned to the sole use of the user
process that opened the Gaudi2 device (reminder: there can be only
a single user process running on Gaudi2 at any given time).

The interrupts are allocated and managed by the driver and therefore,
the user expects the driver to initialize them properly, which also
includes setting the affinity to the related CPU cores of the
device's NUMA node to get maximum performance.

Signed-off-by: Dani Liberman <dliberman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/device.c     | 32 ++++++++++++++++++++
 drivers/accel/habanalabs/common/habanalabs.h |  5 +++
 drivers/accel/habanalabs/gaudi2/gaudi2.c     |  5 +++
 3 files changed, 42 insertions(+)

diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index a73bd4be94b156..5eacbc73f1bb98 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -2801,3 +2801,35 @@ void hl_enable_err_info_capture(struct hl_error_info *captured_err_info)
 	atomic_set(&captured_err_info->cs_timeout.write_enable, 1);
 	captured_err_info->undef_opcode.write_enable = true;
 }
+
+void hl_init_cpu_for_irq(struct hl_device *hdev)
+{
+#ifdef CONFIG_NUMA
+	struct cpumask *available_mask = &hdev->irq_affinity_mask;
+	int numa_node = hdev->pdev->dev.numa_node, i;
+	static struct cpumask cpu_mask;
+
+	if (numa_node < 0)
+		return;
+
+	if (!cpumask_and(&cpu_mask, cpumask_of_node(numa_node), cpu_online_mask)) {
+		dev_err(hdev->dev, "No available affinities in current numa node\n");
+		return;
+	}
+
+	/* Remove HT siblings */
+	for_each_cpu(i, &cpu_mask)
+		cpumask_set_cpu(cpumask_first(topology_sibling_cpumask(i)), available_mask);
+#endif
+}
+
+void hl_set_irq_affinity(struct hl_device *hdev, int irq)
+{
+	if (cpumask_empty(&hdev->irq_affinity_mask)) {
+		dev_dbg(hdev->dev, "affinity mask is empty\n");
+		return;
+	}
+
+	if (irq_set_affinity_hint(irq, &hdev->irq_affinity_mask))
+		dev_err(hdev->dev, "Failed setting irq %d affinity\n", irq);
+}
diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h
index 2a900c9941fee6..b1a7b229e16160 100644
--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
@@ -3257,6 +3257,7 @@ struct hl_reset_info {
  * @clk_throttling: holds information about current/previous clock throttling events
  * @captured_err_info: holds information about errors.
  * @reset_info: holds current device reset information.
+ * @irq_affinity_mask: mask of available CPU cores for user and decoder interrupt handling.
  * @stream_master_qid_arr: pointer to array with QIDs of master streams.
  * @fw_inner_major_ver: the major of current loaded preboot inner version.
  * @fw_inner_minor_ver: the minor of current loaded preboot inner version.
@@ -3446,6 +3447,8 @@ struct hl_device {
 
 	struct hl_reset_info		reset_info;
 
+	cpumask_t			irq_affinity_mask;
+
 	u32				*stream_master_qid_arr;
 	u32				fw_inner_major_ver;
 	u32				fw_inner_minor_ver;
@@ -4032,6 +4035,8 @@ void hl_handle_critical_hw_err(struct hl_device *hdev, u16 event_id, u64 *event_
 void hl_handle_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *info);
 void hl_capture_engine_err(struct hl_device *hdev, u16 engine_id, u16 error_count);
 void hl_enable_err_info_capture(struct hl_error_info *captured_err_info);
+void hl_init_cpu_for_irq(struct hl_device *hdev);
+void hl_set_irq_affinity(struct hl_device *hdev, int irq);
 
 #ifdef CONFIG_DEBUG_FS
 
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index e0e5615ef9b0f6..fd01525b1ea204 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -4254,6 +4254,8 @@ static int gaudi2_enable_msix(struct hl_device *hdev)
 	if (gaudi2->hw_cap_initialized & HW_CAP_MSIX)
 		return 0;
 
+	hl_init_cpu_for_irq(hdev);
+
 	rc = pci_alloc_irq_vectors(hdev->pdev, GAUDI2_MSIX_ENTRIES, GAUDI2_MSIX_ENTRIES,
 					PCI_IRQ_MSIX);
 	if (rc < 0) {
@@ -4307,6 +4309,7 @@ static int gaudi2_enable_msix(struct hl_device *hdev)
 			i++, j++, user_irq_init_cnt++) {
 
 		irq = pci_irq_vector(hdev->pdev, i);
+		hl_set_irq_affinity(hdev, irq);
 		rc = request_irq(irq, hl_irq_user_interrupt_handler, 0, gaudi2_irq_name(i),
 				&hdev->user_interrupt[j]);
 		if (rc) {
@@ -4333,6 +4336,7 @@ static int gaudi2_enable_msix(struct hl_device *hdev)
 			i < GAUDI2_IRQ_NUM_USER_FIRST + user_irq_init_cnt ; i++, j++) {
 
 		irq = pci_irq_vector(hdev->pdev, i);
+		irq_set_affinity_hint(irq, NULL);
 		free_irq(irq, &hdev->user_interrupt[j]);
 	}
 	irq = pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_UNEXPECTED_ERROR);
@@ -4413,6 +4417,7 @@ static void gaudi2_disable_msix(struct hl_device *hdev)
 			k < hdev->asic_prop.user_interrupt_count ; i++, j++, k++) {
 
 		irq = pci_irq_vector(hdev->pdev, i);
+		irq_set_affinity_hint(irq, NULL);
 		free_irq(irq, &hdev->user_interrupt[j]);
 	}
 

From df3d17212018903324c4606743d74ff8b0ba3da3 Mon Sep 17 00:00:00 2001
From: Koby Elbaz <kelbaz@habana.ai>
Date: Mon, 11 Dec 2023 10:03:29 +0200
Subject: [PATCH 168/707] accel/habanalabs: increase HL_MAX_STR to 64 bytes to
 avoid warnings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix a warning of a buffer overflow:
‘snprintf’ output between 38 and 47 bytes into a destination of size 32

Signed-off-by: Koby Elbaz <kelbaz@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/habanalabs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h
index b1a7b229e16160..253873315888e1 100644
--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
@@ -2547,7 +2547,7 @@ struct hl_state_dump_specs {
  * DEVICES
  */
 
-#define HL_STR_MAX	32
+#define HL_STR_MAX	64
 
 #define HL_DEV_STS_MAX (HL_DEVICE_STATUS_LAST + 1)
 

From 6101e4be8a6f4afa0d119ca9a0510032193661d5 Mon Sep 17 00:00:00 2001
From: Tomer Tayar <ttayar@habana.ai>
Date: Thu, 14 Dec 2023 10:38:06 +0200
Subject: [PATCH 169/707] accel/habanalabs: fix DRAM BAR base address
 calculation

When the DRAM region size in the BAR is not a power of 2, calculating
the corresponding BAR base address should be done using the offset from
the DRAM start address, and not using directly the DRAM address.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/device.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index 5eacbc73f1bb98..5c46826e365929 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -55,7 +55,8 @@ static u64 hl_set_dram_bar(struct hl_device *hdev, u64 addr, struct pci_mem_regi
 	if (is_power_of_2(prop->dram_pci_bar_size))
 		bar_base_addr = addr & ~(prop->dram_pci_bar_size - 0x1ull);
 	else
-		bar_base_addr = DIV_ROUND_DOWN_ULL(addr, prop->dram_pci_bar_size) *
+		bar_base_addr = region->region_base +
+				div64_u64((addr - region->region_base), prop->dram_pci_bar_size) *
 				prop->dram_pci_bar_size;
 
 	old_base = hdev->asic_funcs->set_dram_bar_base(hdev, bar_base_addr);

From f1282caa974f6a7d6e0860b2d56104da63ae83bd Mon Sep 17 00:00:00 2001
From: Tomer Tayar <ttayar@habana.ai>
Date: Mon, 25 Dec 2023 00:28:36 +0200
Subject: [PATCH 170/707] accel/habanalabs: abort device reset for consecutive
 heartbeat failures

The mechanism of aborting device reset for consecutive fatal errors is
currently only for fatal errors that are reported by FW.
A non-responsive FW and consecutive heartbeat failures is also
considered fatal, so add them as well to this mechanism to avoid
recurring device reset in such a case.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/device.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index 5c46826e365929..cf004baf5e6213 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -1769,14 +1769,16 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 		hdev->device_cpu_disabled = false;
 		hdev->reset_info.hard_reset_pending = false;
 
+		/*
+		 * Put the device in an unusable state if there are 2 back to back resets due to
+		 * fatal errors.
+		 */
 		if (hdev->reset_info.reset_trigger_repeated &&
-				(hdev->reset_info.prev_reset_trigger ==
-						HL_DRV_RESET_FW_FATAL_ERR)) {
-			/* if there 2 back to back resets from FW,
-			 * ensure driver puts the driver in a unusable state
-			 */
+				(hdev->reset_info.prev_reset_trigger == HL_DRV_RESET_FW_FATAL_ERR ||
+						hdev->reset_info.prev_reset_trigger ==
+								HL_DRV_RESET_HEARTBEAT)) {
 			dev_crit(hdev->dev,
-				"%s Consecutive FW fatal errors received, stopping hard reset\n",
+				"%s Consecutive fatal errors, stopping hard reset\n",
 				dev_name(&(hdev)->pdev->dev));
 			rc = -EIO;
 			goto out_err;

From d545e7d8763e1f6cd75cc8de22cb414aca914c1c Mon Sep 17 00:00:00 2001
From: Farah Kassabri <fkassabri@habana.ai>
Date: Thu, 2 Nov 2023 11:53:29 +0200
Subject: [PATCH 171/707] accel/habanalabs/gaudi2: move HMMU page tables to
 device memory

Currently the HMMU page tables reside in the host memory,
which will cause host access from the device for every page walk.
This can affect PCIe bandwidth in certain scenarios.

To prevent that problem, HMMU page tables will be moved to the device
memory so the miss transaction will read the hops from there instead of
going to the host.

Signed-off-by: Farah Kassabri <fkassabri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/habanalabs.h  |  26 ++
 drivers/accel/habanalabs/common/hw_queue.c    |  17 +
 drivers/accel/habanalabs/common/mmu/Makefile  |   2 +-
 drivers/accel/habanalabs/common/mmu/mmu.c     | 223 ++++++++++-
 drivers/accel/habanalabs/common/mmu/mmu_v1.c  | 352 +++---------------
 drivers/accel/habanalabs/common/mmu/mmu_v2.c  | 338 +++++++++++++++++
 drivers/accel/habanalabs/gaudi/gaudi.c        |   1 +
 drivers/accel/habanalabs/gaudi2/gaudi2.c      | 245 ++++++++----
 drivers/accel/habanalabs/gaudi2/gaudi2P.h     |  12 +-
 .../include/hw_ip/mmu/mmu_general.h           |   2 +
 10 files changed, 836 insertions(+), 382 deletions(-)
 create mode 100644 drivers/accel/habanalabs/common/mmu/mmu_v2.c

diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h
index 253873315888e1..7397ce86b7f03a 100644
--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
@@ -443,18 +443,22 @@ enum hl_collective_mode {
  *                  a CB handle can be provided for jobs on this queue.
  *                  Otherwise, a CB address must be provided.
  * @collective_mode: collective mode of current queue
+ * @q_dram_bd_address: PQ dram address, used when PQ need to reside in DRAM.
  * @driver_only: true if only the driver is allowed to send a job to this queue,
  *               false otherwise.
  * @binned: True if the queue is binned out and should not be used
  * @supports_sync_stream: True if queue supports sync stream
+ * @dram_bd: True if the bd should be copied to dram, needed for PQ which has been allocated on dram
  */
 struct hw_queue_properties {
 	enum hl_queue_type		type;
 	enum queue_cb_alloc_flags	cb_alloc_flags;
 	enum hl_collective_mode		collective_mode;
+	u64				q_dram_bd_address;
 	u8				driver_only;
 	u8				binned;
 	u8				supports_sync_stream;
+	u8				dram_bd;
 };
 
 /**
@@ -1052,6 +1056,8 @@ struct hl_encaps_signals_mgr {
  * @collective_mode: collective mode of current queue
  * @kernel_address: holds the queue's kernel virtual address.
  * @bus_address: holds the queue's DMA address.
+ * @pq_dram_address: hold the dram address when the PQ is allocated, used when dram_bd is true in
+ *                   queue properites.
  * @pi: holds the queue's pi value.
  * @ci: holds the queue's ci value, AS CALCULATED BY THE DRIVER (not real ci).
  * @hw_queue_id: the id of the H/W queue.
@@ -1061,6 +1067,7 @@ struct hl_encaps_signals_mgr {
  * @valid: is the queue valid (we have array of 32 queues, not all of them
  *         exist).
  * @supports_sync_stream: True if queue supports sync stream
+ * @dram_bd: True if the bd should be copied to dram, needed for PQ which has been allocated on dram
  */
 struct hl_hw_queue {
 	struct hl_cs_job			**shadow_queue;
@@ -1069,6 +1076,7 @@ struct hl_hw_queue {
 	enum hl_collective_mode			collective_mode;
 	void					*kernel_address;
 	dma_addr_t				bus_address;
+	u64					pq_dram_address;
 	u32					pi;
 	atomic_t				ci;
 	u32					hw_queue_id;
@@ -1077,6 +1085,7 @@ struct hl_hw_queue {
 	u16					int_queue_len;
 	u8					valid;
 	u8					supports_sync_stream;
+	u8					dram_bd;
 };
 
 /**
@@ -3889,6 +3898,7 @@ int hl_mmu_hr_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, struct hl_mmu_hop_
 							struct hl_hr_mmu_funcs *hr_func);
 int hl_mmu_if_set_funcs(struct hl_device *hdev);
 void hl_mmu_v1_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu);
+void hl_mmu_v2_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu);
 void hl_mmu_v2_hr_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu);
 int hl_mmu_va_to_pa(struct hl_ctx *ctx, u64 virt_addr, u64 *phys_addr);
 int hl_mmu_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr,
@@ -3896,6 +3906,22 @@ int hl_mmu_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr,
 u64 hl_mmu_scramble_addr(struct hl_device *hdev, u64 addr);
 u64 hl_mmu_descramble_addr(struct hl_device *hdev, u64 addr);
 bool hl_is_dram_va(struct hl_device *hdev, u64 virt_addr);
+struct pgt_info *hl_mmu_dr_get_pgt_info(struct hl_ctx *ctx, u64 hop_addr);
+void hl_mmu_dr_free_hop(struct hl_ctx *ctx, u64 hop_addr);
+void hl_mmu_dr_free_pgt_node(struct hl_ctx *ctx, struct pgt_info *pgt_info);
+u64 hl_mmu_dr_get_phys_hop0_addr(struct hl_ctx *ctx);
+u64 hl_mmu_dr_get_hop0_addr(struct hl_ctx *ctx);
+void hl_mmu_dr_write_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val);
+void hl_mmu_dr_write_final_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val);
+void hl_mmu_dr_clear_pte(struct hl_ctx *ctx, u64 pte_addr);
+u64 hl_mmu_dr_get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr);
+void hl_mmu_dr_get_pte(struct hl_ctx *ctx, u64 hop_addr);
+int hl_mmu_dr_put_pte(struct hl_ctx *ctx, u64 hop_addr);
+u64 hl_mmu_dr_get_alloc_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte, bool *is_new_hop);
+u64 hl_mmu_dr_alloc_hop(struct hl_ctx *ctx);
+void hl_mmu_dr_flush(struct hl_ctx *ctx);
+int hl_mmu_dr_init(struct hl_device *hdev);
+void hl_mmu_dr_fini(struct hl_device *hdev);
 
 int hl_fw_load_fw_to_device(struct hl_device *hdev, const char *fw_name,
 				void __iomem *dst, u32 src_offset, u32 size);
diff --git a/drivers/accel/habanalabs/common/hw_queue.c b/drivers/accel/habanalabs/common/hw_queue.c
index d0087c0ec48c9f..3d04a7507cce3c 100644
--- a/drivers/accel/habanalabs/common/hw_queue.c
+++ b/drivers/accel/habanalabs/common/hw_queue.c
@@ -84,6 +84,8 @@ void hl_hw_queue_submit_bd(struct hl_device *hdev, struct hl_hw_queue *q,
 		u32 ctl, u32 len, u64 ptr)
 {
 	struct hl_bd *bd;
+	u64 addr;
+	int i;
 
 	bd = q->kernel_address;
 	bd += hl_pi_2_offset(q->pi);
@@ -91,7 +93,16 @@ void hl_hw_queue_submit_bd(struct hl_device *hdev, struct hl_hw_queue *q,
 	bd->len = cpu_to_le32(len);
 	bd->ptr = cpu_to_le64(ptr);
 
+	if (q->dram_bd)
+		for (i = 0 ; i < 2 ; i++) {
+			addr = q->pq_dram_address +
+			((hl_pi_2_offset(q->pi) * sizeof(struct hl_bd))	+ (i * sizeof(u64)));
+			hdev->asic_funcs->access_dev_mem(hdev, PCI_REGION_DRAM,	addr,
+						(u64 *)(bd) + i, DEBUGFS_WRITE64);
+		}
+
 	q->pi = hl_queue_inc_ptr(q->pi);
+
 	hdev->asic_funcs->ring_doorbell(hdev, q->hw_queue_id, q->pi);
 }
 
@@ -1087,12 +1098,18 @@ int hl_hw_queues_create(struct hl_device *hdev)
 		q->supports_sync_stream =
 				asic->hw_queues_props[i].supports_sync_stream;
 		q->collective_mode = asic->hw_queues_props[i].collective_mode;
+		q->dram_bd = asic->hw_queues_props[i].dram_bd;
+
 		rc = queue_init(hdev, q, i);
 		if (rc) {
 			dev_err(hdev->dev,
 				"failed to initialize queue %d\n", i);
 			goto release_queues;
 		}
+
+		/* Set DRAM PQ address for the queue if it should be at DRAM */
+		if (q->dram_bd)
+			q->pq_dram_address = asic->hw_queues_props[i].q_dram_bd_address;
 	}
 
 	return 0;
diff --git a/drivers/accel/habanalabs/common/mmu/Makefile b/drivers/accel/habanalabs/common/mmu/Makefile
index 1806c524e04aca..f4b815bf4f7d63 100644
--- a/drivers/accel/habanalabs/common/mmu/Makefile
+++ b/drivers/accel/habanalabs/common/mmu/Makefile
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0-only
 HL_COMMON_MMU_FILES := common/mmu/mmu.o common/mmu/mmu_v1.o \
-			common/mmu/mmu_v2_hr.o
+			common/mmu/mmu_v2.o common/mmu/mmu_v2_hr.o
diff --git a/drivers/accel/habanalabs/common/mmu/mmu.c b/drivers/accel/habanalabs/common/mmu/mmu.c
index b654302a68fc08..fa7919dba783c4 100644
--- a/drivers/accel/habanalabs/common/mmu/mmu.c
+++ b/drivers/accel/habanalabs/common/mmu/mmu.c
@@ -585,6 +585,8 @@ int hl_mmu_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr,
 
 int hl_mmu_if_set_funcs(struct hl_device *hdev)
 {
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+
 	if (hdev->mmu_disable)
 		return 0;
 
@@ -597,8 +599,9 @@ int hl_mmu_if_set_funcs(struct hl_device *hdev)
 	case ASIC_GAUDI2:
 	case ASIC_GAUDI2B:
 	case ASIC_GAUDI2C:
-		/* MMUs in Gaudi2 are always host resident */
-		hl_mmu_v2_hr_set_funcs(hdev, &hdev->mmu_func[MMU_HR_PGT]);
+		hl_mmu_v2_set_funcs(hdev, &hdev->mmu_func[MMU_DR_PGT]);
+		if (prop->pmmu.host_resident)
+			hl_mmu_v2_hr_set_funcs(hdev, &hdev->mmu_func[MMU_HR_PGT]);
 		break;
 	default:
 		dev_err(hdev->dev, "Unrecognized ASIC type %d\n",
@@ -1209,3 +1212,219 @@ int hl_mmu_hr_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, struct hl_mmu_hop_
 	return 0;
 }
 
+struct pgt_info *hl_mmu_dr_get_pgt_info(struct hl_ctx *ctx, u64 hop_addr)
+{
+	struct pgt_info *pgt_info = NULL;
+
+	hash_for_each_possible(ctx->mmu_shadow_hash, pgt_info, node,
+			(unsigned long) hop_addr)
+		if (hop_addr == pgt_info->shadow_addr)
+			break;
+
+	return pgt_info;
+}
+
+void hl_mmu_dr_free_hop(struct hl_ctx *ctx, u64 hop_addr)
+{
+	struct pgt_info *pgt_info = hl_mmu_dr_get_pgt_info(ctx, hop_addr);
+
+	hl_mmu_dr_free_pgt_node(ctx, pgt_info);
+}
+
+void hl_mmu_dr_free_pgt_node(struct hl_ctx *ctx, struct pgt_info *pgt_info)
+{
+	struct hl_device *hdev = ctx->hdev;
+
+	gen_pool_free(hdev->mmu_priv.dr.mmu_pgt_pool, pgt_info->phys_addr,
+			hdev->asic_prop.mmu_hop_table_size);
+	hash_del(&pgt_info->node);
+	kfree((u64 *) (uintptr_t) pgt_info->shadow_addr);
+	kfree(pgt_info);
+}
+
+u64 hl_mmu_dr_get_phys_hop0_addr(struct hl_ctx *ctx)
+{
+	return ctx->hdev->asic_prop.mmu_pgt_addr +
+			(ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size);
+}
+
+u64 hl_mmu_dr_get_hop0_addr(struct hl_ctx *ctx)
+{
+	return (u64) (uintptr_t) ctx->hdev->mmu_priv.dr.mmu_shadow_hop0 +
+			(ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size);
+}
+
+u64 hl_mmu_dr_get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr)
+{
+	u64 page_mask = ctx->hdev->asic_prop.mmu_hop_table_size - 1;
+	u64 shadow_hop_addr = shadow_addr & (~page_mask);
+	u64 pte_offset = shadow_addr & page_mask;
+	u64 phys_hop_addr;
+
+	if (shadow_hop_addr != hl_mmu_dr_get_hop0_addr(ctx))
+		phys_hop_addr = hl_mmu_dr_get_pgt_info(ctx, shadow_hop_addr)->phys_addr;
+	else
+		phys_hop_addr = hl_mmu_dr_get_phys_hop0_addr(ctx);
+
+	return phys_hop_addr + pte_offset;
+}
+
+void hl_mmu_dr_write_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val)
+{
+	u64 phys_val = hl_mmu_dr_get_phys_addr(ctx, val);
+
+	ctx->hdev->asic_funcs->write_pte(ctx->hdev, hl_mmu_dr_get_phys_addr(ctx, shadow_pte_addr),
+					phys_val);
+
+	*(u64 *) (uintptr_t) shadow_pte_addr = val;
+}
+
+void hl_mmu_dr_write_final_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val)
+{
+	ctx->hdev->asic_funcs->write_pte(ctx->hdev,
+				hl_mmu_dr_get_phys_addr(ctx, shadow_pte_addr), val);
+	*(u64 *) (uintptr_t) shadow_pte_addr = val;
+}
+
+void hl_mmu_dr_clear_pte(struct hl_ctx *ctx, u64 pte_addr)
+{
+	hl_mmu_dr_write_final_pte(ctx, pte_addr, 0);
+}
+
+void hl_mmu_dr_get_pte(struct hl_ctx *ctx, u64 hop_addr)
+{
+	hl_mmu_dr_get_pgt_info(ctx, hop_addr)->num_of_ptes++;
+}
+
+int hl_mmu_dr_put_pte(struct hl_ctx *ctx, u64 hop_addr)
+{
+	struct pgt_info *pgt_info = hl_mmu_dr_get_pgt_info(ctx, hop_addr);
+	int num_of_ptes_left;
+
+	pgt_info->num_of_ptes--;
+
+	/*
+	 * Need to save the number of ptes left because hl_mmu_free_hop might free
+	 * the pgt_info
+	 */
+	num_of_ptes_left = pgt_info->num_of_ptes;
+	if (!num_of_ptes_left)
+		hl_mmu_dr_free_pgt_node(ctx, pgt_info);
+
+	return num_of_ptes_left;
+}
+
+u64 hl_mmu_dr_alloc_hop(struct hl_ctx *ctx)
+{
+	struct hl_device *hdev = ctx->hdev;
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	struct pgt_info *pgt_info;
+	u64 phys_addr, shadow_addr;
+
+	pgt_info = kmalloc(sizeof(*pgt_info), GFP_KERNEL);
+	if (!pgt_info)
+		return ULLONG_MAX;
+
+	phys_addr = (u64) gen_pool_alloc(hdev->mmu_priv.dr.mmu_pgt_pool,
+					prop->mmu_hop_table_size);
+	if (!phys_addr) {
+		dev_err(hdev->dev, "failed to allocate page\n");
+		goto pool_add_err;
+	}
+
+	shadow_addr = (u64) (uintptr_t) kzalloc(prop->mmu_hop_table_size,
+						GFP_KERNEL);
+	if (!shadow_addr)
+		goto shadow_err;
+
+	pgt_info->phys_addr = phys_addr;
+	pgt_info->shadow_addr = shadow_addr;
+	pgt_info->ctx = ctx;
+	pgt_info->num_of_ptes = 0;
+	hash_add(ctx->mmu_shadow_hash, &pgt_info->node, shadow_addr);
+
+	return shadow_addr;
+
+shadow_err:
+	gen_pool_free(hdev->mmu_priv.dr.mmu_pgt_pool,
+			phys_addr, prop->mmu_hop_table_size);
+pool_add_err:
+	kfree(pgt_info);
+
+	return ULLONG_MAX;
+}
+
+u64 hl_mmu_dr_get_alloc_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte, bool *is_new_hop)
+{
+	u64 hop_addr = hl_mmu_get_next_hop_addr(ctx, curr_pte);
+
+	if (hop_addr == ULLONG_MAX) {
+		hop_addr = hl_mmu_dr_alloc_hop(ctx);
+		*is_new_hop = (hop_addr != ULLONG_MAX);
+	}
+
+	return hop_addr;
+}
+
+void hl_mmu_dr_flush(struct hl_ctx *ctx)
+{
+	/* flush all writes from all cores to reach PCI */
+	mb();
+	ctx->hdev->asic_funcs->read_pte(ctx->hdev, hl_mmu_dr_get_phys_hop0_addr(ctx));
+}
+
+int hl_mmu_dr_init(struct hl_device *hdev)
+{
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	int rc;
+
+	hdev->mmu_priv.dr.mmu_pgt_pool =
+			gen_pool_create(__ffs(prop->mmu_hop_table_size), -1);
+
+	if (!hdev->mmu_priv.dr.mmu_pgt_pool) {
+		dev_err(hdev->dev, "Failed to create page gen pool\n");
+		return -ENOMEM;
+	}
+
+	rc = gen_pool_add(hdev->mmu_priv.dr.mmu_pgt_pool, prop->mmu_pgt_addr +
+			prop->mmu_hop0_tables_total_size,
+			prop->dmmu.pgt_size - prop->mmu_hop0_tables_total_size,
+			-1);
+	if (rc) {
+		dev_err(hdev->dev, "Failed to add memory to page gen pool\n");
+		goto err_pool_add;
+	}
+
+	hdev->mmu_priv.dr.mmu_shadow_hop0 = kvcalloc(prop->max_asid,
+						prop->mmu_hop_table_size, GFP_KERNEL);
+	if (ZERO_OR_NULL_PTR(hdev->mmu_priv.dr.mmu_shadow_hop0)) {
+		rc = -ENOMEM;
+		goto err_pool_add;
+	}
+
+	/* MMU H/W init will be done in device hw_init() */
+
+	return 0;
+
+err_pool_add:
+	gen_pool_destroy(hdev->mmu_priv.dr.mmu_pgt_pool);
+
+	return rc;
+}
+
+void hl_mmu_dr_fini(struct hl_device *hdev)
+{
+	/* MMU H/W fini was already done in device hw_fini() */
+
+	if (ZERO_OR_NULL_PTR(hdev->mmu_priv.dr.mmu_shadow_hop0))
+		return;
+
+	kvfree(hdev->mmu_priv.dr.mmu_shadow_hop0);
+	gen_pool_destroy(hdev->mmu_priv.dr.mmu_pgt_pool);
+
+	/* Make sure that if we arrive here again without init was
+	 * called we won't cause kernel panic. This can happen for
+	 * example if we fail during hard reset code at certain points
+	 */
+	hdev->mmu_priv.dr.mmu_shadow_hop0 = NULL;
+}
diff --git a/drivers/accel/habanalabs/common/mmu/mmu_v1.c b/drivers/accel/habanalabs/common/mmu/mmu_v1.c
index d925dc4dd09725..64b5c8fbb166d9 100644
--- a/drivers/accel/habanalabs/common/mmu/mmu_v1.c
+++ b/drivers/accel/habanalabs/common/mmu/mmu_v1.c
@@ -12,166 +12,6 @@
 
 #define MMU_V1_MAX_HOPS	(MMU_HOP4 + 1)
 
-static inline u64 get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr);
-
-static struct pgt_info *get_pgt_info(struct hl_ctx *ctx, u64 hop_addr)
-{
-	struct pgt_info *pgt_info = NULL;
-
-	hash_for_each_possible(ctx->mmu_shadow_hash, pgt_info, node,
-				(unsigned long) hop_addr)
-		if (hop_addr == pgt_info->shadow_addr)
-			break;
-
-	return pgt_info;
-}
-
-static void _free_hop(struct hl_ctx *ctx, struct pgt_info *pgt_info)
-{
-	struct hl_device *hdev = ctx->hdev;
-
-	gen_pool_free(hdev->mmu_priv.dr.mmu_pgt_pool, pgt_info->phys_addr,
-			hdev->asic_prop.mmu_hop_table_size);
-	hash_del(&pgt_info->node);
-	kfree((u64 *) (uintptr_t) pgt_info->shadow_addr);
-	kfree(pgt_info);
-}
-
-static void free_hop(struct hl_ctx *ctx, u64 hop_addr)
-{
-	struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr);
-
-	_free_hop(ctx, pgt_info);
-}
-
-static u64 alloc_hop(struct hl_ctx *ctx)
-{
-	struct hl_device *hdev = ctx->hdev;
-	struct asic_fixed_properties *prop = &hdev->asic_prop;
-	struct pgt_info *pgt_info;
-	u64 phys_addr, shadow_addr;
-
-	pgt_info = kmalloc(sizeof(*pgt_info), GFP_KERNEL);
-	if (!pgt_info)
-		return ULLONG_MAX;
-
-	phys_addr = (u64) gen_pool_alloc(hdev->mmu_priv.dr.mmu_pgt_pool,
-					prop->mmu_hop_table_size);
-	if (!phys_addr) {
-		dev_err(hdev->dev, "failed to allocate page\n");
-		goto pool_add_err;
-	}
-
-	shadow_addr = (u64) (uintptr_t) kzalloc(prop->mmu_hop_table_size,
-						GFP_KERNEL);
-	if (!shadow_addr)
-		goto shadow_err;
-
-	pgt_info->phys_addr = phys_addr;
-	pgt_info->shadow_addr = shadow_addr;
-	pgt_info->ctx = ctx;
-	pgt_info->num_of_ptes = 0;
-	hash_add(ctx->mmu_shadow_hash, &pgt_info->node, shadow_addr);
-
-	return shadow_addr;
-
-shadow_err:
-	gen_pool_free(hdev->mmu_priv.dr.mmu_pgt_pool, phys_addr,
-			prop->mmu_hop_table_size);
-pool_add_err:
-	kfree(pgt_info);
-
-	return ULLONG_MAX;
-}
-
-static inline u64 get_phys_hop0_addr(struct hl_ctx *ctx)
-{
-	return ctx->hdev->asic_prop.mmu_pgt_addr +
-			(ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size);
-}
-
-static inline u64 get_hop0_addr(struct hl_ctx *ctx)
-{
-	return (u64) (uintptr_t) ctx->hdev->mmu_priv.dr.mmu_shadow_hop0 +
-			(ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size);
-}
-
-static void flush(struct hl_ctx *ctx)
-{
-	/* flush all writes from all cores to reach PCI */
-	mb();
-	ctx->hdev->asic_funcs->read_pte(ctx->hdev, get_phys_hop0_addr(ctx));
-}
-
-/* transform the value to physical address when writing to H/W */
-static inline void write_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val)
-{
-	/*
-	 * The value to write is actually the address of the next shadow hop +
-	 * flags at the 12 LSBs.
-	 * Hence in order to get the value to write to the physical PTE, we
-	 * clear the 12 LSBs and translate the shadow hop to its associated
-	 * physical hop, and add back the original 12 LSBs.
-	 */
-	u64 phys_val = get_phys_addr(ctx, val & HOP_PHYS_ADDR_MASK) |
-				(val & FLAGS_MASK);
-
-	ctx->hdev->asic_funcs->write_pte(ctx->hdev,
-					get_phys_addr(ctx, shadow_pte_addr),
-					phys_val);
-
-	*(u64 *) (uintptr_t) shadow_pte_addr = val;
-}
-
-/* do not transform the value to physical address when writing to H/W */
-static inline void write_final_pte(struct hl_ctx *ctx, u64 shadow_pte_addr,
-					u64 val)
-{
-	ctx->hdev->asic_funcs->write_pte(ctx->hdev,
-					get_phys_addr(ctx, shadow_pte_addr),
-					val);
-	*(u64 *) (uintptr_t) shadow_pte_addr = val;
-}
-
-/* clear the last and present bits */
-static inline void clear_pte(struct hl_ctx *ctx, u64 pte_addr)
-{
-	/* no need to transform the value to physical address */
-	write_final_pte(ctx, pte_addr, 0);
-}
-
-static inline void get_pte(struct hl_ctx *ctx, u64 hop_addr)
-{
-	get_pgt_info(ctx, hop_addr)->num_of_ptes++;
-}
-
-/*
- * put_pte - decrement the num of ptes and free the hop if possible
- *
- * @ctx: pointer to the context structure
- * @hop_addr: addr of the hop
- *
- * This function returns the number of ptes left on this hop. If the number is
- * 0, it means the pte was freed.
- */
-static inline int put_pte(struct hl_ctx *ctx, u64 hop_addr)
-{
-	struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr);
-	int num_of_ptes_left;
-
-	pgt_info->num_of_ptes--;
-
-	/*
-	 * Need to save the number of ptes left because free_hop might free
-	 * the pgt_info
-	 */
-	num_of_ptes_left = pgt_info->num_of_ptes;
-	if (!num_of_ptes_left)
-		_free_hop(ctx, pgt_info);
-
-	return num_of_ptes_left;
-}
-
 static inline u64 get_hop_pte_addr(struct hl_ctx *ctx, struct hl_mmu_properties *mmu_prop,
 					u64 *hop_addr_arr, u64 virt_addr, enum mmu_hop_num hop_idx)
 {
@@ -183,35 +23,6 @@ static inline u64 get_hop_pte_addr(struct hl_ctx *ctx, struct hl_mmu_properties
 			ctx->hdev->asic_prop.mmu_pte_size * ((virt_addr & mask) >> shift);
 }
 
-static inline u64 get_alloc_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte,
-						bool *is_new_hop)
-{
-	u64 hop_addr = hl_mmu_get_next_hop_addr(ctx, curr_pte);
-
-	if (hop_addr == ULLONG_MAX) {
-		hop_addr = alloc_hop(ctx);
-		*is_new_hop = (hop_addr != ULLONG_MAX);
-	}
-
-	return hop_addr;
-}
-
-/* translates shadow address inside hop to a physical address */
-static inline u64 get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr)
-{
-	u64 page_mask = (ctx->hdev->asic_prop.mmu_hop_table_size - 1);
-	u64 shadow_hop_addr = shadow_addr & ~page_mask;
-	u64 pte_offset = shadow_addr & page_mask;
-	u64 phys_hop_addr;
-
-	if (shadow_hop_addr != get_hop0_addr(ctx))
-		phys_hop_addr = get_pgt_info(ctx, shadow_hop_addr)->phys_addr;
-	else
-		phys_hop_addr = get_phys_hop0_addr(ctx);
-
-	return phys_hop_addr + pte_offset;
-}
-
 static int dram_default_mapping_init(struct hl_ctx *ctx)
 {
 	struct hl_device *hdev = ctx->hdev;
@@ -236,9 +47,9 @@ static int dram_default_mapping_init(struct hl_ctx *ctx)
 	if (!ctx->dram_default_hops)
 		return -ENOMEM;
 
-	hop0_addr = get_hop0_addr(ctx);
+	hop0_addr = hl_mmu_dr_get_hop0_addr(ctx);
 
-	hop1_addr = alloc_hop(ctx);
+	hop1_addr = hl_mmu_dr_alloc_hop(ctx);
 	if (hop1_addr == ULLONG_MAX) {
 		dev_err(hdev->dev, "failed to alloc hop 1\n");
 		rc = -ENOMEM;
@@ -247,7 +58,7 @@ static int dram_default_mapping_init(struct hl_ctx *ctx)
 
 	ctx->dram_default_hops[total_hops - 1] = hop1_addr;
 
-	hop2_addr = alloc_hop(ctx);
+	hop2_addr = hl_mmu_dr_alloc_hop(ctx);
 	if (hop2_addr == ULLONG_MAX) {
 		dev_err(hdev->dev, "failed to alloc hop 2\n");
 		rc = -ENOMEM;
@@ -257,7 +68,7 @@ static int dram_default_mapping_init(struct hl_ctx *ctx)
 	ctx->dram_default_hops[total_hops - 2] = hop2_addr;
 
 	for (i = 0 ; i < num_of_hop3 ; i++) {
-		ctx->dram_default_hops[i] = alloc_hop(ctx);
+		ctx->dram_default_hops[i] = hl_mmu_dr_alloc_hop(ctx);
 		if (ctx->dram_default_hops[i] == ULLONG_MAX) {
 			dev_err(hdev->dev, "failed to alloc hop 3, i: %d\n", i);
 			rc = -ENOMEM;
@@ -268,18 +79,18 @@ static int dram_default_mapping_init(struct hl_ctx *ctx)
 
 	/* need only pte 0 in hops 0 and 1 */
 	pte_val = (hop1_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
-	write_pte(ctx, hop0_addr, pte_val);
+	hl_mmu_dr_write_pte(ctx, hop0_addr, pte_val);
 
 	pte_val = (hop2_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
-	write_pte(ctx, hop1_addr, pte_val);
-	get_pte(ctx, hop1_addr);
+	hl_mmu_dr_write_pte(ctx, hop1_addr, pte_val);
+	hl_mmu_dr_get_pte(ctx, hop1_addr);
 
 	hop2_pte_addr = hop2_addr;
 	for (i = 0 ; i < num_of_hop3 ; i++) {
 		pte_val = (ctx->dram_default_hops[i] & HOP_PHYS_ADDR_MASK) |
 				PAGE_PRESENT_MASK;
-		write_pte(ctx, hop2_pte_addr, pte_val);
-		get_pte(ctx, hop2_addr);
+		hl_mmu_dr_write_pte(ctx, hop2_pte_addr, pte_val);
+		hl_mmu_dr_get_pte(ctx, hop2_addr);
 		hop2_pte_addr += HL_PTE_SIZE;
 	}
 
@@ -289,23 +100,23 @@ static int dram_default_mapping_init(struct hl_ctx *ctx)
 	for (i = 0 ; i < num_of_hop3 ; i++) {
 		hop3_pte_addr = ctx->dram_default_hops[i];
 		for (j = 0 ; j < HOP_PTE_ENTRIES_512 ; j++) {
-			write_final_pte(ctx, hop3_pte_addr, pte_val);
-			get_pte(ctx, ctx->dram_default_hops[i]);
+			hl_mmu_dr_write_final_pte(ctx, hop3_pte_addr, pte_val);
+			hl_mmu_dr_get_pte(ctx, ctx->dram_default_hops[i]);
 			hop3_pte_addr += HL_PTE_SIZE;
 		}
 	}
 
-	flush(ctx);
+	hl_mmu_dr_flush(ctx);
 
 	return 0;
 
 hop3_err:
 	for (i = 0 ; i < hop3_allocated ; i++)
-		free_hop(ctx, ctx->dram_default_hops[i]);
+		hl_mmu_dr_free_hop(ctx, ctx->dram_default_hops[i]);
 
-	free_hop(ctx, hop2_addr);
+	hl_mmu_dr_free_hop(ctx, hop2_addr);
 hop2_err:
-	free_hop(ctx, hop1_addr);
+	hl_mmu_dr_free_hop(ctx, hop1_addr);
 hop1_err:
 	kfree(ctx->dram_default_hops);
 
@@ -329,7 +140,7 @@ static void dram_default_mapping_fini(struct hl_ctx *ctx)
 	do_div(num_of_hop3, prop->dram_page_size);
 	do_div(num_of_hop3, HOP_PTE_ENTRIES_512);
 
-	hop0_addr = get_hop0_addr(ctx);
+	hop0_addr = hl_mmu_dr_get_hop0_addr(ctx);
 	/* add hop1 and hop2 */
 	total_hops = num_of_hop3 + 2;
 	hop1_addr = ctx->dram_default_hops[total_hops - 1];
@@ -338,101 +149,26 @@ static void dram_default_mapping_fini(struct hl_ctx *ctx)
 	for (i = 0 ; i < num_of_hop3 ; i++) {
 		hop3_pte_addr = ctx->dram_default_hops[i];
 		for (j = 0 ; j < HOP_PTE_ENTRIES_512 ; j++) {
-			clear_pte(ctx, hop3_pte_addr);
-			put_pte(ctx, ctx->dram_default_hops[i]);
+			hl_mmu_dr_clear_pte(ctx, hop3_pte_addr);
+			hl_mmu_dr_put_pte(ctx, ctx->dram_default_hops[i]);
 			hop3_pte_addr += HL_PTE_SIZE;
 		}
 	}
 
 	hop2_pte_addr = hop2_addr;
 	for (i = 0 ; i < num_of_hop3 ; i++) {
-		clear_pte(ctx, hop2_pte_addr);
-		put_pte(ctx, hop2_addr);
+		hl_mmu_dr_clear_pte(ctx, hop2_pte_addr);
+		hl_mmu_dr_put_pte(ctx, hop2_addr);
 		hop2_pte_addr += HL_PTE_SIZE;
 	}
 
-	clear_pte(ctx, hop1_addr);
-	put_pte(ctx, hop1_addr);
-	clear_pte(ctx, hop0_addr);
+	hl_mmu_dr_clear_pte(ctx, hop1_addr);
+	hl_mmu_dr_put_pte(ctx, hop1_addr);
+	hl_mmu_dr_clear_pte(ctx, hop0_addr);
 
 	kfree(ctx->dram_default_hops);
 
-	flush(ctx);
-}
-
-/**
- * hl_mmu_v1_init() - initialize the MMU module.
- * @hdev: habanalabs device structure.
- *
- * This function does the following:
- * - Create a pool of pages for pgt_infos.
- * - Create a shadow table for pgt
- *
- * Return: 0 for success, non-zero for failure.
- */
-static int hl_mmu_v1_init(struct hl_device *hdev)
-{
-	struct asic_fixed_properties *prop = &hdev->asic_prop;
-	int rc;
-
-	hdev->mmu_priv.dr.mmu_pgt_pool =
-			gen_pool_create(__ffs(prop->mmu_hop_table_size), -1);
-
-	if (!hdev->mmu_priv.dr.mmu_pgt_pool) {
-		dev_err(hdev->dev, "Failed to create page gen pool\n");
-		return -ENOMEM;
-	}
-
-	rc = gen_pool_add(hdev->mmu_priv.dr.mmu_pgt_pool, prop->mmu_pgt_addr +
-			prop->mmu_hop0_tables_total_size,
-			prop->mmu_pgt_size - prop->mmu_hop0_tables_total_size,
-			-1);
-	if (rc) {
-		dev_err(hdev->dev, "Failed to add memory to page gen pool\n");
-		goto err_pool_add;
-	}
-
-	hdev->mmu_priv.dr.mmu_shadow_hop0 = kvcalloc(prop->max_asid, prop->mmu_hop_table_size,
-										GFP_KERNEL);
-	if (ZERO_OR_NULL_PTR(hdev->mmu_priv.dr.mmu_shadow_hop0)) {
-		rc = -ENOMEM;
-		goto err_pool_add;
-	}
-
-	/* MMU H/W init will be done in device hw_init() */
-
-	return 0;
-
-err_pool_add:
-	gen_pool_destroy(hdev->mmu_priv.dr.mmu_pgt_pool);
-
-	return rc;
-}
-
-/**
- * hl_mmu_v1_fini() - release the MMU module.
- * @hdev: habanalabs device structure.
- *
- * This function does the following:
- * - Disable MMU in H/W.
- * - Free the pgt_infos pool.
- *
- * All contexts should be freed before calling this function.
- */
-static void hl_mmu_v1_fini(struct hl_device *hdev)
-{
-	/* MMU H/W fini was already done in device hw_fini() */
-
-	if (!ZERO_OR_NULL_PTR(hdev->mmu_priv.dr.mmu_shadow_hop0)) {
-		kvfree(hdev->mmu_priv.dr.mmu_shadow_hop0);
-		gen_pool_destroy(hdev->mmu_priv.dr.mmu_pgt_pool);
-
-		/* Make sure that if we arrive here again without init was
-		 * called we won't cause kernel panic. This can happen for
-		 * example if we fail during hard reset code at certain points
-		 */
-		hdev->mmu_priv.dr.mmu_shadow_hop0 = NULL;
-	}
+	hl_mmu_dr_flush(ctx);
 }
 
 /**
@@ -476,7 +212,7 @@ static void hl_mmu_v1_ctx_fini(struct hl_ctx *ctx)
 		dev_err_ratelimited(hdev->dev,
 			"pgt_info of addr 0x%llx of asid %d was not destroyed, num_ptes: %d\n",
 			pgt_info->phys_addr, ctx->asid, pgt_info->num_of_ptes);
-		_free_hop(ctx, pgt_info);
+		hl_mmu_dr_free_pgt_node(ctx, pgt_info);
 	}
 }
 
@@ -495,7 +231,7 @@ static int hl_mmu_v1_unmap(struct hl_ctx *ctx,
 
 	for (hop_idx = MMU_HOP0; hop_idx < MMU_HOP4; hop_idx++) {
 		if (hop_idx == MMU_HOP0) {
-			hop_addr[hop_idx] = get_hop0_addr(ctx);
+			hop_addr[hop_idx] = hl_mmu_dr_get_hop0_addr(ctx);
 		} else {
 			hop_addr[hop_idx] = hl_mmu_get_next_hop_addr(ctx, curr_pte);
 			if (hop_addr[hop_idx] == ULLONG_MAX)
@@ -546,30 +282,30 @@ static int hl_mmu_v1_unmap(struct hl_ctx *ctx,
 		}
 
 		hop_idx = MMU_HOP3;
-		write_final_pte(ctx, hop_pte_addr[hop_idx], default_pte);
-		put_pte(ctx, hop_addr[hop_idx]);
+		hl_mmu_dr_write_final_pte(ctx, hop_pte_addr[hop_idx], default_pte);
+		hl_mmu_dr_put_pte(ctx, hop_addr[hop_idx]);
 	} else {
 		if (!(curr_pte & PAGE_PRESENT_MASK))
 			goto not_mapped;
 
 		if (hop_addr[MMU_HOP4])
-			clear_pte(ctx, hop_pte_addr[MMU_HOP4]);
+			hl_mmu_dr_clear_pte(ctx, hop_pte_addr[MMU_HOP4]);
 		else
-			clear_pte(ctx, hop_pte_addr[MMU_HOP3]);
+			hl_mmu_dr_clear_pte(ctx, hop_pte_addr[MMU_HOP3]);
 
-		if (hop_addr[MMU_HOP4] && !put_pte(ctx, hop_addr[MMU_HOP4]))
+		if (hop_addr[MMU_HOP4] && !hl_mmu_dr_put_pte(ctx, hop_addr[MMU_HOP4]))
 			clear_hop3 = true;
 
 		if (!clear_hop3)
 			goto mapped;
 
 		for (hop_idx = MMU_HOP3; hop_idx >= 0; hop_idx--) {
-			clear_pte(ctx, hop_pte_addr[hop_idx]);
+			hl_mmu_dr_clear_pte(ctx, hop_pte_addr[hop_idx]);
 
 			if (hop_idx == MMU_HOP0)
 				break;
 
-			if (put_pte(ctx, hop_addr[hop_idx]))
+			if (hl_mmu_dr_put_pte(ctx, hop_addr[hop_idx]))
 				goto mapped;
 		}
 	}
@@ -616,10 +352,10 @@ static int hl_mmu_v1_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
 
 	for (hop_idx = MMU_HOP0; hop_idx < num_hops; hop_idx++) {
 		if (hop_idx == MMU_HOP0) {
-			hop_addr[hop_idx] = get_hop0_addr(ctx);
+			hop_addr[hop_idx] = hl_mmu_dr_get_hop0_addr(ctx);
 		} else {
 			hop_addr[hop_idx] =
-					get_alloc_next_hop_addr(ctx, curr_pte, &hop_new[hop_idx]);
+				hl_mmu_dr_get_alloc_next_hop_addr(ctx, curr_pte, &hop_new[hop_idx]);
 			if (hop_addr[hop_idx] == ULLONG_MAX)
 				goto err;
 		}
@@ -666,27 +402,27 @@ static int hl_mmu_v1_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
 	curr_pte = (phys_addr & HOP_PHYS_ADDR_MASK) | mmu_prop->last_mask
 			| PAGE_PRESENT_MASK;
 
-	write_final_pte(ctx, hop_pte_addr[num_hops - 1], curr_pte);
+	hl_mmu_dr_write_final_pte(ctx, hop_pte_addr[num_hops - 1], curr_pte);
 
 	for (hop_idx = MMU_HOP1; hop_idx < num_hops; hop_idx++) {
 		prev_hop = hop_idx - 1;
 
 		if (hop_new[hop_idx]) {
 			curr_pte = (hop_addr[hop_idx] & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
-			write_pte(ctx, hop_pte_addr[prev_hop], curr_pte);
+			hl_mmu_dr_write_pte(ctx, hop_pte_addr[prev_hop], curr_pte);
 			if (hop_idx != MMU_HOP1)
-				get_pte(ctx, hop_addr[prev_hop]);
+				hl_mmu_dr_get_pte(ctx, hop_addr[prev_hop]);
 		}
 	}
 
-	get_pte(ctx, hop_addr[num_hops - 1]);
+	hl_mmu_dr_get_pte(ctx, hop_addr[num_hops - 1]);
 
 	return 0;
 
 err:
 	for (hop_idx = num_hops; hop_idx > MMU_HOP0; hop_idx--) {
 		if (hop_new[hop_idx])
-			free_hop(ctx, hop_addr[hop_idx]);
+			hl_mmu_dr_free_hop(ctx, hop_addr[hop_idx]);
 	}
 
 	return rc;
@@ -752,7 +488,7 @@ static int hl_mmu_v1_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr,
 	if (is_huge)
 		used_hops--;
 
-	hops->hop_info[0].hop_addr = get_phys_hop0_addr(ctx);
+	hops->hop_info[0].hop_addr = hl_mmu_dr_get_phys_hop0_addr(ctx);
 	hops->hop_info[0].hop_pte_addr =
 			hl_mmu_get_hop_pte_phys_addr(ctx, mmu_prop, 0,
 					hops->hop_info[0].hop_addr, virt_addr);
@@ -801,13 +537,13 @@ static int hl_mmu_v1_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr,
  */
 void hl_mmu_v1_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu)
 {
-	mmu->init = hl_mmu_v1_init;
-	mmu->fini = hl_mmu_v1_fini;
+	mmu->init = hl_mmu_dr_init;
+	mmu->fini = hl_mmu_dr_fini;
 	mmu->ctx_init = hl_mmu_v1_ctx_init;
 	mmu->ctx_fini = hl_mmu_v1_ctx_fini;
 	mmu->map = hl_mmu_v1_map;
 	mmu->unmap = hl_mmu_v1_unmap;
-	mmu->flush = flush;
+	mmu->flush = hl_mmu_dr_flush;
 	mmu->swap_out = hl_mmu_v1_swap_out;
 	mmu->swap_in = hl_mmu_v1_swap_in;
 	mmu->get_tlb_info = hl_mmu_v1_get_tlb_info;
diff --git a/drivers/accel/habanalabs/common/mmu/mmu_v2.c b/drivers/accel/habanalabs/common/mmu/mmu_v2.c
new file mode 100644
index 00000000000000..4bc0268fff1cf0
--- /dev/null
+++ b/drivers/accel/habanalabs/common/mmu/mmu_v2.c
@@ -0,0 +1,338 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright 2016-2020 HabanaLabs, Ltd.
+ * All Rights Reserved.
+ */
+
+#include "../habanalabs.h"
+#include "../../include/hw_ip/mmu/mmu_general.h"
+#include "../../include/hw_ip/mmu/mmu_v2_0.h"
+
+#include <linux/slab.h>
+
+/**
+ * hl_mmu_v2_ctx_init() - initialize a context for using the MMU module.
+ * @ctx: pointer to the context structure to initialize.
+ *
+ * Initialize a mutex to protect the concurrent mapping flow, a hash to hold all
+ * page tables hops related to this context.
+ * Return: 0 on success, non-zero otherwise.
+ */
+static int hl_mmu_v2_ctx_init(struct hl_ctx *ctx)
+{
+	hash_init(ctx->mmu_shadow_hash);
+
+	return 0;
+}
+
+/*
+ * hl_mmu_v2_ctx_fini - disable a ctx from using the mmu module
+ *
+ * @ctx: pointer to the context structure
+ *
+ * This function does the following:
+ * - Free any pgts which were not freed yet
+ * - Free the mutex
+ * - Free DRAM default page mapping hops
+ */
+static void hl_mmu_v2_ctx_fini(struct hl_ctx *ctx)
+{
+	struct hl_device *hdev = ctx->hdev;
+	struct pgt_info *pgt_info;
+	struct hlist_node *tmp;
+	int i;
+
+	if (!hash_empty(ctx->mmu_shadow_hash))
+		dev_err(hdev->dev, "ctx %d is freed while it has pgts in use\n",
+			ctx->asid);
+
+	hash_for_each_safe(ctx->mmu_shadow_hash, i, tmp, pgt_info, node) {
+		dev_err_ratelimited(hdev->dev,
+			"pgt_info of addr 0x%llx of asid %d was not destroyed, num_ptes: %d\n",
+			pgt_info->phys_addr, ctx->asid, pgt_info->num_of_ptes);
+		hl_mmu_dr_free_pgt_node(ctx, pgt_info);
+	}
+}
+
+static int hl_mmu_v2_unmap(struct hl_ctx *ctx,	u64 virt_addr, bool is_dram_addr)
+{
+	u64 hop_addr[MMU_ARCH_6_HOPS] = { 0 }, hop_pte_addr[MMU_ARCH_6_HOPS] = { 0 }, curr_pte,
+							scrambled_virt_addr;
+	struct asic_fixed_properties *prop = &ctx->hdev->asic_prop;
+	struct hl_device *hdev = ctx->hdev;
+	struct hl_mmu_properties *mmu_prop;
+	bool is_huge = false;
+	int i, hop_last;
+
+	/* device resident in V2 are allowed only for HMMU */
+	if (!is_dram_addr)
+		return -EINVAL;
+
+	mmu_prop = &prop->dmmu;
+
+	hop_last = mmu_prop->num_hops - 1;
+
+	scrambled_virt_addr = hdev->asic_funcs->scramble_addr(hdev, virt_addr);
+
+	hop_addr[0] = hl_mmu_dr_get_hop0_addr(ctx);
+	hop_pte_addr[0] = hl_mmu_get_hop_pte_phys_addr(ctx, mmu_prop, 0,
+					hop_addr[0], scrambled_virt_addr);
+	if (hop_pte_addr[0] == U64_MAX)
+		return -EFAULT;
+
+	curr_pte = *(u64 *) (uintptr_t) hop_pte_addr[0];
+
+	for (i = 1 ; i < mmu_prop->num_hops ; i++) {
+		hop_addr[i] = hl_mmu_get_next_hop_addr(ctx, curr_pte);
+		if (hop_addr[i] == ULLONG_MAX)
+			goto not_mapped;
+
+		hop_pte_addr[i] = hl_mmu_get_hop_pte_phys_addr(ctx, mmu_prop, i,
+					hop_addr[i], scrambled_virt_addr);
+		if (hop_pte_addr[i] == U64_MAX)
+			return -EFAULT;
+
+		curr_pte = *(u64 *) (uintptr_t) hop_pte_addr[i];
+
+		if ((i <= hop_last) && (curr_pte & mmu_prop->last_mask)) {
+			hop_last = i;
+			is_huge = true;
+			break;
+		}
+	}
+
+	if (is_dram_addr && !is_huge) {
+		dev_err(hdev->dev, "DRAM unmapping should use huge pages only\n");
+		return -EFAULT;
+	}
+
+	if (!(curr_pte & PAGE_PRESENT_MASK))
+		goto not_mapped;
+
+	for (i = hop_last ; i > 0 ; i--) {
+		hl_mmu_dr_clear_pte(ctx, hop_pte_addr[i]);
+		if (hl_mmu_dr_put_pte(ctx, hop_addr[i]))
+			goto mapped;
+	}
+	hl_mmu_dr_clear_pte(ctx, hop_pte_addr[0]);
+
+mapped:
+	return 0;
+
+not_mapped:
+	dev_err(hdev->dev, "virt addr 0x%llx is not mapped to phys addr\n",
+		virt_addr);
+
+	return -EINVAL;
+}
+
+static int hl_mmu_v2_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
+							u32 page_size, bool is_dram_addr)
+{
+	u64 hop_addr[MMU_ARCH_6_HOPS] = { 0 }, hop_pte_addr[MMU_ARCH_6_HOPS] = { 0 },
+			curr_pte = 0, scrambled_virt_addr, scrambled_phys_addr;
+	struct asic_fixed_properties *prop = &ctx->hdev->asic_prop;
+	bool hop_new[MMU_ARCH_6_HOPS] = { false };
+	struct hl_device *hdev = ctx->hdev;
+	struct hl_mmu_properties *mmu_prop;
+	int rc, i, hop_last;
+
+	/* device resident in V2 are allowed only for HMMU */
+	if (!is_dram_addr)
+		return -EINVAL;
+
+	mmu_prop = &prop->dmmu;
+
+	hop_last = mmu_prop->num_hops - 1;
+
+	scrambled_virt_addr = hdev->asic_funcs->scramble_addr(hdev, virt_addr);
+	scrambled_phys_addr = hdev->asic_funcs->scramble_addr(hdev, phys_addr);
+
+	/* First hop is preallocated therefore it is treated differently  */
+	hop_addr[0] = hl_mmu_dr_get_hop0_addr(ctx);
+	hop_pte_addr[0] = hl_mmu_get_hop_pte_phys_addr(ctx, mmu_prop, 0,
+						hop_addr[0], scrambled_virt_addr);
+	curr_pte = *(u64 *) (uintptr_t) hop_pte_addr[0];
+
+	/* Handle hop1 to hop_last */
+	for (i = 1 ; i <= hop_last ; i++) {
+		hop_addr[i] = hl_mmu_dr_get_alloc_next_hop_addr(ctx, curr_pte, &hop_new[i]);
+		if (hop_addr[i] == ULLONG_MAX) {
+			rc = -ENOMEM;
+			goto err;
+		}
+
+		hop_pte_addr[i] = hl_mmu_get_hop_pte_phys_addr(ctx, mmu_prop, i,
+					hop_addr[i], scrambled_virt_addr);
+		if (hop_pte_addr[i] == U64_MAX) {
+			rc = -EINVAL;
+			goto err;
+		}
+
+		if (!hop_pte_addr[i]) {
+			rc = -EINVAL;
+			goto err;
+		}
+
+		curr_pte = *(u64 *) (uintptr_t) hop_pte_addr[i];
+	}
+
+	if (curr_pte & PAGE_PRESENT_MASK) {
+		dev_err(hdev->dev,
+			"mapping already exists for virt_addr 0x%llx\n",
+				virt_addr);
+
+		for (i = 0 ; i <= hop_last ; i++)
+			dev_dbg(hdev->dev, "hop%d pte: 0x%llx (0x%llx)\n",
+				i, *(u64 *) (uintptr_t) hop_pte_addr[i],
+				hop_pte_addr[i]);
+
+		rc = -EINVAL;
+		goto err;
+	}
+
+	curr_pte = (scrambled_phys_addr & HOP_PHYS_ADDR_MASK)
+					| mmu_prop->last_mask | PAGE_PRESENT_MASK;
+
+	/* Write the PTEs */
+	hl_mmu_dr_write_final_pte(ctx, hop_pte_addr[hop_last], curr_pte);
+
+	/* for each new hop, add its address to the table of previous-hop */
+	for (i = 1 ; i <= hop_last ; i++) {
+		if (hop_new[i]) {
+			curr_pte = (hop_addr[i] & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
+			hl_mmu_dr_write_pte(ctx, hop_pte_addr[i - 1], curr_pte);
+
+			if (i - 1)
+				hl_mmu_dr_get_pte(ctx, hop_addr[i - 1]);
+		}
+	}
+	hl_mmu_dr_get_pte(ctx, hop_addr[hop_last]);
+
+	return 0;
+
+err:
+	for (i = 1 ; i <= hop_last ; i++)
+		if (hop_new[i] && (hop_addr[i] != U64_MAX))
+			hl_mmu_dr_free_hop(ctx, hop_addr[i]);
+
+	return rc;
+}
+
+/*
+ * hl_mmu_v2_swap_out - marks all mapping of the given ctx as swapped out
+ *
+ * @ctx: pointer to the context structure
+ *
+ */
+static void hl_mmu_v2_swap_out(struct hl_ctx *ctx)
+{
+
+}
+
+/*
+ * hl_mmu_v2_swap_in - marks all mapping of the given ctx as swapped in
+ *
+ * @ctx: pointer to the context structure
+ *
+ */
+static void hl_mmu_v2_swap_in(struct hl_ctx *ctx)
+{
+
+}
+
+static int hl_mmu_v2_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, struct hl_mmu_hop_info *hops)
+{
+	struct asic_fixed_properties *prop = &ctx->hdev->asic_prop;
+	struct hl_device *hdev = ctx->hdev;
+	struct hl_mmu_properties *mmu_prop;
+	bool is_dram_addr;
+	int i;
+
+	is_dram_addr = hl_mem_area_inside_range(virt_addr, prop->dmmu.page_size,
+						prop->dmmu.start_addr,
+						prop->dmmu.end_addr);
+
+	/* device resident in V2 are allowed only for HMMU */
+	if (!is_dram_addr)
+		return -EINVAL;
+
+	mmu_prop = &prop->dmmu;
+	hops->range_type = HL_VA_RANGE_TYPE_DRAM;
+
+	hops->scrambled_vaddr = hdev->asic_funcs->scramble_addr(hdev, virt_addr);
+
+	hops->hop_info[0].hop_addr = hl_mmu_dr_get_phys_hop0_addr(ctx);
+	hops->hop_info[0].hop_pte_addr = hl_mmu_get_hop_pte_phys_addr(ctx, mmu_prop, 0,
+						hops->hop_info[0].hop_addr,
+							hops->scrambled_vaddr);
+	if (hops->hop_info[0].hop_pte_addr == U64_MAX)
+		return -EFAULT;
+
+	hops->hop_info[0].hop_pte_val = hdev->asic_funcs->read_pte(hdev,
+						hops->hop_info[0].hop_pte_addr);
+	if (hops->hop_info[0].hop_pte_val == U64_MAX)
+		return -EFAULT;
+
+	for (i = 1 ; i < mmu_prop->num_hops ; i++) {
+		hops->hop_info[i].hop_addr =
+			hl_mmu_get_next_hop_addr(ctx, hops->hop_info[i - 1].hop_pte_val);
+		if (hops->hop_info[i].hop_addr == ULLONG_MAX)
+			return -EFAULT;
+
+		hops->hop_info[i].hop_pte_addr =
+				hl_mmu_get_hop_pte_phys_addr(ctx, mmu_prop, i,
+						hops->hop_info[i].hop_addr,
+						hops->scrambled_vaddr);
+		if (hops->hop_info[i].hop_pte_addr == U64_MAX)
+			return -EFAULT;
+
+		hops->hop_info[i].hop_pte_val =
+				hdev->asic_funcs->read_pte(hdev,
+					hops->hop_info[i].hop_pte_addr);
+
+		if (!(hops->hop_info[i].hop_pte_val & PAGE_PRESENT_MASK))
+			return -EFAULT;
+
+		if (hops->hop_info[i].hop_pte_val & mmu_prop->last_mask)
+			break;
+	}
+
+	/* if passed over all hops then no last hop was found */
+	if (i == mmu_prop->num_hops)
+		return -EFAULT;
+
+	if (!(hops->hop_info[i].hop_pte_val & PAGE_PRESENT_MASK))
+		return -EFAULT;
+
+	if (hops->scrambled_vaddr != virt_addr)
+		hops->unscrambled_paddr = hdev->asic_funcs->descramble_addr
+				(hdev, hops->hop_info[i].hop_pte_val);
+	else
+		hops->unscrambled_paddr = hops->hop_info[i].hop_pte_val;
+
+	hops->used_hops = i + 1;
+
+	return 0;
+}
+
+/*
+ * hl_mmu_v2_prepare - prepare mmu_if for working with mmu v2
+ *
+ * @hdev: pointer to the device structure
+ * @mmu_if: pointer to the mmu interface structure
+ */
+void hl_mmu_v2_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu)
+{
+	mmu->init = hl_mmu_dr_init;
+	mmu->fini = hl_mmu_dr_fini;
+	mmu->ctx_init = hl_mmu_v2_ctx_init;
+	mmu->ctx_fini = hl_mmu_v2_ctx_fini;
+	mmu->map = hl_mmu_v2_map;
+	mmu->unmap = hl_mmu_v2_unmap;
+	mmu->flush = hl_mmu_dr_flush;
+	mmu->swap_out = hl_mmu_v2_swap_out;
+	mmu->swap_in = hl_mmu_v2_swap_in;
+	mmu->get_tlb_info = hl_mmu_v2_get_tlb_info;
+}
diff --git a/drivers/accel/habanalabs/gaudi/gaudi.c b/drivers/accel/habanalabs/gaudi/gaudi.c
index 53292d4c15c865..dde3839fe0e070 100644
--- a/drivers/accel/habanalabs/gaudi/gaudi.c
+++ b/drivers/accel/habanalabs/gaudi/gaudi.c
@@ -649,6 +649,7 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev)
 	prop->dmmu.start_addr = (VA_HOST_SPACE_START + VA_HOST_SPACE_SIZE / 2);
 	prop->dmmu.end_addr = VA_HOST_SPACE_END;
 	prop->dmmu.page_size = PAGE_SIZE_2MB;
+	prop->dmmu.pgt_size = prop->mmu_pgt_size;
 
 	prop->cfg_size = CFG_SIZE;
 	prop->max_asid = MAX_ASID;
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index fd01525b1ea204..5863c904913433 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -2308,11 +2308,26 @@ static int set_number_of_functional_hbms(struct hl_device *hdev)
 	return 0;
 }
 
+static bool gaudi2_is_edma_queue_id(u32 queue_id)
+{
+
+	switch (queue_id) {
+	case GAUDI2_QUEUE_ID_DCORE0_EDMA_0_0...GAUDI2_QUEUE_ID_DCORE0_EDMA_1_3:
+	case GAUDI2_QUEUE_ID_DCORE1_EDMA_0_0...GAUDI2_QUEUE_ID_DCORE1_EDMA_1_3:
+	case GAUDI2_QUEUE_ID_DCORE2_EDMA_0_0...GAUDI2_QUEUE_ID_DCORE2_EDMA_1_3:
+	case GAUDI2_QUEUE_ID_DCORE3_EDMA_0_0...GAUDI2_QUEUE_ID_DCORE3_EDMA_1_3:
+		return true;
+	default:
+		return false;
+	}
+}
+
 static int gaudi2_set_dram_properties(struct hl_device *hdev)
 {
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
-	u32 basic_hbm_page_size;
-	int rc;
+	u64 hbm_drv_base_offset = 0, edma_pq_base_addr;
+	u32 basic_hbm_page_size, edma_idx = 0;
+	int rc, i;
 
 	rc = set_number_of_functional_hbms(hdev);
 	if (rc)
@@ -2356,9 +2371,35 @@ static int gaudi2_set_dram_properties(struct hl_device *hdev)
 	prop->dmmu.start_addr = prop->dram_base_address +
 			(prop->dram_page_size *
 				DIV_ROUND_UP_SECTOR_T(prop->dram_size, prop->dram_page_size));
-
 	prop->dmmu.end_addr = prop->dmmu.start_addr + prop->dram_page_size *
 			div_u64((VA_HBM_SPACE_END - prop->dmmu.start_addr), prop->dmmu.page_size);
+	/*
+	 * Driver can't share an (48MB) HBM page with the F/W in order to prevent FW to block
+	 * the driver part by range register, so it must start at the next (48MB) page
+	 */
+	hbm_drv_base_offset = roundup(CPU_FW_IMAGE_SIZE, prop->num_functional_hbms * SZ_8M);
+
+	/*
+	 * The NIC driver section size and the HMMU page tables section in the HBM needs
+	 * to be the remaining size in the first dram page after taking into
+	 * account the F/W image size
+	 */
+
+	/* Reserve region in HBM for HMMU page tables */
+	prop->mmu_pgt_addr = DRAM_PHYS_BASE + hbm_drv_base_offset +
+				((prop->dram_page_size - hbm_drv_base_offset) -
+				(HMMU_PAGE_TABLES_SIZE + EDMA_PQS_SIZE + EDMA_SCRATCHPAD_SIZE));
+
+	/* Set EDMA PQs HBM addresses */
+	edma_pq_base_addr = prop->mmu_pgt_addr + HMMU_PAGE_TABLES_SIZE;
+
+	for (i = 0 ; i < GAUDI2_QUEUE_ID_CPU_PQ ; i++) {
+		if (gaudi2_is_edma_queue_id(i)) {
+			prop->hw_queues_props[i].q_dram_bd_address = edma_pq_base_addr +
+							(edma_idx * HL_QUEUE_SIZE_IN_BYTES);
+			edma_idx++;
+		}
+	}
 
 	return 0;
 }
@@ -2368,7 +2409,7 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev)
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
 	struct hw_queue_properties *q_props;
 	u32 num_sync_stream_queues = 0;
-	int i;
+	int i, rc;
 
 	prop->max_queues = GAUDI2_QUEUE_ID_SIZE;
 	prop->hw_queues_props = kcalloc(prop->max_queues, sizeof(struct hw_queue_properties),
@@ -2391,6 +2432,9 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev)
 		}
 
 		q_props[i].cb_alloc_flags = CB_ALLOC_USER;
+
+		if (gaudi2_is_edma_queue_id(i))
+			q_props[i].dram_bd = 1;
 	}
 
 	q_props[GAUDI2_QUEUE_ID_CPU_PQ].type = QUEUE_TYPE_CPU;
@@ -2419,40 +2463,39 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev)
 
 	prop->rotator_enabled_mask = BIT(NUM_OF_ROT) - 1;
 
-	if (hdev->pldm)
-		prop->mmu_pgt_size = 0x800000; /* 8MB */
-	else
-		prop->mmu_pgt_size = MMU_PAGE_TABLES_INITIAL_SIZE;
+	prop->max_asid = 2;
 
+	prop->dmmu.pgt_size = HMMU_PAGE_TABLES_SIZE;
 	prop->mmu_pte_size = HL_PTE_SIZE;
 	prop->mmu_hop_table_size = HOP_TABLE_SIZE_512_PTE;
-	prop->mmu_hop0_tables_total_size = HOP0_512_PTE_TABLES_TOTAL_SIZE;
+	prop->mmu_hop0_tables_total_size = HOP_TABLE_SIZE_512_PTE * prop->max_asid;
 
 	prop->dmmu.hop_shifts[MMU_HOP0] = DHOP0_SHIFT;
 	prop->dmmu.hop_shifts[MMU_HOP1] = DHOP1_SHIFT;
 	prop->dmmu.hop_shifts[MMU_HOP2] = DHOP2_SHIFT;
 	prop->dmmu.hop_shifts[MMU_HOP3] = DHOP3_SHIFT;
-	prop->dmmu.hop_shifts[MMU_HOP4] = DHOP4_SHIFT;
 	prop->dmmu.hop_masks[MMU_HOP0] = DHOP0_MASK;
 	prop->dmmu.hop_masks[MMU_HOP1] = DHOP1_MASK;
 	prop->dmmu.hop_masks[MMU_HOP2] = DHOP2_MASK;
 	prop->dmmu.hop_masks[MMU_HOP3] = DHOP3_MASK;
-	prop->dmmu.hop_masks[MMU_HOP4] = DHOP4_MASK;
 	prop->dmmu.page_size = PAGE_SIZE_1GB;
-	prop->dmmu.num_hops = MMU_ARCH_6_HOPS;
+	prop->dmmu.num_hops = MMU_ARCH_4_HOPS;
 	prop->dmmu.last_mask = LAST_MASK;
-	prop->dmmu.host_resident = 1;
+	prop->dmmu.host_resident = 0;
 	prop->dmmu.hop_table_size = prop->mmu_hop_table_size;
 	prop->dmmu.hop0_tables_total_size = prop->mmu_hop0_tables_total_size;
 
-	/*
-	 * this is done in order to be able to validate FW descriptor (i.e. validating that
-	 * the addresses and allocated space for FW image does not cross memory bounds).
-	 * for this reason we set the DRAM size to the minimum possible and later it will
-	 * be modified according to what reported in the cpucp info packet
+	/* As we need to set the pgt address in dram for HMMU init so we cannot
+	 * wait to the fw cpucp info to set the dram props as mmu init comes before
+	 * hw init
 	 */
-	prop->dram_size = (GAUDI2_HBM_NUM - 1) * SZ_16G;
+	rc = hdev->asic_funcs->set_dram_properties(hdev);
+	if (rc)
+		goto free_qprops;
+
+	prop->mmu_pgt_size = PMMU_PAGE_TABLES_SIZE;
 
+	prop->pmmu.pgt_size = prop->mmu_pgt_size;
 	hdev->pmmu_huge_range = true;
 	prop->pmmu.host_resident = 1;
 	prop->pmmu.num_hops = MMU_ARCH_6_HOPS;
@@ -2516,7 +2559,6 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev)
 	prop->max_num_of_engines = GAUDI2_ENGINE_ID_SIZE;
 	prop->num_engine_cores = CPU_ID_MAX;
 	prop->cfg_size = CFG_SIZE;
-	prop->max_asid = MAX_ASID;
 	prop->num_of_events = GAUDI2_EVENT_SIZE;
 
 	prop->supports_engine_modes = true;
@@ -2560,6 +2602,10 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev)
 	prop->hbw_flush_reg = mmPCIE_WRAP_SPECIAL_GLBL_SPARE_0;
 
 	return 0;
+
+free_qprops:
+	kfree(prop->hw_queues_props);
+	return rc;
 }
 
 static int gaudi2_pci_bars_map(struct hl_device *hdev)
@@ -3033,6 +3079,25 @@ static int gaudi2_fetch_psoc_frequency(struct hl_device *hdev)
 	return 0;
 }
 
+static int gaudi2_mmu_clear_pgt_range(struct hl_device *hdev)
+{
+	struct gaudi2_device *gaudi2 = hdev->asic_specific;
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	int rc;
+
+	if (!(gaudi2->hw_cap_initialized & HW_CAP_MMU_MASK))
+		return 0;
+
+	if (prop->dmmu.host_resident)
+		return 0;
+
+	rc = gaudi2_memset_device_memory(hdev, prop->mmu_pgt_addr, prop->dmmu.pgt_size, 0);
+	if (rc)
+		dev_err(hdev->dev, "Failed to clear mmu pgt");
+
+	return rc;
+}
+
 static int gaudi2_early_init(struct hl_device *hdev)
 {
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
@@ -3258,6 +3323,12 @@ static int gaudi2_late_init(struct hl_device *hdev)
 		goto disable_pci_access;
 	}
 
+	rc = gaudi2_mmu_clear_pgt_range(hdev);
+	if (rc) {
+		dev_err(hdev->dev, "Failed to clear MMU page tables range\n");
+		goto disable_pci_access;
+	}
+
 	gaudi2_init_arcs(hdev);
 
 	rc = gaudi2_scrub_arcs_dccm(hdev);
@@ -3697,13 +3768,7 @@ static int gaudi2_sw_init(struct hl_device *hdev)
 
 	spin_lock_init(&gaudi2->hw_queues_lock);
 
-	gaudi2->scratchpad_kernel_address = hl_asic_dma_alloc_coherent(hdev, PAGE_SIZE,
-							&gaudi2->scratchpad_bus_address,
-							GFP_KERNEL | __GFP_ZERO);
-	if (!gaudi2->scratchpad_kernel_address) {
-		rc = -ENOMEM;
-		goto free_virt_msix_db_mem;
-	}
+	gaudi2->scratchpad_bus_address = prop->mmu_pgt_addr + HMMU_PAGE_TABLES_SIZE + EDMA_PQS_SIZE;
 
 	gaudi2_user_mapped_blocks_init(hdev);
 
@@ -3727,7 +3792,7 @@ static int gaudi2_sw_init(struct hl_device *hdev)
 
 	rc = gaudi2_special_blocks_iterator_config(hdev);
 	if (rc)
-		goto free_scratchpad_mem;
+		goto free_virt_msix_db_mem;
 
 	rc = gaudi2_test_queues_msgs_alloc(hdev);
 	if (rc)
@@ -3737,9 +3802,6 @@ static int gaudi2_sw_init(struct hl_device *hdev)
 
 special_blocks_free:
 	gaudi2_special_blocks_iterator_free(hdev);
-free_scratchpad_mem:
-	hl_asic_dma_free_coherent(hdev, PAGE_SIZE, gaudi2->scratchpad_kernel_address,
-				  gaudi2->scratchpad_bus_address);
 free_virt_msix_db_mem:
 	hl_cpu_accessible_dma_pool_free(hdev, prop->pmmu.page_size, gaudi2->virt_msix_db_cpu_addr);
 free_cpu_accessible_dma_pool:
@@ -3770,9 +3832,6 @@ static int gaudi2_sw_fini(struct hl_device *hdev)
 	hl_asic_dma_free_coherent(hdev, HL_CPU_ACCESSIBLE_MEM_SIZE, hdev->cpu_accessible_dma_mem,
 						hdev->cpu_accessible_dma_address);
 
-	hl_asic_dma_free_coherent(hdev, PAGE_SIZE, gaudi2->scratchpad_kernel_address,
-					gaudi2->scratchpad_bus_address);
-
 	dma_pool_destroy(hdev->dma_pool);
 
 	kfree(gaudi2);
@@ -4962,10 +5021,17 @@ static void gaudi2_init_qman_pq(struct hl_device *hdev, u32 reg_base,
 		q = &hdev->kernel_queues[queue_id_base + pq_id];
 		pq_offset = pq_id * 4;
 
-		WREG32(reg_base + QM_PQ_BASE_LO_0_OFFSET + pq_offset,
-				lower_32_bits(q->bus_address));
-		WREG32(reg_base + QM_PQ_BASE_HI_0_OFFSET + pq_offset,
-				upper_32_bits(q->bus_address));
+		if (q->dram_bd) {
+			WREG32(reg_base + QM_PQ_BASE_LO_0_OFFSET + pq_offset,
+					lower_32_bits(q->pq_dram_address));
+			WREG32(reg_base + QM_PQ_BASE_HI_0_OFFSET + pq_offset,
+					upper_32_bits(q->pq_dram_address));
+		} else {
+			WREG32(reg_base + QM_PQ_BASE_LO_0_OFFSET + pq_offset,
+					lower_32_bits(q->bus_address));
+			WREG32(reg_base + QM_PQ_BASE_HI_0_OFFSET + pq_offset,
+					upper_32_bits(q->bus_address));
+		}
 		WREG32(reg_base + QM_PQ_SIZE_0_OFFSET + pq_offset, ilog2(HL_QUEUE_LENGTH));
 		WREG32(reg_base + QM_PQ_PI_0_OFFSET + pq_offset, 0);
 		WREG32(reg_base + QM_PQ_CI_0_OFFSET + pq_offset, 0);
@@ -5852,7 +5918,8 @@ static int gaudi2_mmu_invalidate_cache_range(struct hl_device *hdev, bool is_har
 	return rc;
 }
 
-static int gaudi2_mmu_update_hop0_addr(struct hl_device *hdev, u32 stlb_base)
+static int gaudi2_mmu_update_hop0_addr(struct hl_device *hdev, u32 stlb_base,
+									bool host_resident_pgt)
 {
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
 	u64 hop0_addr;
@@ -5864,7 +5931,11 @@ static int gaudi2_mmu_update_hop0_addr(struct hl_device *hdev, u32 stlb_base)
 		max_asid = min((u32) 8, max_asid);
 
 	for (asid = 0 ; asid < max_asid ; asid++) {
-		hop0_addr = hdev->mmu_priv.hr.mmu_asid_hop0[asid].phys_addr;
+		if (host_resident_pgt)
+			hop0_addr = hdev->mmu_priv.hr.mmu_asid_hop0[asid].phys_addr;
+		else
+			hop0_addr = prop->mmu_pgt_addr + (asid * prop->mmu_hop_table_size);
+
 		rc = gaudi2_mmu_update_asid_hop0_addr(hdev, stlb_base, asid, hop0_addr);
 		if (rc) {
 			dev_err(hdev->dev, "failed to set hop0 addr for asid %d\n", asid);
@@ -5875,7 +5946,8 @@ static int gaudi2_mmu_update_hop0_addr(struct hl_device *hdev, u32 stlb_base)
 	return 0;
 }
 
-static int gaudi2_mmu_init_common(struct hl_device *hdev, u32 mmu_base, u32 stlb_base)
+static int gaudi2_mmu_init_common(struct hl_device *hdev, u32 mmu_base, u32 stlb_base,
+								bool host_resident_pgt)
 {
 	u32 status, timeout_usec;
 	int rc;
@@ -5898,7 +5970,7 @@ static int gaudi2_mmu_init_common(struct hl_device *hdev, u32 mmu_base, u32 stlb
 	if (rc)
 		dev_notice_ratelimited(hdev->dev, "Timeout when waiting for MMU SRAM init\n");
 
-	rc = gaudi2_mmu_update_hop0_addr(hdev, stlb_base);
+	rc = gaudi2_mmu_update_hop0_addr(hdev, stlb_base, host_resident_pgt);
 	if (rc)
 		return rc;
 
@@ -5922,6 +5994,7 @@ static int gaudi2_mmu_init_common(struct hl_device *hdev, u32 mmu_base, u32 stlb
 
 static int gaudi2_pci_mmu_init(struct hl_device *hdev)
 {
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
 	struct gaudi2_device *gaudi2 = hdev->asic_specific;
 	u32 mmu_base, stlb_base;
 	int rc;
@@ -5961,7 +6034,7 @@ static int gaudi2_pci_mmu_init(struct hl_device *hdev)
 
 	WREG32(mmu_base + MMU_SPI_SEI_MASK_OFFSET, GAUDI2_PMMU_SPI_SEI_ENABLE_MASK);
 
-	rc = gaudi2_mmu_init_common(hdev, mmu_base, stlb_base);
+	rc = gaudi2_mmu_init_common(hdev, mmu_base, stlb_base, prop->pmmu.host_resident);
 	if (rc)
 		return rc;
 
@@ -6013,7 +6086,7 @@ static int gaudi2_dcore_hmmu_init(struct hl_device *hdev, int dcore_id,
 
 	WREG32(mmu_base + MMU_SPI_SEI_MASK_OFFSET, GAUDI2_HMMU_SPI_SEI_ENABLE_MASK);
 
-	rc = gaudi2_mmu_init_common(hdev, mmu_base, stlb_base);
+	rc = gaudi2_mmu_init_common(hdev, mmu_base, stlb_base, prop->dmmu.host_resident);
 	if (rc)
 		return rc;
 
@@ -7051,7 +7124,7 @@ static int gaudi2_test_queues(struct hl_device *hdev)
 
 	/* send test message on all enabled Qs */
 	for (i = GAUDI2_QUEUE_ID_PDMA_0_0 ; i < GAUDI2_QUEUE_ID_CPU_PQ; i++) {
-		if (!gaudi2_is_queue_enabled(hdev, i))
+		if (!gaudi2_is_queue_enabled(hdev, i) || gaudi2_is_edma_queue_id(i))
 			continue;
 
 		msg_info = &gaudi2->queues_test_info[i - GAUDI2_QUEUE_ID_PDMA_0_0];
@@ -7068,7 +7141,7 @@ static int gaudi2_test_queues(struct hl_device *hdev)
 
 	/* verify that all messages were processed */
 	for (i = GAUDI2_QUEUE_ID_PDMA_0_0 ; i < GAUDI2_QUEUE_ID_CPU_PQ; i++) {
-		if (!gaudi2_is_queue_enabled(hdev, i))
+		if (!gaudi2_is_queue_enabled(hdev, i) || gaudi2_is_edma_queue_id(i))
 			continue;
 
 		rc = gaudi2_test_queue_wait_completion(hdev, i, sob_val);
@@ -8988,7 +9061,6 @@ static void gaudi2_handle_page_error(struct hl_device *hdev, u64 mmu_base, bool
 	if (is_pmmu) {
 		dev_err_ratelimited(hdev->dev, "PMMU page fault on va 0x%llx\n", addr);
 	} else {
-
 		addr = gaudi2_mmu_descramble_addr(hdev, addr);
 		addr &= HW_UNSCRAMBLED_BITS_MASK;
 		dev_err_ratelimited(hdev->dev, "HMMU page fault on va range 0x%llx - 0x%llx\n",
@@ -10255,11 +10327,11 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 }
 
 static int gaudi2_memset_memory_chunk_using_edma_qm(struct hl_device *hdev,
-			struct packet_lin_dma *lin_dma_pkt, dma_addr_t pkt_dma_addr,
-			u32 hw_queue_id, u32 size, u64 addr, u32 val)
+			struct packet_lin_dma *lin_dma_pkt,
+			u64 phys_addr, u32 hw_queue_id, u32 size, u64 addr, u32 val)
 {
 	u32 ctl, pkt_size;
-	int rc = 0;
+	int rc = 0, i;
 
 	ctl = FIELD_PREP(GAUDI2_PKT_CTL_OPCODE_MASK, PACKET_LIN_DMA);
 	ctl |= FIELD_PREP(GAUDI2_PKT_LIN_DMA_CTL_MEMSET_MASK, 1);
@@ -10273,7 +10345,12 @@ static int gaudi2_memset_memory_chunk_using_edma_qm(struct hl_device *hdev,
 
 	pkt_size = sizeof(struct packet_lin_dma);
 
-	rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, pkt_size, pkt_dma_addr);
+	for (i = 0; i < 3; i++)
+		rc = hdev->asic_funcs->access_dev_mem(hdev, PCI_REGION_DRAM,
+				phys_addr + (i * sizeof(u64)),
+				((u64 *)(lin_dma_pkt)) + i, DEBUGFS_WRITE64);
+
+	rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, pkt_size, phys_addr);
 	if (rc)
 		dev_err(hdev->dev, "Failed to send lin dma packet to H/W queue %d\n",
 				hw_queue_id);
@@ -10288,12 +10365,11 @@ static int gaudi2_memset_device_memory(struct hl_device *hdev, u64 addr, u64 siz
 					GAUDI2_QUEUE_ID_DCORE2_EDMA_0_0,
 					GAUDI2_QUEUE_ID_DCORE3_EDMA_0_0};
 	u32 chunk_size, dcore, edma_idx, sob_offset, sob_addr, comp_val,
-		old_mmubp, mmubp, num_of_pkts, busy, pkt_size;
+		old_mmubp, mmubp, num_of_pkts, busy, pkt_size, cb_len;
 	u64 comp_addr, cur_addr = addr, end_addr = addr + size;
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	int rc = 0, dma_num = 0, i;
 	void *lin_dma_pkts_arr;
-	dma_addr_t pkt_dma_addr;
-	int rc = 0, dma_num = 0;
 
 	if (prop->edma_enabled_mask == 0) {
 		dev_info(hdev->dev, "non of the EDMA engines is enabled - skip dram scrubbing\n");
@@ -10311,9 +10387,19 @@ static int gaudi2_memset_device_memory(struct hl_device *hdev, u64 addr, u64 siz
 	/* Calculate how many lin dma pkts we'll need */
 	num_of_pkts = div64_u64(round_up(size, SZ_2G), SZ_2G);
 	pkt_size = sizeof(struct packet_lin_dma);
+	cb_len = pkt_size * num_of_pkts;
+
+	/*
+	 * if we're not scrubing HMMU or NIC reserved sections in hbm,
+	 * then it the scrubing of the user section, as we use the start of the user section
+	 * to store the CB of the EDMA QM, so shift the start address of the scrubbing accordingly
+	 * and scrub the CB section before leaving this function.
+	 */
+	if ((addr >= prop->dram_user_base_address) &&
+				(addr < prop->dram_user_base_address + cb_len))
+		cur_addr += (prop->dram_user_base_address + cb_len) - addr;
 
-	lin_dma_pkts_arr = hl_asic_dma_alloc_coherent(hdev, pkt_size * num_of_pkts,
-					&pkt_dma_addr, GFP_KERNEL);
+	lin_dma_pkts_arr = kvcalloc(num_of_pkts, pkt_size, GFP_KERNEL);
 	if (!lin_dma_pkts_arr)
 		return -ENOMEM;
 
@@ -10359,7 +10445,7 @@ static int gaudi2_memset_device_memory(struct hl_device *hdev, u64 addr, u64 siz
 
 				rc = gaudi2_memset_memory_chunk_using_edma_qm(hdev,
 					(struct packet_lin_dma *)lin_dma_pkts_arr + dma_num,
-					pkt_dma_addr + dma_num * pkt_size,
+					prop->dram_user_base_address + (dma_num * pkt_size),
 					edma_queues_id[dcore] + edma_idx * 4,
 					chunk_size, cur_addr, val);
 				if (rc)
@@ -10368,14 +10454,16 @@ static int gaudi2_memset_device_memory(struct hl_device *hdev, u64 addr, u64 siz
 				dma_num++;
 				cur_addr += chunk_size;
 				if (cur_addr == end_addr)
-					break;
+					goto edma_wait;
 			}
 		}
 	}
 
+edma_wait:
 	rc = hl_poll_timeout(hdev, sob_addr, busy, (busy == dma_num), 1000, 1000000);
 	if (rc) {
-		dev_err(hdev->dev, "DMA Timeout during HBM scrubbing\n");
+		dev_err(hdev->dev, "DMA Timeout during HBM scrubbing(sob: 0x%x, dma_num: 0x%x)\n",
+						busy, dma_num);
 		goto end;
 	}
 end:
@@ -10396,8 +10484,16 @@ static int gaudi2_memset_device_memory(struct hl_device *hdev, u64 addr, u64 siz
 		}
 	}
 
+	memset(lin_dma_pkts_arr, 0, sizeof(u64));
+
+	/* Zero the HBM area where we copied the CB */
+	for (i = 0; i < cb_len / sizeof(u64); i += sizeof(u64))
+		rc = hdev->asic_funcs->access_dev_mem(hdev, PCI_REGION_DRAM,
+			prop->dram_user_base_address + i,
+				(u64 *)(lin_dma_pkts_arr), DEBUGFS_WRITE64);
 	WREG32(sob_addr, 0);
-	hl_asic_dma_free_coherent(hdev, pkt_size * num_of_pkts, lin_dma_pkts_arr, pkt_dma_addr);
+
+	kfree(lin_dma_pkts_arr);
 
 	return rc;
 }
@@ -11455,7 +11551,7 @@ static int gaudi2_mmu_get_real_page_size(struct hl_device *hdev, struct hl_mmu_p
 	return 0;
 
 page_size_err:
-	dev_err(hdev->dev, "page size of %u is not %uKB aligned, can't map\n",
+	dev_err(hdev->dev, "page size of 0x%X is not 0x%X aligned, can't map\n",
 							page_size, mmu_prop->page_size >> 10);
 	return -EFAULT;
 }
@@ -11475,6 +11571,29 @@ int gaudi2_send_device_activity(struct hl_device *hdev, bool open)
 	return hl_fw_send_device_activity(hdev, open);
 }
 
+static u64 gaudi2_read_pte(struct hl_device *hdev, u64 addr)
+{
+	struct gaudi2_device *gaudi2 = hdev->asic_specific;
+	u64 val;
+
+	if (hdev->reset_info.hard_reset_pending)
+		return U64_MAX;
+
+	val = readq(hdev->pcie_bar[DRAM_BAR_ID] + (addr - gaudi2->dram_bar_cur_addr));
+
+	return val;
+}
+
+static void gaudi2_write_pte(struct hl_device *hdev, u64 addr, u64 val)
+{
+	struct gaudi2_device *gaudi2 = hdev->asic_specific;
+
+	if (hdev->reset_info.hard_reset_pending)
+		return;
+
+	writeq(val, hdev->pcie_bar[DRAM_BAR_ID] + (addr - gaudi2->dram_bar_cur_addr));
+}
+
 static const struct hl_asic_funcs gaudi2_funcs = {
 	.early_init = gaudi2_early_init,
 	.early_fini = gaudi2_early_fini,
@@ -11511,8 +11630,8 @@ static const struct hl_asic_funcs gaudi2_funcs = {
 	.add_device_attr = gaudi2_add_device_attr,
 	.handle_eqe = gaudi2_handle_eqe,
 	.get_events_stat = gaudi2_get_events_stat,
-	.read_pte = NULL,
-	.write_pte = NULL,
+	.read_pte = gaudi2_read_pte,
+	.write_pte = gaudi2_write_pte,
 	.mmu_invalidate_cache = gaudi2_mmu_invalidate_cache,
 	.mmu_invalidate_cache_range = gaudi2_mmu_invalidate_cache_range,
 	.mmu_prefetch_cache_range = NULL,
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2P.h b/drivers/accel/habanalabs/gaudi2/gaudi2P.h
index 9b9eef0d97d6e8..bc508c9cee5c50 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2P.h
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2P.h
@@ -19,8 +19,6 @@
 #define GAUDI2_LINUX_FW_FILE	"habanalabs/gaudi2/gaudi2-fit.itb"
 #define GAUDI2_BOOT_FIT_FILE	"habanalabs/gaudi2/gaudi2-boot-fit.itb"
 
-#define MMU_PAGE_TABLES_INITIAL_SIZE	0x10000000	/* 256MB */
-
 #define GAUDI2_CPU_TIMEOUT_USEC		30000000	/* 30s */
 
 #define NUMBER_OF_PDMA_QUEUES		2
@@ -109,13 +107,11 @@
 /* DRAM Memory Map */
 
 #define CPU_FW_IMAGE_SIZE			0x10000000	/* 256MB */
-
-/* This define should be used only when working in a debug mode without dram.
- * When working with dram, the driver size will be calculated dynamically.
- */
-#define NIC_DEFAULT_DRV_SIZE			0x20000000	/* 512MB */
-
 #define CPU_FW_IMAGE_ADDR			DRAM_PHYS_BASE
+#define PMMU_PAGE_TABLES_SIZE			0x10000000      /* 256MB */
+#define EDMA_PQS_SIZE				SZ_2M
+#define EDMA_SCRATCHPAD_SIZE			SZ_1M
+#define HMMU_PAGE_TABLES_SIZE			SZ_1M
 
 #define NIC_NUMBER_OF_PORTS			NIC_NUMBER_OF_ENGINES
 
diff --git a/drivers/accel/habanalabs/include/hw_ip/mmu/mmu_general.h b/drivers/accel/habanalabs/include/hw_ip/mmu/mmu_general.h
index d408feecd4834d..b4a5e95be35421 100644
--- a/drivers/accel/habanalabs/include/hw_ip/mmu/mmu_general.h
+++ b/drivers/accel/habanalabs/include/hw_ip/mmu/mmu_general.h
@@ -26,6 +26,8 @@
 #define LAST_MASK			0x0000000000800ull
 #define FLAGS_MASK			0x0000000000FFFull
 
+#define MMU_ARCH_3_HOPS			3
+#define MMU_ARCH_4_HOPS			4
 #define MMU_ARCH_5_HOPS			5
 #define MMU_ARCH_6_HOPS			6
 

From 8aba7a26817a7a87b5b852faf93838baaca20b06 Mon Sep 17 00:00:00 2001
From: Malkoot Khan <engr.mkhan1990@gmail.com>
Date: Thu, 28 Dec 2023 21:08:58 +0000
Subject: [PATCH 172/707] accel/habanalabs: Remove unnecessary braces from if
 statement

The coding style in the Linux kernel prefers not to use
braces for single-statement if conditions.
This patch removes the unnecessary braces from an if statement
in the file drivers/accel/habanalabs/common/command_submission.c,
which also resolves a coding style warning.

Signed-off-by: Malkoot Khan <engr.mkhan1990@gmail.com>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/command_submission.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/accel/habanalabs/common/command_submission.c b/drivers/accel/habanalabs/common/command_submission.c
index 3aa6eeef443b41..39e23d625a3cbb 100644
--- a/drivers/accel/habanalabs/common/command_submission.c
+++ b/drivers/accel/habanalabs/common/command_submission.c
@@ -1360,9 +1360,8 @@ static int hl_cs_sanity_checks(struct hl_fpriv *hpriv, union hl_cs_args *args)
 			return -EINVAL;
 		}
 
-	if (!hl_device_operational(hdev, &status)) {
+	if (!hl_device_operational(hdev, &status))
 		return -EBUSY;
-	}
 
 	if ((args->in.cs_flags & HL_CS_FLAGS_STAGED_SUBMISSION) &&
 			!hdev->supports_staged_submission) {

From 149972a8b935c9c82cd5517d49c80b3b9029bd28 Mon Sep 17 00:00:00 2001
From: Dani Liberman <dliberman@habana.ai>
Date: Mon, 1 Jan 2024 22:37:43 +0200
Subject: [PATCH 173/707] accel/habanalabs: remove call to deprecated function

In newer kernel versions, irq_set_affinity_hint() is deprecated.
Instead, use the newer version which is irq_set_affinity_and_hint().

Signed-off-by: Dani Liberman <dliberman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/device.c | 2 +-
 drivers/accel/habanalabs/gaudi2/gaudi2.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index cf004baf5e6213..3b9e8a21d7df8b 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -2833,6 +2833,6 @@ void hl_set_irq_affinity(struct hl_device *hdev, int irq)
 		return;
 	}
 
-	if (irq_set_affinity_hint(irq, &hdev->irq_affinity_mask))
+	if (irq_set_affinity_and_hint(irq, &hdev->irq_affinity_mask))
 		dev_err(hdev->dev, "Failed setting irq %d affinity\n", irq);
 }
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 5863c904913433..05e2170c815e6b 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -4395,7 +4395,7 @@ static int gaudi2_enable_msix(struct hl_device *hdev)
 			i < GAUDI2_IRQ_NUM_USER_FIRST + user_irq_init_cnt ; i++, j++) {
 
 		irq = pci_irq_vector(hdev->pdev, i);
-		irq_set_affinity_hint(irq, NULL);
+		irq_set_affinity_and_hint(irq, NULL);
 		free_irq(irq, &hdev->user_interrupt[j]);
 	}
 	irq = pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_UNEXPECTED_ERROR);
@@ -4476,7 +4476,7 @@ static void gaudi2_disable_msix(struct hl_device *hdev)
 			k < hdev->asic_prop.user_interrupt_count ; i++, j++, k++) {
 
 		irq = pci_irq_vector(hdev->pdev, i);
-		irq_set_affinity_hint(irq, NULL);
+		irq_set_affinity_and_hint(irq, NULL);
 		free_irq(irq, &hdev->user_interrupt[j]);
 	}
 

From 41e9304988a1eb5a7b3c02f210b76d50488f0731 Mon Sep 17 00:00:00 2001
From: Tomer Tayar <ttayar@habana.ai>
Date: Tue, 2 Jan 2024 16:51:09 +0200
Subject: [PATCH 174/707] accel/habanalabs/gaudi2: fail memory memset when
 failing to copy QM packet to device

gaudi2_memset_memory_chunk_using_edma_qm() calls the access_dev_mem()
ASIC function, but ignores its return value.
Add this missing check.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/gaudi2/gaudi2.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 05e2170c815e6b..1f061209ae2157 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -10345,14 +10345,20 @@ static int gaudi2_memset_memory_chunk_using_edma_qm(struct hl_device *hdev,
 
 	pkt_size = sizeof(struct packet_lin_dma);
 
-	for (i = 0; i < 3; i++)
+	for (i = 0; i < 3; i++) {
 		rc = hdev->asic_funcs->access_dev_mem(hdev, PCI_REGION_DRAM,
 				phys_addr + (i * sizeof(u64)),
 				((u64 *)(lin_dma_pkt)) + i, DEBUGFS_WRITE64);
+		if (rc) {
+			dev_err(hdev->dev, "Failed to copy lin_dma packet to HBM (%#llx)\n",
+				phys_addr);
+			return rc;
+		}
+	}
 
 	rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, pkt_size, phys_addr);
 	if (rc)
-		dev_err(hdev->dev, "Failed to send lin dma packet to H/W queue %d\n",
+		dev_err(hdev->dev, "Failed to send lin_dma packet to H/W queue %d\n",
 				hw_queue_id);
 
 	return rc;

From fedec9564504be39597a5ac215932e8cc017a364 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.i.king@intel.com>
Date: Sat, 6 Jan 2024 12:42:13 +0000
Subject: [PATCH 175/707] accel/habanalabs/goya: remove redundant assignment to
 pointer 'input'

The pointer input is assigned a value that is not read, it is
being re-assigned again later with the same value. Resolve this
by moving the declaration to input into the if block.

Cleans up clang scan build warning:
warning: Value stored to 'input' during its initialization is never
read [deadcode.DeadStores]

Signed-off-by: Colin Ian King <colin.i.king@intel.com>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/goya/goya_coresight.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/accel/habanalabs/goya/goya_coresight.c b/drivers/accel/habanalabs/goya/goya_coresight.c
index 41cae5fd843b88..3827ea4c02f740 100644
--- a/drivers/accel/habanalabs/goya/goya_coresight.c
+++ b/drivers/accel/habanalabs/goya/goya_coresight.c
@@ -576,7 +576,6 @@ static int goya_config_spmu(struct hl_device *hdev,
 		struct hl_debug_params *params)
 {
 	u64 base_reg;
-	struct hl_debug_params_spmu *input = params->input;
 	u64 *output;
 	u32 output_arr_len;
 	u32 events_num;
@@ -592,7 +591,7 @@ static int goya_config_spmu(struct hl_device *hdev,
 	base_reg = debug_spmu_regs[params->reg_idx] - CFG_BASE;
 
 	if (params->enable) {
-		input = params->input;
+		struct hl_debug_params_spmu *input = params->input;
 
 		if (!input)
 			return -EINVAL;

From dddb2e526a3650f3aa19c9ed315ae0f49c768810 Mon Sep 17 00:00:00 2001
From: Erick Archer <erick.archer@gmx.com>
Date: Sat, 20 Jan 2024 16:10:28 +0100
Subject: [PATCH 176/707] accel/habanalabs: use kcalloc() instead of kzalloc()

As noted in the "Deprecated Interfaces, Language Features, Attributes,
and Conventions" documentation [1], size calculations (especially
multiplication) should not be performed in memory allocator (or similar)
function arguments due to the risk of them overflowing. This could lead
to values wrapping around and a smaller allocation being made than the
caller was expecting. Using those allocations could lead to linear
overflows of heap memory and other misbehaviors.

So, use the purpose specific kcalloc() function instead of the argument
size * count in the kzalloc() function.

Link: https://www.kernel.org/doc/html/next/process/deprecated.html#open-coded-arithmetic-in-allocator-arguments [1]
Link: https://github.com/KSPP/linux/issues/162

Signed-off-by: Erick Archer <erick.archer@gmx.com>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/mmu/mmu_v1.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/accel/habanalabs/common/mmu/mmu_v1.c b/drivers/accel/habanalabs/common/mmu/mmu_v1.c
index 64b5c8fbb166d9..845d16aaa63741 100644
--- a/drivers/accel/habanalabs/common/mmu/mmu_v1.c
+++ b/drivers/accel/habanalabs/common/mmu/mmu_v1.c
@@ -43,7 +43,7 @@ static int dram_default_mapping_init(struct hl_ctx *ctx)
 	/* add hop1 and hop2 */
 	total_hops = num_of_hop3 + 2;
 
-	ctx->dram_default_hops = kzalloc(HL_PTE_SIZE * total_hops,  GFP_KERNEL);
+	ctx->dram_default_hops = kcalloc(total_hops, HL_PTE_SIZE,  GFP_KERNEL);
 	if (!ctx->dram_default_hops)
 		return -ENOMEM;
 

From 4e05c06df7d321e8fd8ceda2c8d4712f62ffb469 Mon Sep 17 00:00:00 2001
From: Aleksandrs Vinarskis <alex.vinarskis@gmail.com>
Date: Thu, 21 Dec 2023 19:51:41 +0100
Subject: [PATCH 177/707] mfd: intel-lpss: Switch to generalized quirk table

Introduce generic quirk table, and port existing walkaround for select
Microsoft devices to it. This is a preparation for
QUIRK_CLOCK_DIVIDER_UNITY.

Signed-off-by: Aleksandrs Vinarskis <alex.vinarskis@gmail.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20231221185142.9224-2-alex.vinarskis@gmail.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/mfd/intel-lpss-pci.c | 23 +++++++++++++++--------
 drivers/mfd/intel-lpss.c     |  2 +-
 drivers/mfd/intel-lpss.h     |  9 ++++++++-
 3 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/drivers/mfd/intel-lpss-pci.c b/drivers/mfd/intel-lpss-pci.c
index 4621d3950b8f9f..07713a2f694f32 100644
--- a/drivers/mfd/intel-lpss-pci.c
+++ b/drivers/mfd/intel-lpss-pci.c
@@ -23,12 +23,17 @@
 
 #include "intel-lpss.h"
 
-/* Some DSDTs have an unused GEXP ACPI device conflicting with I2C4 resources */
-static const struct pci_device_id ignore_resource_conflicts_ids[] = {
-	/* Microsoft Surface Go (version 1) I2C4 */
-	{ PCI_DEVICE_SUB(PCI_VENDOR_ID_INTEL, 0x9d64, 0x152d, 0x1182), },
-	/* Microsoft Surface Go 2 I2C4 */
-	{ PCI_DEVICE_SUB(PCI_VENDOR_ID_INTEL, 0x9d64, 0x152d, 0x1237), },
+static const struct pci_device_id quirk_ids[] = {
+	{
+		/* Microsoft Surface Go (version 1) I2C4 */
+		PCI_DEVICE_SUB(PCI_VENDOR_ID_INTEL, 0x9d64, 0x152d, 0x1182),
+		.driver_data = QUIRK_IGNORE_RESOURCE_CONFLICTS,
+	},
+	{
+		/* Microsoft Surface Go 2 I2C4 */
+		PCI_DEVICE_SUB(PCI_VENDOR_ID_INTEL, 0x9d64, 0x152d, 0x1237),
+		.driver_data = QUIRK_IGNORE_RESOURCE_CONFLICTS,
+	},
 	{ }
 };
 
@@ -36,6 +41,7 @@ static int intel_lpss_pci_probe(struct pci_dev *pdev,
 				const struct pci_device_id *id)
 {
 	const struct intel_lpss_platform_info *data = (void *)id->driver_data;
+	const struct pci_device_id *quirk_pci_info;
 	struct intel_lpss_platform_info *info;
 	int ret;
 
@@ -55,8 +61,9 @@ static int intel_lpss_pci_probe(struct pci_dev *pdev,
 	info->mem = pci_resource_n(pdev, 0);
 	info->irq = pci_irq_vector(pdev, 0);
 
-	if (pci_match_id(ignore_resource_conflicts_ids, pdev))
-		info->ignore_resource_conflicts = true;
+	quirk_pci_info = pci_match_id(quirk_ids, pdev);
+	if (quirk_pci_info)
+		info->quirks = quirk_pci_info->driver_data;
 
 	pdev->d3cold_delay = 0;
 
diff --git a/drivers/mfd/intel-lpss.c b/drivers/mfd/intel-lpss.c
index eff423f7dd2847..aafa0da5f8dbfd 100644
--- a/drivers/mfd/intel-lpss.c
+++ b/drivers/mfd/intel-lpss.c
@@ -412,7 +412,7 @@ int intel_lpss_probe(struct device *dev,
 		return ret;
 
 	lpss->cell->swnode = info->swnode;
-	lpss->cell->ignore_resource_conflicts = info->ignore_resource_conflicts;
+	lpss->cell->ignore_resource_conflicts = info->quirks & QUIRK_IGNORE_RESOURCE_CONFLICTS;
 
 	intel_lpss_init_dev(lpss);
 
diff --git a/drivers/mfd/intel-lpss.h b/drivers/mfd/intel-lpss.h
index c1d72b117ed5e6..2fa9ef9162580e 100644
--- a/drivers/mfd/intel-lpss.h
+++ b/drivers/mfd/intel-lpss.h
@@ -11,16 +11,23 @@
 #ifndef __MFD_INTEL_LPSS_H
 #define __MFD_INTEL_LPSS_H
 
+#include <linux/bits.h>
 #include <linux/pm.h>
 
+/*
+ * Some DSDTs have an unused GEXP ACPI device conflicting with I2C4 resources.
+ * Set to ignore resource conflicts with ACPI declared SystemMemory regions.
+ */
+#define QUIRK_IGNORE_RESOURCE_CONFLICTS BIT(0)
+
 struct device;
 struct resource;
 struct software_node;
 
 struct intel_lpss_platform_info {
 	struct resource *mem;
-	bool ignore_resource_conflicts;
 	int irq;
+	unsigned int quirks;
 	unsigned long clk_rate;
 	const char *clk_con_id;
 	const struct software_node *swnode;

From b47f1f55e26b98bf6811137735d2d3bc3bc6c3bc Mon Sep 17 00:00:00 2001
From: Aleksandrs Vinarskis <alex.vinarskis@gmail.com>
Date: Thu, 21 Dec 2023 19:51:42 +0100
Subject: [PATCH 178/707] mfd: intel-lpss: Introduce QUIRK_CLOCK_DIVIDER_UNITY
 for XPS 9530

Some devices (eg. Dell XPS 9530, 2023) due to a firmware bug have a
misconfigured clock divider, which should've been 1:1. This introduces
quirk which conditionally re-configures the clock divider to 1:1.

Signed-off-by: Aleksandrs Vinarskis <alex.vinarskis@gmail.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20231221185142.9224-3-alex.vinarskis@gmail.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/mfd/intel-lpss-pci.c | 5 +++++
 drivers/mfd/intel-lpss.c     | 7 +++++++
 drivers/mfd/intel-lpss.h     | 5 +++++
 3 files changed, 17 insertions(+)

diff --git a/drivers/mfd/intel-lpss-pci.c b/drivers/mfd/intel-lpss-pci.c
index 07713a2f694f32..8c00e0c695c5b9 100644
--- a/drivers/mfd/intel-lpss-pci.c
+++ b/drivers/mfd/intel-lpss-pci.c
@@ -34,6 +34,11 @@ static const struct pci_device_id quirk_ids[] = {
 		PCI_DEVICE_SUB(PCI_VENDOR_ID_INTEL, 0x9d64, 0x152d, 0x1237),
 		.driver_data = QUIRK_IGNORE_RESOURCE_CONFLICTS,
 	},
+	{
+		/* Dell XPS 9530 (2023) */
+		PCI_DEVICE_SUB(PCI_VENDOR_ID_INTEL, 0x51fb, 0x1028, 0x0beb),
+		.driver_data = QUIRK_CLOCK_DIVIDER_UNITY,
+	},
 	{ }
 };
 
diff --git a/drivers/mfd/intel-lpss.c b/drivers/mfd/intel-lpss.c
index aafa0da5f8dbfd..2a9018112dfc86 100644
--- a/drivers/mfd/intel-lpss.c
+++ b/drivers/mfd/intel-lpss.c
@@ -300,6 +300,7 @@ static int intel_lpss_register_clock_divider(struct intel_lpss *lpss,
 {
 	char name[32];
 	struct clk *tmp = *clk;
+	int ret;
 
 	snprintf(name, sizeof(name), "%s-enable", devname);
 	tmp = clk_register_gate(NULL, name, __clk_get_name(tmp), 0,
@@ -316,6 +317,12 @@ static int intel_lpss_register_clock_divider(struct intel_lpss *lpss,
 		return PTR_ERR(tmp);
 	*clk = tmp;
 
+	if (lpss->info->quirks & QUIRK_CLOCK_DIVIDER_UNITY) {
+		ret = clk_set_rate(tmp, lpss->info->clk_rate);
+		if (ret)
+			return ret;
+	}
+
 	snprintf(name, sizeof(name), "%s-update", devname);
 	tmp = clk_register_gate(NULL, name, __clk_get_name(tmp),
 				CLK_SET_RATE_PARENT, lpss->priv, 31, 0, NULL);
diff --git a/drivers/mfd/intel-lpss.h b/drivers/mfd/intel-lpss.h
index 2fa9ef9162580e..6f8f668f4c6f08 100644
--- a/drivers/mfd/intel-lpss.h
+++ b/drivers/mfd/intel-lpss.h
@@ -19,6 +19,11 @@
  * Set to ignore resource conflicts with ACPI declared SystemMemory regions.
  */
 #define QUIRK_IGNORE_RESOURCE_CONFLICTS BIT(0)
+/*
+ * Some devices have misconfigured clock divider due to a firmware bug.
+ * Set this to force the clock divider to 1:1 ratio.
+ */
+#define QUIRK_CLOCK_DIVIDER_UNITY		BIT(1)
 
 struct device;
 struct resource;

From 5394040d0b67177344662bc2b928e9d67e8f431d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@linux.intel.com>
Date: Fri, 29 Dec 2023 16:50:59 +0200
Subject: [PATCH 179/707] mfd: lpc_ich: Use ALIGN_DOWN() to obtain the start of
 the SPI base range
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Instead of open coding, use ALIGN_DOWN() for alignment.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20231229145059.6138-1-ilpo.jarvinen@linux.intel.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/mfd/lpc_ich.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/mfd/lpc_ich.c b/drivers/mfd/lpc_ich.c
index 73a0e7f9bd3116..f14901660147f5 100644
--- a/drivers/mfd/lpc_ich.c
+++ b/drivers/mfd/lpc_ich.c
@@ -38,6 +38,7 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <linux/align.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/errno.h>
@@ -1321,7 +1322,7 @@ static int lpc_ich_init_spi(struct pci_dev *dev)
 	case INTEL_SPI_BYT:
 		pci_read_config_dword(dev, SPIBASE_BYT, &spi_base);
 		if (spi_base & SPIBASE_BYT_EN) {
-			res->start = spi_base & ~(SPIBASE_BYT_SZ - 1);
+			res->start = ALIGN_DOWN(spi_base, SPIBASE_BYT_SZ);
 			res->end = res->start + SPIBASE_BYT_SZ - 1;
 
 			info->set_writeable = lpc_ich_byt_set_writeable;

From 5a6a8580defaf01f5b59c95cdd702a3ae1c7f224 Mon Sep 17 00:00:00 2001
From: Fuyao Kashizuku <fuyao@sjterm.com>
Date: Wed, 27 Dec 2023 10:01:17 +0800
Subject: [PATCH 180/707] mfd: sun4i-gpadc: Correct specified GPADC interrupt
 numbers

The identifiers are used as IRQ resource numbers, where 0 is treated
specially.

This fixes sun4i-gpadc-iio probe failed when request irq.

The backstack:
	WARNING: CPU: 3 PID: 1 at drivers/base/platform.c:451
	__platform_get_irq_byname+0xb8/0xc4
	0 is an invalid IRQ number
	Modules linked in:
	CPU: 3 PID: 1 Comm: swapper/0 Not tainted 6.7.0-rc6 #9
	Hardware name: Allwinner sun8i Family
	 unwind_backtrace
	 show_stack
	 dump_stack_lvl
	 __warn
	 warn_slowpath_fmt
	 __platform_get_irq_byname
	 platform_get_irq_byname
	 sun4i_irq_init
	 sun4i_gpadc_probe
	 platform_probe
	 really_probe
	 __driver_probe_device
	 driver_probe_device
	 __driver_attach
	 bus_for_each_dev
	 bus_add_driver
	 driver_register
	 do_one_initcall
	 do_initcall_level
	 do_initcalls
	 kernel_init_freeable
	 kernel_init

Log reports:
sun4i-gpadc-iio sun6i-a31-gpadc-iio.0: error -EINVAL: IRQ FIFO_DATA_PENDING
not found
sun4i-gpadc-iio: probe of sun6i-a31-gpadc-iio.0 failed with error -22

Signed-off-by: Fuyao Kashizuku <fuyao@sjterm.com>
Acked-by: Jernej Skrabec <jernej.skrabec@gmail.com>
Link: https://lore.kernel.org/r/ZYuFbUUus9apiCpq@debian.cyg
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mfd/sun4i-gpadc.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/mfd/sun4i-gpadc.h b/include/linux/mfd/sun4i-gpadc.h
index ea0ccf33a459ef..021f820f9d52bd 100644
--- a/include/linux/mfd/sun4i-gpadc.h
+++ b/include/linux/mfd/sun4i-gpadc.h
@@ -81,8 +81,8 @@
 #define SUN4I_GPADC_TEMP_DATA				0x20
 #define SUN4I_GPADC_DATA				0x24
 
-#define SUN4I_GPADC_IRQ_FIFO_DATA			0
-#define SUN4I_GPADC_IRQ_TEMP_DATA			1
+#define SUN4I_GPADC_IRQ_FIFO_DATA			1
+#define SUN4I_GPADC_IRQ_TEMP_DATA			2
 
 /* 10s delay before suspending the IP */
 #define SUN4I_GPADC_AUTOSUSPEND_DELAY			10000

From 91c63e4f2f88696097e93ebf59fe4c7e07d1d4ab Mon Sep 17 00:00:00 2001
From: Lee Jones <lee@kernel.org>
Date: Thu, 11 Jan 2024 16:21:13 +0000
Subject: [PATCH 181/707] mfd: omap-usb-host: Increase size of buffer to
 include all possible values
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Avoid these nasty W=1 errors:

  drivers/mfd/omap-usb-host.c: In function ‘usbhs_omap_probe’:
  drivers/mfd/omap-usb-host.c:706:54: error: ‘_clk’ directive output may be truncated writing 4 bytes into a region of size between 1 and 11 [-Werror=format-truncation=]
  drivers/mfd/omap-usb-host.c:705:17: note: ‘snprintf’ output between 24 and 34 bytes into a destination of size 30
  drivers/mfd/omap-usb-host.c:721:56: error: ‘%d’ directive output may be truncated writing between 1 and 11 bytes into a region of size 8 [-Werror=format-truncation=]
  drivers/mfd/omap-usb-host.c:721:33: note: directive argument in the range [-2147483640, 2147483647]
  drivers/mfd/omap-usb-host.c:720:17: note: ‘snprintf’ output between 28 and 38 bytes into a destination of size 30
  drivers/mfd/omap-usb-host.c:731:55: error: ‘%d’ directive output may be truncated writing between 1 and 11 bytes into a region of size 9 [-Werror=format-truncation=]
  drivers/mfd/omap-usb-host.c:731:33: note: directive argument in the range [-2147483640, 2147483647]
  drivers/mfd/omap-usb-host.c:730:17: note: ‘snprintf’ output between 27 and 37 bytes into a destination of size 30

Cc: Tony Lindgren <tony@atomide.com>
Cc: Keshava Munegowda <keshava_mgowda@ti.com>
Cc: Roger Quadros <rogerq@ti.com>
Cc: linux-omap@vger.kernel.org
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/mfd/omap-usb-host.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/mfd/omap-usb-host.c b/drivers/mfd/omap-usb-host.c
index ebc62033db169d..949feb03d4f8d8 100644
--- a/drivers/mfd/omap-usb-host.c
+++ b/drivers/mfd/omap-usb-host.c
@@ -699,7 +699,7 @@ static int usbhs_omap_probe(struct platform_device *pdev)
 	}
 
 	for (i = 0; i < omap->nports; i++) {
-		char clkname[30];
+		char clkname[40];
 
 		/* clock names are indexed from 1*/
 		snprintf(clkname, sizeof(clkname),

From 47f28ec99bef945f173251d26496e77f767ecf17 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Date: Mon, 15 Jan 2024 19:20:42 +0100
Subject: [PATCH 182/707] dt-bindings: mfd: iqs62x: Do not override
 firmware-name $ref

dtschema package defines firmware-name as string-array, so individual
bindings should not make it a string but instead just narrow the number
of expected firmware file names.

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Acked-by: Conor Dooley <conor.dooley@microchip.com>
Acked-by: Jeff LaBundy <jeff@labundy.com>
Link: https://lore.kernel.org/r/20240115182042.1610134-1-krzysztof.kozlowski@linaro.org
Signed-off-by: Lee Jones <lee@kernel.org>
---
 Documentation/devicetree/bindings/mfd/iqs62x.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/devicetree/bindings/mfd/iqs62x.yaml b/Documentation/devicetree/bindings/mfd/iqs62x.yaml
index 044cd7542c2bcf..f438c237496639 100644
--- a/Documentation/devicetree/bindings/mfd/iqs62x.yaml
+++ b/Documentation/devicetree/bindings/mfd/iqs62x.yaml
@@ -31,7 +31,7 @@ properties:
     maxItems: 1
 
   firmware-name:
-    $ref: /schemas/types.yaml#/definitions/string
+    maxItems: 1
     description:
       Specifies the name of the calibration and configuration file selected by
       the driver. If this property is omitted, the name is chosen based on the

From 3b75d271e161e22aff8171940a77510d2fb2ad6f Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Sun, 14 Jan 2024 16:39:21 +0200
Subject: [PATCH 183/707] backlight: hx8357: Fix potential NULL pointer
 dereference

The "im" pins are optional. Add missing check in the hx8357_probe().

Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
Closes: https://lore.kernel.org/r/642e1230-3358-4006-a17f-3f297897ae74@moroto.mountain
Fixes: 7d84a63a39b7 ("backlight: hx8357: Convert to agnostic GPIO API")
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Daniel Thompson <daniel.thompson@linaro.org>
Link: https://lore.kernel.org/r/20240114143921.550736-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/video/backlight/hx8357.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/video/backlight/hx8357.c b/drivers/video/backlight/hx8357.c
index d7298376cf74dd..bf18337ff0c2c0 100644
--- a/drivers/video/backlight/hx8357.c
+++ b/drivers/video/backlight/hx8357.c
@@ -609,11 +609,13 @@ static int hx8357_probe(struct spi_device *spi)
 	lcd->im_pins = devm_gpiod_get_array_optional(dev, "im", GPIOD_OUT_LOW);
 	if (IS_ERR(lcd->im_pins))
 		return dev_err_probe(dev, PTR_ERR(lcd->im_pins), "failed to request im GPIOs\n");
-	if (lcd->im_pins->ndescs < HX8357_NUM_IM_PINS)
-		return dev_err_probe(dev, -EINVAL, "not enough im GPIOs\n");
+	if (lcd->im_pins) {
+		if (lcd->im_pins->ndescs < HX8357_NUM_IM_PINS)
+			return dev_err_probe(dev, -EINVAL, "not enough im GPIOs\n");
 
-	for (i = 0; i < HX8357_NUM_IM_PINS; i++)
-		gpiod_set_consumer_name(lcd->im_pins->desc[i], "im_pins");
+		for (i = 0; i < HX8357_NUM_IM_PINS; i++)
+			gpiod_set_consumer_name(lcd->im_pins->desc[i], "im_pins");
+	}
 
 	lcdev = devm_lcd_device_register(&spi->dev, "mxsfb", &spi->dev, lcd,
 					&hx8357_ops);

From cd84e6bd331fd556116ec4889dc282b07c392e42 Mon Sep 17 00:00:00 2001
From: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Date: Tue, 16 Jan 2024 03:08:27 +0200
Subject: [PATCH 184/707] dt-bindings: mfd: qcom,tcsr: Add compatibles for
 QCM2290 and SM6115

Add qcom,qcm2290-tcsr and qcom,sm6115-tcsr, compatibles for TCSR blocks
on the corresponding platforms.

Signed-off-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20240116-usbc-phy-vls-clamp-v1-1-73b2da7691c5@linaro.org
Signed-off-by: Lee Jones <lee@kernel.org>
---
 Documentation/devicetree/bindings/mfd/qcom,tcsr.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Documentation/devicetree/bindings/mfd/qcom,tcsr.yaml b/Documentation/devicetree/bindings/mfd/qcom,tcsr.yaml
index 798705ab6a4601..b97d77015335f1 100644
--- a/Documentation/devicetree/bindings/mfd/qcom,tcsr.yaml
+++ b/Documentation/devicetree/bindings/mfd/qcom,tcsr.yaml
@@ -19,6 +19,7 @@ properties:
       - enum:
           - qcom,msm8976-tcsr
           - qcom,msm8998-tcsr
+          - qcom,qcm2290-tcsr
           - qcom,qcs404-tcsr
           - qcom,sc7180-tcsr
           - qcom,sc7280-tcsr
@@ -28,6 +29,7 @@ properties:
           - qcom,sdx55-tcsr
           - qcom,sdx65-tcsr
           - qcom,sm4450-tcsr
+          - qcom,sm6115-tcsr
           - qcom,sm8150-tcsr
           - qcom,sm8250-tcsr
           - qcom,sm8350-tcsr

From a1958f84deb5cdba020af725fc5003a05af4819c Mon Sep 17 00:00:00 2001
From: Lukasz Majczak <lma@chromium.org>
Date: Fri, 19 Jan 2024 08:43:27 +0000
Subject: [PATCH 185/707] mfd: cros_ec: Register EC-based watchdog subdevice

Add ChromeOS EC-based watchdog as EC subdevice.

Signed-off-by: Lukasz Majczak <lma@chromium.org>
Link: https://lore.kernel.org/r/20240119084328.3135503-4-lma@chromium.org
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/mfd/cros_ec_dev.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/mfd/cros_ec_dev.c b/drivers/mfd/cros_ec_dev.c
index 603b1cd5278507..4996220ce64b7b 100644
--- a/drivers/mfd/cros_ec_dev.c
+++ b/drivers/mfd/cros_ec_dev.c
@@ -91,6 +91,10 @@ static const struct mfd_cell cros_usbpd_notify_cells[] = {
 	{ .name = "cros-usbpd-notify", },
 };
 
+static const struct mfd_cell cros_ec_wdt_cells[] = {
+	{ .name = "cros-ec-wdt", }
+};
+
 static const struct cros_feature_to_cells cros_subdevices[] = {
 	{
 		.id		= EC_FEATURE_CEC,
@@ -107,6 +111,11 @@ static const struct cros_feature_to_cells cros_subdevices[] = {
 		.mfd_cells	= cros_usbpd_charger_cells,
 		.num_cells	= ARRAY_SIZE(cros_usbpd_charger_cells),
 	},
+	{
+		.id		= EC_FEATURE_HANG_DETECT,
+		.mfd_cells	= cros_ec_wdt_cells,
+		.num_cells	= ARRAY_SIZE(cros_ec_wdt_cells),
+	},
 };
 
 static const struct mfd_cell cros_ec_platform_cells[] = {

From 67421634ade0979dafd3e3f21c9b63bc93ef4760 Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@kernel.org>
Date: Tue, 23 Jan 2024 09:59:48 +0000
Subject: [PATCH 186/707] mfd: rave-sp: Avoid unnecessary use of comma operator

Although it does not seem to have any untoward side-effects,
the use of ';' to separate to assignments seems more appropriate than ','.

Flagged by clang-17 -Wcomma

No functional change intended. Compile tested only.

Signed-off-by: Simon Horman <horms@kernel.org>
Link: https://lore.kernel.org/r/20240123-rave-sp-comma-v1-1-84e9b15ba205@kernel.org
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/mfd/rave-sp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/mfd/rave-sp.c b/drivers/mfd/rave-sp.c
index 6ff84b2600c543..ea5fbcbbe4a56f 100644
--- a/drivers/mfd/rave-sp.c
+++ b/drivers/mfd/rave-sp.c
@@ -358,7 +358,7 @@ int rave_sp_exec(struct rave_sp *sp,
 
 	ackid       = atomic_inc_return(&sp->ackid);
 	reply.ackid = ackid;
-	reply.code  = rave_sp_reply_code((u8)command),
+	reply.code  = rave_sp_reply_code((u8)command);
 
 	mutex_lock(&sp->bus_lock);
 

From 1e0ea9e75ff3f395ad6f85f0be2258ef114a53dc Mon Sep 17 00:00:00 2001
From: Maciej Strozek <mstrozek@opensource.cirrus.com>
Date: Tue, 23 Jan 2024 15:42:59 +0000
Subject: [PATCH 187/707] mfd: wm831x: Remove redundant forever while loop

Current code executes only once despite the while loop, so remove the
loop. Also msleep(1) will likely result in a larger sleep, so increase
its value for clarity while keeping the same behaviour.

Signed-off-by: Maciej Strozek <mstrozek@opensource.cirrus.com>
Reviewed-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20240123154259.81258-1-mstrozek@opensource.cirrus.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/mfd/wm831x-auxadc.c | 43 ++++++++++++++++---------------------
 1 file changed, 18 insertions(+), 25 deletions(-)

diff --git a/drivers/mfd/wm831x-auxadc.c b/drivers/mfd/wm831x-auxadc.c
index 65b98f3fbd9291..18618a8f92062e 100644
--- a/drivers/mfd/wm831x-auxadc.c
+++ b/drivers/mfd/wm831x-auxadc.c
@@ -152,7 +152,7 @@ static irqreturn_t wm831x_auxadc_irq(int irq, void *irq_data)
 static int wm831x_auxadc_read_polled(struct wm831x *wm831x,
 				     enum wm831x_auxadc input)
 {
-	int ret, src, timeout;
+	int ret, src;
 
 	mutex_lock(&wm831x->auxadc_lock);
 
@@ -179,32 +179,25 @@ static int wm831x_auxadc_read_polled(struct wm831x *wm831x,
 		goto disable;
 	}
 
-	/* If we're not using interrupts then poll the
-	 * interrupt status register */
-	timeout = 5;
-	while (timeout) {
-		msleep(1);
+	/* If we're not using interrupts then read the interrupt status register */
+	msleep(20);
 
-		ret = wm831x_reg_read(wm831x,
-				      WM831X_INTERRUPT_STATUS_1);
-		if (ret < 0) {
-			dev_err(wm831x->dev,
-				"ISR 1 read failed: %d\n", ret);
-			goto disable;
-		}
+	ret = wm831x_reg_read(wm831x, WM831X_INTERRUPT_STATUS_1);
+	if (ret < 0) {
+		dev_err(wm831x->dev,
+			"ISR 1 read failed: %d\n", ret);
+		goto disable;
+	}
 
-		/* Did it complete? */
-		if (ret & WM831X_AUXADC_DATA_EINT) {
-			wm831x_reg_write(wm831x,
-					 WM831X_INTERRUPT_STATUS_1,
-					 WM831X_AUXADC_DATA_EINT);
-			break;
-		} else {
-			dev_err(wm831x->dev,
-				"AUXADC conversion timeout\n");
-			ret = -EBUSY;
-			goto disable;
-		}
+	/* Did it complete? */
+	if (ret & WM831X_AUXADC_DATA_EINT) {
+		wm831x_reg_write(wm831x, WM831X_INTERRUPT_STATUS_1,
+				WM831X_AUXADC_DATA_EINT);
+	} else {
+		dev_err(wm831x->dev,
+			"AUXADC conversion timeout\n");
+		ret = -EBUSY;
+		goto disable;
 	}
 
 	ret = wm831x_reg_read(wm831x, WM831X_AUXADC_DATA);

From 6b7704ff03d397788e75b8db78479222f0e80d3f Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 23 Jan 2024 15:24:46 -0700
Subject: [PATCH 188/707] iov_iter: streamline iovec/bvec alignment iteration

Rewrite the alignment checking iterators for iovec and bvec to be easier
to read, and also significantly more compact in terms of generated code.
This saves 270 bytes of text on x86-64 for me (with clang-18) and 224
bytes on arm64 (with gcc-13).

In profiles, also saves a bit of time as well for the same workload:

     0.81%     -0.18%  [kernel.vmlinux]  [k] iov_iter_aligned_bvec
     0.48%     -0.09%  [kernel.vmlinux]  [k] iov_iter_is_aligned

which is a nice side benefit as well.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
Link: https://lore.kernel.org/r/544b31f7-6d4b-42f5-a544-1420501f081f@kernel.dk
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>

v2: do the other half of the iterators too, as suggested by Keith.
    This further saves some text.
---
 lib/iov_iter.c | 55 +++++++++++++++++++++++++-------------------------
 1 file changed, 28 insertions(+), 27 deletions(-)

diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index e0aa6b440ca5f4..15f5040709c36e 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -714,12 +714,11 @@ EXPORT_SYMBOL(iov_iter_discard);
 static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask,
 				   unsigned len_mask)
 {
+	const struct iovec *iov = iter_iov(i);
 	size_t size = i->count;
 	size_t skip = i->iov_offset;
-	unsigned k;
 
-	for (k = 0; k < i->nr_segs; k++, skip = 0) {
-		const struct iovec *iov = iter_iov(i) + k;
+	do {
 		size_t len = iov->iov_len - skip;
 
 		if (len > size)
@@ -729,34 +728,36 @@ static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask,
 		if ((unsigned long)(iov->iov_base + skip) & addr_mask)
 			return false;
 
+		iov++;
 		size -= len;
-		if (!size)
-			break;
-	}
+		skip = 0;
+	} while (size);
+
 	return true;
 }
 
 static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask,
 				  unsigned len_mask)
 {
-	size_t size = i->count;
+	const struct bio_vec *bvec = i->bvec;
 	unsigned skip = i->iov_offset;
-	unsigned k;
+	size_t size = i->count;
 
-	for (k = 0; k < i->nr_segs; k++, skip = 0) {
-		size_t len = i->bvec[k].bv_len - skip;
+	do {
+		size_t len = bvec->bv_len;
 
 		if (len > size)
 			len = size;
 		if (len & len_mask)
 			return false;
-		if ((unsigned long)(i->bvec[k].bv_offset + skip) & addr_mask)
+		if ((unsigned long)(bvec->bv_offset + skip) & addr_mask)
 			return false;
 
+		bvec++;
 		size -= len;
-		if (!size)
-			break;
-	}
+		skip = 0;
+	} while (size);
+
 	return true;
 }
 
@@ -800,13 +801,12 @@ EXPORT_SYMBOL_GPL(iov_iter_is_aligned);
 
 static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
 {
+	const struct iovec *iov = iter_iov(i);
 	unsigned long res = 0;
 	size_t size = i->count;
 	size_t skip = i->iov_offset;
-	unsigned k;
 
-	for (k = 0; k < i->nr_segs; k++, skip = 0) {
-		const struct iovec *iov = iter_iov(i) + k;
+	do {
 		size_t len = iov->iov_len - skip;
 		if (len) {
 			res |= (unsigned long)iov->iov_base + skip;
@@ -814,30 +814,31 @@ static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
 				len = size;
 			res |= len;
 			size -= len;
-			if (!size)
-				break;
 		}
-	}
+		iov++;
+		skip = 0;
+	} while (size);
 	return res;
 }
 
 static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i)
 {
+	const struct bio_vec *bvec = i->bvec;
 	unsigned res = 0;
 	size_t size = i->count;
 	unsigned skip = i->iov_offset;
-	unsigned k;
 
-	for (k = 0; k < i->nr_segs; k++, skip = 0) {
-		size_t len = i->bvec[k].bv_len - skip;
-		res |= (unsigned long)i->bvec[k].bv_offset + skip;
+	do {
+		size_t len = bvec->bv_len - skip;
+		res |= (unsigned long)bvec->bv_offset + skip;
 		if (len > size)
 			len = size;
 		res |= len;
+		bvec++;
 		size -= len;
-		if (!size)
-			break;
-	}
+		skip = 0;
+	} while (size);
+
 	return res;
 }
 

From 38c5f831b7aed53416db6c3b0297ea4cfac41294 Mon Sep 17 00:00:00 2001
From: Baokun Li <libaokun1@huawei.com>
Date: Wed, 24 Jan 2024 22:28:55 +0800
Subject: [PATCH 189/707] fs: make the i_size_read/write helpers be
 smp_load_acquire/store_release()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In [Link] Linus mentions that acquire/release makes it clear which
_particular_ memory accesses are the ordered ones, and it's unlikely
to make any performance difference, so it's much better to pair up
the release->acquire ordering than have a "wmb->rmb" ordering.

=========================================================
 update pagecache
 folio_mark_uptodate(folio)
   smp_wmb()
   set_bit PG_uptodate

 === ↑↑↑ STLR ↑↑↑ === smp_store_release(&inode->i_size, i_size)

 folio_test_uptodate(folio)
   test_bit PG_uptodate
   smp_rmb()

 === ↓↓↓ LDAR ↓↓↓ === smp_load_acquire(&inode->i_size)

 copy_page_to_iter()
=========================================================

Calling smp_store_release() in i_size_write() ensures that the data
in the page and the PG_uptodate bit are updated before the isize is
updated, and calling smp_load_acquire() in i_size_read ensures that
it will not read a newer isize than the data in the page. Therefore,
this avoids buffered read-write inconsistencies caused by Load-Load
reordering.

Link: https://lore.kernel.org/r/CAHk-=wifOnmeJq+sn+2s-P46zw0SFEbw9BSCGgp2c5fYPtRPGw@mail.gmail.com/
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Baokun Li <libaokun1@huawei.com>
Link: https://lore.kernel.org/r/20240124142857.4146716-2-libaokun1@huawei.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4f6669147b9e8d..ebce4763b4bb9a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -907,7 +907,8 @@ static inline loff_t i_size_read(const struct inode *inode)
 	preempt_enable();
 	return i_size;
 #else
-	return inode->i_size;
+	/* Pairs with smp_store_release() in i_size_write() */
+	return smp_load_acquire(&inode->i_size);
 #endif
 }
 
@@ -929,7 +930,12 @@ static inline void i_size_write(struct inode *inode, loff_t i_size)
 	inode->i_size = i_size;
 	preempt_enable();
 #else
-	inode->i_size = i_size;
+	/*
+	 * Pairs with smp_load_acquire() in i_size_read() to ensure
+	 * changes related to inode size (such as page contents) are
+	 * visible before we see the changed inode size.
+	 */
+	smp_store_release(&inode->i_size, i_size);
 #endif
 }
 

From a17ab4403eaf06f54de8bd2f2217b4b69089ba93 Mon Sep 17 00:00:00 2001
From: Baokun Li <libaokun1@huawei.com>
Date: Wed, 24 Jan 2024 22:28:56 +0800
Subject: [PATCH 190/707] Revert "mm/filemap: avoid buffered read/write race to
 read inconsistent data"

This reverts commit e2c27b803bb6 ("mm/filemap: avoid buffered read/write
race to read inconsistent data"). After making the i_size_read/write
helpers be smp_load_acquire/store_release(), it is already guaranteed that
changes to page contents are visible before we see increased inode size,
so the extra smp_rmb() in filemap_read() can be removed.

Signed-off-by: Baokun Li <libaokun1@huawei.com>
Link: https://lore.kernel.org/r/20240124142857.4146716-3-libaokun1@huawei.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 mm/filemap.c | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 750e779c23db74..a72dd2eafd5ace 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2608,15 +2608,6 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
 			goto put_folios;
 		end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
 
-		/*
-		 * Pairs with a barrier in
-		 * block_write_end()->mark_buffer_dirty() or other page
-		 * dirtying routines like iomap_write_end() to ensure
-		 * changes to page contents are visible before we see
-		 * increased inode size.
-		 */
-		smp_rmb();
-
 		/*
 		 * Once we start copying data, we don't want to be touching any
 		 * cachelines that might be contended:

From cf6e3cf145eb352e28812a741fde5065f1652ee8 Mon Sep 17 00:00:00 2001
From: Baokun Li <libaokun1@huawei.com>
Date: Wed, 24 Jan 2024 22:28:57 +0800
Subject: [PATCH 191/707] asm-generic: remove extra type checking in
 acquire/release for non-SMP case

If CONFIG_SMP is not enabled, the smp_load_acquire/smp_store_release is
implemented as READ_ONCE/READ_ONCE and barrier() and type checking.
READ_ONCE/READ_ONCE already checks the pointer type, and then checks it
more stringently outside, but the non-SMP case simply isn't relevant, so
remove the extra compiletime_assert_atomic_type() to avoid compilation
errors.

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202401230837.TXro0PHi-lkp@intel.com/
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Baokun Li <libaokun1@huawei.com>
Link: https://lore.kernel.org/r/20240124142857.4146716-4-libaokun1@huawei.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/asm-generic/barrier.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h
index 961f4d88f9ef78..0c0695763bea39 100644
--- a/include/asm-generic/barrier.h
+++ b/include/asm-generic/barrier.h
@@ -193,7 +193,6 @@ do {									\
 #ifndef smp_store_release
 #define smp_store_release(p, v)						\
 do {									\
-	compiletime_assert_atomic_type(*p);				\
 	barrier();							\
 	WRITE_ONCE(*p, v);						\
 } while (0)
@@ -203,7 +202,6 @@ do {									\
 #define smp_load_acquire(p)						\
 ({									\
 	__unqual_scalar_typeof(*p) ___p1 = READ_ONCE(*p);		\
-	compiletime_assert_atomic_type(*p);				\
 	barrier();							\
 	(typeof(*p))___p1;						\
 })

From 458a1af5373e269a1f253440c35dac48ce107727 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 25 Jan 2024 17:17:34 +0100
Subject: [PATCH 192/707] pidfd: cleanup the usage of __pidfd_prepare's flags

- make pidfd_create() static.

- Don't pass O_RDWR | O_CLOEXEC to __pidfd_prepare() in copy_process(),
  __pidfd_prepare() adds these flags unconditionally.

- Kill the flags check in __pidfd_prepare(). sys_pidfd_open() checks the
  flags itself, all other users of pidfd_prepare() pass flags = 0.

  If we need a sanity check for those other in kernel users then
  WARN_ON_ONCE(flags & ~PIDFD_NONBLOCK) makes more sense.

- Don't pass O_RDWR to get_unused_fd_flags(), it ignores everything except
  O_CLOEXEC.

- Don't pass O_CLOEXEC to anon_inode_getfile(), it ignores everything except
  O_ACCMODE | O_NONBLOCK.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lore.kernel.org/r/20240125161734.GA778@redhat.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/pid.h | 1 -
 kernel/fork.c       | 9 +++------
 kernel/pid.c        | 2 +-
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 395cacce1179ca..e6a041cb8bacc7 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -73,7 +73,6 @@ struct file;
 extern struct pid *pidfd_pid(const struct file *file);
 struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags);
 struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags);
-int pidfd_create(struct pid *pid, unsigned int flags);
 int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret);
 
 static inline struct pid *get_pid(struct pid *pid)
diff --git a/kernel/fork.c b/kernel/fork.c
index 47ff3b35352e0b..34704d277b68a5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2130,15 +2130,12 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
 	int pidfd;
 	struct file *pidfd_file;
 
-	if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC))
-		return -EINVAL;
-
-	pidfd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+	pidfd = get_unused_fd_flags(O_CLOEXEC);
 	if (pidfd < 0)
 		return pidfd;
 
 	pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
-					flags | O_RDWR | O_CLOEXEC);
+					flags | O_RDWR);
 	if (IS_ERR(pidfd_file)) {
 		put_unused_fd(pidfd);
 		return PTR_ERR(pidfd_file);
@@ -2524,7 +2521,7 @@ __latent_entropy struct task_struct *copy_process(
 	 */
 	if (clone_flags & CLONE_PIDFD) {
 		/* Note that no task has been attached to @pid yet. */
-		retval = __pidfd_prepare(pid, O_RDWR | O_CLOEXEC, &pidfile);
+		retval = __pidfd_prepare(pid, 0, &pidfile);
 		if (retval < 0)
 			goto bad_fork_free_pid;
 		pidfd = retval;
diff --git a/kernel/pid.c b/kernel/pid.c
index b52b1086545415..c7a3e359f8f590 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -595,7 +595,7 @@ struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
  * Return: On success, a cloexec pidfd is returned.
  *         On error, a negative errno number will be returned.
  */
-int pidfd_create(struct pid *pid, unsigned int flags)
+static int pidfd_create(struct pid *pid, unsigned int flags)
 {
 	int pidfd;
 	struct file *pidfd_file;

From 73b9e13998fa51839f051873ab03967fdf3fe795 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 24 Nov 2023 14:55:37 -0800
Subject: [PATCH 193/707] doc: Spinlocks are implied RCU readers

In kernels built with CONFIG_PREEMPT_RT=n, spinlock critical sections
are RCU readers because they disable preemption.  However, they are also
RCU readers in CONFIG_PREEMPT_RT=y because the -rt locking primitives
contain rcu_read_lock() and rcu_read_unlock().  Therefore, upgrade
rcu_dereference.rst to document this non-obvious case.

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/RCU/rcu_dereference.rst | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/Documentation/RCU/rcu_dereference.rst b/Documentation/RCU/rcu_dereference.rst
index 659d5913784d0d..2524dcdadde2b8 100644
--- a/Documentation/RCU/rcu_dereference.rst
+++ b/Documentation/RCU/rcu_dereference.rst
@@ -408,7 +408,10 @@ member of the rcu_dereference() to use in various situations:
 	RCU flavors, an RCU read-side critical section is entered
 	using rcu_read_lock(), anything that disables bottom halves,
 	anything that disables interrupts, or anything that disables
-	preemption.
+	preemption.  Please note that spinlock critical sections
+	are also implied RCU read-side critical sections, even when
+	they are preemptible, as they are in kernels built with
+	CONFIG_PREEMPT_RT=y.
 
 2.	If the access might be within an RCU read-side critical section
 	on the one hand, or protected by (say) my_lock on the other,

From cdc2686695c2495f5802e7baae07479576180392 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 24 Nov 2023 15:06:46 -0800
Subject: [PATCH 194/707] doc: Make whatisRCU.rst note that spinlocks are RCU
 readers

In kernels built with CONFIG_PREEMPT_RT=n, spinlock critical sections
are RCU readers because they disable preemption.  However, they are also
RCU readers in CONFIG_PREEMPT_RT=y because in that case the locking
primitives contain rcu_read_lock() and rcu_read_unlock().  Therefore,
upgrade whatisRCU.rst to document this non-obvious case.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/RCU/whatisRCU.rst | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst
index 60ce02475142d8..246ce0d0b4d116 100644
--- a/Documentation/RCU/whatisRCU.rst
+++ b/Documentation/RCU/whatisRCU.rst
@@ -172,14 +172,25 @@ rcu_read_lock()
 	critical section.  Reference counts may be used in conjunction
 	with RCU to maintain longer-term references to data structures.
 
+	Note that anything that disables bottom halves, preemption,
+	or interrupts also enters an RCU read-side critical section.
+	Acquiring a spinlock also enters an RCU read-side critical
+	sections, even for spinlocks that do not disable preemption,
+	as is the case in kernels built with CONFIG_PREEMPT_RT=y.
+	Sleeplocks do *not* enter RCU read-side critical sections.
+
 rcu_read_unlock()
 ^^^^^^^^^^^^^^^^^
 	void rcu_read_unlock(void);
 
 	This temporal primitives is used by a reader to inform the
 	reclaimer that the reader is exiting an RCU read-side critical
-	section.  Note that RCU read-side critical sections may be nested
-	and/or overlapping.
+	section.  Anything that enables bottom halves, preemption,
+	or interrupts also exits an RCU read-side critical section.
+	Releasing a spinlock also exits an RCU read-side critical section.
+
+	Note that RCU read-side critical sections may be nested and/or
+	overlapping.
 
 synchronize_rcu()
 ^^^^^^^^^^^^^^^^^

From 2888f00828d7779a4045b501fc5eb577b42248b5 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 24 Nov 2023 16:23:56 -0800
Subject: [PATCH 195/707] doc: Make checklist.rst note that spinlocks are
 implied RCU readers

In kernels built with CONFIG_PREEMPT_RT=n, spinlock critical sections
are RCU readers because they disable preemption.  However, they are also
RCU readers in CONFIG_PREEMPT_RT=y because in that case the locking
primitives contain rcu_read_lock() and rcu_read_unlock().  Therefore,
upgrade checklist.rst to document this non-obvious case.

While in the area, fix a typo by changing "read-side critical" to
"read-side critical section".

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/RCU/checklist.rst | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst
index 2d42998a89a637..98a622f7724816 100644
--- a/Documentation/RCU/checklist.rst
+++ b/Documentation/RCU/checklist.rst
@@ -68,7 +68,8 @@ over a rather long period of time, but improvements are always welcome!
 	rcu_read_lock_sched(), or by the appropriate update-side lock.
 	Explicit disabling of preemption (preempt_disable(), for example)
 	can serve as rcu_read_lock_sched(), but is less readable and
-	prevents lockdep from detecting locking issues.
+	prevents lockdep from detecting locking issues.  Acquiring a
+	spinlock also enters an RCU read-side critical section.
 
 	Please note that you *cannot* rely on code known to be built
 	only in non-preemptible kernels.  Such code can and will break,
@@ -444,7 +445,7 @@ over a rather long period of time, but improvements are always welcome!
 	real-time workloads than is synchronize_rcu_expedited().
 
 	It is also permissible to sleep in RCU Tasks Trace read-side
-	critical, which are delimited by rcu_read_lock_trace() and
+	critical section, which are delimited by rcu_read_lock_trace() and
 	rcu_read_unlock_trace().  However, this is a specialized flavor
 	of RCU, and you should not use it without first checking with
 	its current users.  In most cases, you should instead use SRCU.

From effe2f73f97f332b593e4fb6d466cc9485ff998c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 24 Nov 2023 16:29:01 -0800
Subject: [PATCH 196/707] doc: Add CONFIG_RCU_STRICT_GRACE_PERIOD to
 checklist.rst

This commit adds CONFIG_RCU_STRICT_GRACE_PERIOD to the list of debugging
Kconfig options in checklist.rst.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/RCU/checklist.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst
index 98a622f7724816..addd5c1547a420 100644
--- a/Documentation/RCU/checklist.rst
+++ b/Documentation/RCU/checklist.rst
@@ -491,6 +491,12 @@ over a rather long period of time, but improvements are always welcome!
 		since the last time that you passed that same object to
 		call_rcu() (or friends).
 
+	CONFIG_RCU_STRICT_GRACE_PERIOD:
+		combine with KASAN to check for pointers leaked out
+		of RCU read-side critical sections.  This Kconfig
+		option is tough on both performance and scalability,
+		and so is limited to four-CPU systems.
+
 	__rcu sparse checks:
 		tag the pointer to the RCU-protected data structure
 		with __rcu, and sparse will warn you if you access that

From 10514fa5f7a55149cf7fc63138f39d35763374bf Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Sun, 26 Nov 2023 11:06:10 -0800
Subject: [PATCH 197/707] doc: Add EARLY flag to early-parsed kernel boot
 parameters

Kernel boot parameters declared with early_param() are parsed before
embedded parameters are extracted from initrd, and early_param()
parameters are not helpful when embedded in initrd.  Therefore,
mark early_param() kernel boot parameters with "EARLY" in
kernel-parameters.txt.

The following early_param() calls declare kernel boot parameters that
are undocumented:

early_param("atmel.pm_modes", at91_pm_modes_select);
early_param("mem_fclk_21285", early_fclk);
early_param("ecc", early_ecc);
early_param("cachepolicy", early_cachepolicy);
early_param("nodebugmon", early_debug_disable);
early_param("kfence.sample_interval", parse_kfence_early_init);
early_param("additional_cpus", setup_additional_cpus);
early_param("stram_pool", atari_stram_setup);
early_param("disable_octeon_edac", disable_octeon_edac);
early_param("rd_start", rd_start_early);
early_param("rd_size", rd_size_early);
early_param("coherentio", setcoherentio);
early_param("nocoherentio", setnocoherentio);
early_param("fadump", early_fadump_param);
early_param("fadump_reserve_mem", early_fadump_reserve_mem);
early_param("no_stf_barrier", handle_no_stf_barrier);
early_param("no_rfi_flush", handle_no_rfi_flush);
early_param("smt-enabled", early_smt_enabled);
early_param("ppc_pci_reset_phbs", pci_reset_phbs_setup);
early_param("ps3fb", early_parse_ps3fb);
early_param("ps3flash", early_parse_ps3flash);
early_param("novx", disable_vector_extension);
early_param("nobp", nobp_setup_early);
early_param("nospec", nospec_setup_early);
early_param("possible_cpus", _setup_possible_cpus);
early_param("stp", early_parse_stp);
early_param("nopfault", nopfault);
early_param("nmi_mode", nmi_mode_setup);
early_param("sh_mv", early_parse_mv);
early_param("pmb", early_pmb);
early_param("hvirq", early_hvirq_major);
early_param("cfi", cfi_parse_cmdline);
early_param("disableapic", setup_disableapic);
early_param("noapictimer", parse_disable_apic_timer);
early_param("disable_cpu_apicid", apic_set_disabled_cpu_apicid);
early_param("uv_memblksize", parse_mem_block_size);
early_param("retbleed", retbleed_parse_cmdline);
early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
early_param("update_mptable", update_mptable_setup);
early_param("alloc_mptable", parse_alloc_mptable_opt);
early_param("possible_cpus", _setup_possible_cpus);
early_param("lsmsi", early_parse_ls_scfg_msi);
early_param("nokgdbroundup", opt_nokgdbroundup);
early_param("kgdbcon", opt_kgdb_con);
early_param("kasan", early_kasan_flag);
early_param("kasan.mode", early_kasan_mode);
early_param("kasan.vmalloc", early_kasan_flag_vmalloc);
early_param("kasan.page_alloc.sample", early_kasan_flag_page_alloc_sample);
early_param("kasan.page_alloc.sample.order", early_kasan_flag_page_alloc_sample_order);
early_param("kasan.fault", early_kasan_fault);
early_param("kasan.stacktrace", early_kasan_flag_stacktrace);
early_param("kasan.stack_ring_size", early_kasan_flag_stack_ring_size);
early_param("accept_memory", accept_memory_parse);
early_param("page_table_check", early_page_table_check_param);
sh_early_platform_init("earlytimer", &sh_cmt_device_driver);
early_param_on_off("gbpages", "nogbpages", direct_gbpages, CONFIG_X86_DIRECT_GBPAGES);

These are not necessarily bugs, given that some kernel boot parameters
are intended for deep debugging rather than general use.

This work does not cover all of the kernel boot parameters declared using
cmdline_find_option() and cmdline_find_option_bool().  If these are in
fact guaranteed to be early (which appears to be the case), they can be
added in a later version of this patch.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Petr Malat <oss@malat.biz>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: <linux-doc@vger.kernel.org>
Cc: <linux-arch@vger.kernel.org>
---
 .../admin-guide/kernel-parameters.rst         |   1 +
 .../admin-guide/kernel-parameters.txt         | 484 +++++++++---------
 2 files changed, 250 insertions(+), 235 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.rst b/Documentation/admin-guide/kernel-parameters.rst
index 102937bc8443a2..fe4644df4b4703 100644
--- a/Documentation/admin-guide/kernel-parameters.rst
+++ b/Documentation/admin-guide/kernel-parameters.rst
@@ -108,6 +108,7 @@ is applicable::
 	CMA	Contiguous Memory Area support is enabled.
 	DRM	Direct Rendering Management support is enabled.
 	DYNAMIC_DEBUG Build in debug messages and enable them at runtime
+	EARLY	Parameter processed too early to be embedded in initrd.
 	EDD	BIOS Enhanced Disk Drive Services (EDD) is enabled
 	EFI	EFI Partitioning (GPT) is enabled
 	EVM	Extended Verification Module
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 31b3a25680d08c..4839f2919fdfa6 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -9,7 +9,7 @@
 			accept_memory=eager can be used to accept all memory
 			at once during boot.
 
-	acpi=		[HW,ACPI,X86,ARM64,RISCV64]
+	acpi=		[HW,ACPI,X86,ARM64,RISCV64,EARLY]
 			Advanced Configuration and Power Interface
 			Format: { force | on | off | strict | noirq | rsdt |
 				  copy_dsdt }
@@ -26,7 +26,7 @@
 
 			See also Documentation/power/runtime_pm.rst, pci=noacpi
 
-	acpi_apic_instance=	[ACPI, IOAPIC]
+	acpi_apic_instance=	[ACPI,IOAPIC,EARLY]
 			Format: <int>
 			2: use 2nd APIC table, if available
 			1,0: use 1st APIC table
@@ -41,7 +41,7 @@
 			If set to native, use the device's native backlight mode.
 			If set to none, disable the ACPI backlight interface.
 
-	acpi_force_32bit_fadt_addr
+	acpi_force_32bit_fadt_addr [ACPI,EARLY]
 			force FADT to use 32 bit addresses rather than the
 			64 bit X_* addresses. Some firmware have broken 64
 			bit addresses for force ACPI ignore these and use
@@ -97,7 +97,7 @@
 			no: ACPI OperationRegions are not marked as reserved,
 			no further checks are performed.
 
-	acpi_force_table_verification	[HW,ACPI]
+	acpi_force_table_verification	[HW,ACPI,EARLY]
 			Enable table checksum verification during early stage.
 			By default, this is disabled due to x86 early mapping
 			size limitation.
@@ -137,7 +137,7 @@
 	acpi_no_memhotplug [ACPI] Disable memory hotplug.  Useful for kdump
 			   kernels.
 
-	acpi_no_static_ssdt	[HW,ACPI]
+	acpi_no_static_ssdt	[HW,ACPI,EARLY]
 			Disable installation of static SSDTs at early boot time
 			By default, SSDTs contained in the RSDT/XSDT will be
 			installed automatically and they will appear under
@@ -151,7 +151,7 @@
 			Ignore the ACPI-based watchdog interface (WDAT) and let
 			a native driver control the watchdog device instead.
 
-	acpi_rsdp=	[ACPI,EFI,KEXEC]
+	acpi_rsdp=	[ACPI,EFI,KEXEC,EARLY]
 			Pass the RSDP address to the kernel, mostly used
 			on machines running EFI runtime service to boot the
 			second kernel for kdump.
@@ -228,10 +228,10 @@
 			to assume that this machine's pmtimer latches its value
 			and always returns good values.
 
-	acpi_sci=	[HW,ACPI] ACPI System Control Interrupt trigger mode
+	acpi_sci=	[HW,ACPI,EARLY] ACPI System Control Interrupt trigger mode
 			Format: { level | edge | high | low }
 
-	acpi_skip_timer_override [HW,ACPI]
+	acpi_skip_timer_override [HW,ACPI,EARLY]
 			Recognize and ignore IRQ0/pin2 Interrupt Override.
 			For broken nForce2 BIOS resulting in XT-PIC timer.
 
@@ -266,11 +266,11 @@
 			behave incorrectly in some ways with respect to system
 			suspend and resume to be ignored (use wisely).
 
-	acpi_use_timer_override [HW,ACPI]
+	acpi_use_timer_override [HW,ACPI,EARLY]
 			Use timer override. For some broken Nvidia NF5 boards
 			that require a timer override, but don't have HPET
 
-	add_efi_memmap	[EFI; X86] Include EFI memory map in
+	add_efi_memmap	[EFI,X86,EARLY] Include EFI memory map in
 			kernel's map of available physical RAM.
 
 	agp=		[AGP]
@@ -307,7 +307,7 @@
 			do not want to use tracing_snapshot_alloc() as it needs
 			to be done where GFP_KERNEL allocations are allowed.
 
-	allow_mismatched_32bit_el0 [ARM64]
+	allow_mismatched_32bit_el0 [ARM64,EARLY]
 			Allow execve() of 32-bit applications and setting of the
 			PER_LINUX32 personality on systems where only a strict
 			subset of the CPUs support 32-bit EL0. When this
@@ -351,7 +351,7 @@
 			             This mode requires kvm-amd.avic=1.
 			             (Default when IOMMU HW support is present.)
 
-	amd_pstate=	[X86]
+	amd_pstate=	[X86,EARLY]
 			disable
 			  Do not enable amd_pstate as the default
 			  scaling driver for the supported processors
@@ -391,7 +391,7 @@
 			not play well with APC CPU idle - disable it if you have
 			APC and your system crashes randomly.
 
-	apic=		[APIC,X86] Advanced Programmable Interrupt Controller
+	apic=		[APIC,X86,EARLY] Advanced Programmable Interrupt Controller
 			Change the output verbosity while booting
 			Format: { quiet (default) | verbose | debug }
 			Change the amount of debugging information output
@@ -401,7 +401,7 @@
 			Format: apic=driver_name
 			Examples: apic=bigsmp
 
-	apic_extnmi=	[APIC,X86] External NMI delivery setting
+	apic_extnmi=	[APIC,X86,EARLY] External NMI delivery setting
 			Format: { bsp (default) | all | none }
 			bsp:  External NMI is delivered only to CPU 0
 			all:  External NMIs are broadcast to all CPUs as a
@@ -508,21 +508,22 @@
 	bert_disable	[ACPI]
 			Disable BERT OS support on buggy BIOSes.
 
-	bgrt_disable	[ACPI][X86]
+	bgrt_disable	[ACPI,X86,EARLY]
 			Disable BGRT to avoid flickering OEM logo.
 
 	blkdevparts=	Manual partition parsing of block device(s) for
 			embedded devices based on command line input.
 			See Documentation/block/cmdline-partition.rst
 
-	boot_delay=	Milliseconds to delay each printk during boot.
+	boot_delay=	[KNL,EARLY]
+			Milliseconds to delay each printk during boot.
 			Only works if CONFIG_BOOT_PRINTK_DELAY is enabled,
 			and you may also have to specify "lpj=".  Boot_delay
 			values larger than 10 seconds (10000) are assumed
 			erroneous and ignored.
 			Format: integer
 
-	bootconfig	[KNL]
+	bootconfig	[KNL,EARLY]
 			Extended command line options can be added to an initrd
 			and this will cause the kernel to look for it.
 
@@ -557,7 +558,7 @@
 			trust validation.
 			format: { id:<keyid> | builtin }
 
-	cca=		[MIPS] Override the kernel pages' cache coherency
+	cca=		[MIPS,EARLY] Override the kernel pages' cache coherency
 			algorithm.  Accepted values range from 0 to 7
 			inclusive. See arch/mips/include/asm/pgtable-bits.h
 			for platform specific values (SB1, Loongson3 and
@@ -672,7 +673,7 @@
 			[X86-64] hpet,tsc
 
 	clocksource.arm_arch_timer.evtstrm=
-			[ARM,ARM64]
+			[ARM,ARM64,EARLY]
 			Format: <bool>
 			Enable/disable the eventstream feature of the ARM
 			architected timer so that code using WFE-based polling
@@ -702,7 +703,7 @@
 			10 seconds when built into the kernel.
 
 	cma=nn[MG]@[start[MG][-end[MG]]]
-			[KNL,CMA]
+			[KNL,CMA,EARLY]
 			Sets the size of kernel global memory area for
 			contiguous memory allocations and optionally the
 			placement constraint by the physical address range of
@@ -711,7 +712,7 @@
 			kernel/dma/contiguous.c
 
 	cma_pernuma=nn[MG]
-			[KNL,CMA]
+			[KNL,CMA,EARLY]
 			Sets the size of kernel per-numa memory area for
 			contiguous memory allocations. A value of 0 disables
 			per-numa CMA altogether. And If this option is not
@@ -722,7 +723,7 @@
 			they will fallback to the global default memory area.
 
 	numa_cma=<node>:nn[MG][,<node>:nn[MG]]
-			[KNL,CMA]
+			[KNL,CMA,EARLY]
 			Sets the size of kernel numa memory area for
 			contiguous memory allocations. It will reserve CMA
 			area for the specified node.
@@ -739,7 +740,7 @@
 			a hypervisor.
 			Default: yes
 
-	coherent_pool=nn[KMG]	[ARM,KNL]
+	coherent_pool=nn[KMG]	[ARM,KNL,EARLY]
 			Sets the size of memory pool for coherent, atomic dma
 			allocations, by default set to 256K.
 
@@ -757,7 +758,7 @@
 	condev=		[HW,S390] console device
 	conmode=
 
-	con3215_drop=	[S390] 3215 console drop mode.
+	con3215_drop=	[S390,EARLY] 3215 console drop mode.
 			Format: y|n|Y|N|1|0
 			When set to true, drop data on the 3215 console when
 			the console buffer is full. In this case the
@@ -863,7 +864,7 @@
 			kernel before the cpufreq driver probes.
 
 	cpu_init_udelay=N
-			[X86] Delay for N microsec between assert and de-assert
+			[X86,EARLY] Delay for N microsec between assert and de-assert
 			of APIC INIT to start processors.  This delay occurs
 			on every CPU online, such as boot, and resume from suspend.
 			Default: 10000
@@ -883,7 +884,7 @@
 			kernel more unstable.
 
 	crashkernel=size[KMG][@offset[KMG]]
-			[KNL] Using kexec, Linux can switch to a 'crash kernel'
+			[KNL,EARLY] Using kexec, Linux can switch to a 'crash kernel'
 			upon panic. This parameter reserves the physical
 			memory region [offset, offset + size] for that kernel
 			image. If '@offset' is omitted, then a suitable offset
@@ -954,10 +955,10 @@
 			Format: <port#>,<type>
 			See also Documentation/input/devices/joystick-parport.rst
 
-	debug		[KNL] Enable kernel debugging (events log level).
+	debug		[KNL,EARLY] Enable kernel debugging (events log level).
 
 	debug_boot_weak_hash
-			[KNL] Enable printing [hashed] pointers early in the
+			[KNL,EARLY] Enable printing [hashed] pointers early in the
 			boot sequence.  If enabled, we use a weak hash instead
 			of siphash to hash pointers.  Use this option if you are
 			seeing instances of '(___ptrval___)') and need to see a
@@ -974,10 +975,10 @@
 			will print _a_lot_ more information - normally only
 			useful to lockdep developers.
 
-	debug_objects	[KNL] Enable object debugging
+	debug_objects	[KNL,EARLY] Enable object debugging
 
 	debug_guardpage_minorder=
-			[KNL] When CONFIG_DEBUG_PAGEALLOC is set, this
+			[KNL,EARLY] When CONFIG_DEBUG_PAGEALLOC is set, this
 			parameter allows control of the order of pages that will
 			be intentionally kept free (and hence protected) by the
 			buddy allocator. Bigger value increase the probability
@@ -996,7 +997,7 @@
 			help tracking down these problems.
 
 	debug_pagealloc=
-			[KNL] When CONFIG_DEBUG_PAGEALLOC is set, this parameter
+			[KNL,EARLY] When CONFIG_DEBUG_PAGEALLOC is set, this parameter
 			enables the feature at boot time. By default, it is
 			disabled and the system will work mostly the same as a
 			kernel built without CONFIG_DEBUG_PAGEALLOC.
@@ -1004,8 +1005,8 @@
 			useful to also enable the page_owner functionality.
 			on: enable the feature
 
-	debugfs=    	[KNL] This parameter enables what is exposed to userspace
-			and debugfs internal clients.
+	debugfs=    	[KNL,EARLY] This parameter enables what is exposed to
+			userspace and debugfs internal clients.
 			Format: { on, no-mount, off }
 			on: 	All functions are enabled.
 			no-mount:
@@ -1084,7 +1085,7 @@
 	dhash_entries=	[KNL]
 			Set number of hash buckets for dentry cache.
 
-	disable_1tb_segments [PPC]
+	disable_1tb_segments [PPC,EARLY]
 			Disables the use of 1TB hash page table segments. This
 			causes the kernel to fall back to 256MB segments which
 			can be useful when debugging issues that require an SLB
@@ -1093,7 +1094,7 @@
 	disable=	[IPV6]
 			See Documentation/networking/ipv6.rst.
 
-	disable_radix	[PPC]
+	disable_radix	[PPC,EARLY]
 			Disable RADIX MMU mode on POWER9
 
 	disable_tlbie	[PPC]
@@ -1109,25 +1110,25 @@
 			causing system reset or hang due to sending
 			INIT from AP to BSP.
 
-	disable_ddw	[PPC/PSERIES]
+	disable_ddw	[PPC/PSERIES,EARLY]
 			Disable Dynamic DMA Window support. Use this
 			to workaround buggy firmware.
 
 	disable_ipv6=	[IPV6]
 			See Documentation/networking/ipv6.rst.
 
-	disable_mtrr_cleanup [X86]
+	disable_mtrr_cleanup [X86,EARLY]
 			The kernel tries to adjust MTRR layout from continuous
 			to discrete, to make X server driver able to add WB
 			entry later. This parameter disables that.
 
-	disable_mtrr_trim [X86, Intel and AMD only]
+	disable_mtrr_trim [X86, Intel and AMD only,EARLY]
 			By default the kernel will trim any uncacheable
 			memory out of your available memory pool based on
 			MTRR settings.  This parameter disables that behavior,
 			possibly causing your machine to run very slowly.
 
-	disable_timer_pin_1 [X86]
+	disable_timer_pin_1 [X86,EARLY]
 			Disable PIN 1 of APIC timer
 			Can be useful to work around chipset bugs.
 
@@ -1177,7 +1178,7 @@
 
 	dscc4.setup=	[NET]
 
-	dt_cpu_ftrs=	[PPC]
+	dt_cpu_ftrs=	[PPC,EARLY]
 			Format: {"off" | "known"}
 			Control how the dt_cpu_ftrs device-tree binding is
 			used for CPU feature discovery and setup (if it
@@ -1197,12 +1198,12 @@
 			Documentation/admin-guide/dynamic-debug-howto.rst
 			for details.
 
-	early_ioremap_debug [KNL]
+	early_ioremap_debug [KNL,EARLY]
 			Enable debug messages in early_ioremap support. This
 			is useful for tracking down temporary early mappings
 			which are not unmapped.
 
-	earlycon=	[KNL] Output early console device and options.
+	earlycon=	[KNL,EARLY] Output early console device and options.
 
 			When used with no options, the early console is
 			determined by stdout-path property in device tree's
@@ -1338,7 +1339,7 @@
 			address must be provided, and the serial port must
 			already be setup and configured.
 
-	earlyprintk=	[X86,SH,ARM,M68k,S390]
+	earlyprintk=	[X86,SH,ARM,M68k,S390,UM,EARLY]
 			earlyprintk=vga
 			earlyprintk=sclp
 			earlyprintk=xen
@@ -1396,7 +1397,7 @@
 	edd=		[EDD]
 			Format: {"off" | "on" | "skip[mbr]"}
 
-	efi=		[EFI]
+	efi=		[EFI,EARLY]
 			Format: { "debug", "disable_early_pci_dma",
 				  "nochunk", "noruntime", "nosoftreserve",
 				  "novamap", "no_disable_early_pci_dma" }
@@ -1417,13 +1418,13 @@
 			no_disable_early_pci_dma: Leave the busmaster bit set
 			on all PCI bridges while in the EFI boot stub
 
-	efi_no_storage_paranoia [EFI; X86]
+	efi_no_storage_paranoia [EFI,X86,EARLY]
 			Using this parameter you can use more than 50% of
 			your efi variable storage. Use this parameter only if
 			you are really sure that your UEFI does sane gc and
 			fulfills the spec otherwise your board may brick.
 
-	efi_fake_mem=	nn[KMG]@ss[KMG]:aa[,nn[KMG]@ss[KMG]:aa,..] [EFI; X86]
+	efi_fake_mem=	nn[KMG]@ss[KMG]:aa[,nn[KMG]@ss[KMG]:aa,..] [EFI,X86,EARLY]
 			Add arbitrary attribute to specific memory range by
 			updating original EFI memory map.
 			Region of memory which aa attribute is added to is
@@ -1454,7 +1455,7 @@
 	eisa_irq_edge=	[PARISC,HW]
 			See header of drivers/parisc/eisa.c.
 
-	ekgdboc=	[X86,KGDB] Allow early kernel console debugging
+	ekgdboc=	[X86,KGDB,EARLY] Allow early kernel console debugging
 			Format: ekgdboc=kbd
 
 			This is designed to be used in conjunction with
@@ -1469,13 +1470,13 @@
 			See comment before function elanfreq_setup() in
 			arch/x86/kernel/cpu/cpufreq/elanfreq.c.
 
-	elfcorehdr=[size[KMG]@]offset[KMG] [PPC,SH,X86,S390]
+	elfcorehdr=[size[KMG]@]offset[KMG] [PPC,SH,X86,S390,EARLY]
 			Specifies physical address of start of kernel core
 			image elf header and optionally the size. Generally
 			kexec loader will pass this option to capture kernel.
 			See Documentation/admin-guide/kdump/kdump.rst for details.
 
-	enable_mtrr_cleanup [X86]
+	enable_mtrr_cleanup [X86,EARLY]
 			The kernel tries to adjust MTRR layout from continuous
 			to discrete, to make X server driver able to add WB
 			entry later. This parameter enables that.
@@ -1508,7 +1509,7 @@
 			Permit 'security.evm' to be updated regardless of
 			current integrity status.
 
-	early_page_ext [KNL] Enforces page_ext initialization to earlier
+	early_page_ext [KNL,EARLY] Enforces page_ext initialization to earlier
 			stages so cover more early boot allocations.
 			Please note that as side effect some optimizations
 			might be disabled to achieve that (e.g. parallelized
@@ -1600,7 +1601,7 @@
 			can be changed at run time by the max_graph_depth file
 			in the tracefs tracing directory. default: 0 (no limit)
 
-	fw_devlink=	[KNL] Create device links between consumer and supplier
+	fw_devlink=	[KNL,EARLY] Create device links between consumer and supplier
 			devices by scanning the firmware to infer the
 			consumer/supplier relationships. This feature is
 			especially useful when drivers are loaded as modules as
@@ -1619,12 +1620,12 @@
 			rpm --	Like "on", but also use to order runtime PM.
 
 	fw_devlink.strict=<bool>
-			[KNL] Treat all inferred dependencies as mandatory
+			[KNL,EARLY] Treat all inferred dependencies as mandatory
 			dependencies. This only applies for fw_devlink=on|rpm.
 			Format: <bool>
 
 	fw_devlink.sync_state =
-			[KNL] When all devices that could probe have finished
+			[KNL,EARLY] When all devices that could probe have finished
 			probing, this parameter controls what to do with
 			devices that haven't yet received their sync_state()
 			calls.
@@ -1645,12 +1646,12 @@
 
 	gamma=		[HW,DRM]
 
-	gart_fix_e820=	[X86-64] disable the fix e820 for K8 GART
+	gart_fix_e820=	[X86-64,EARLY] disable the fix e820 for K8 GART
 			Format: off | on
 			default: on
 
 	gather_data_sampling=
-			[X86,INTEL] Control the Gather Data Sampling (GDS)
+			[X86,INTEL,EARLY] Control the Gather Data Sampling (GDS)
 			mitigation.
 
 			Gather Data Sampling is a hardware vulnerability which
@@ -1748,7 +1749,7 @@
 				(that will set all pages holding image data
 				during restoration read-only).
 
-	highmem=nn[KMG]	[KNL,BOOT] forces the highmem zone to have an exact
+	highmem=nn[KMG]	[KNL,BOOT,EARLY] forces the highmem zone to have an exact
 			size of <nn>. This works even on boxes that have no
 			highmem otherwise. This also works to reduce highmem
 			size on bigger boxes.
@@ -1759,7 +1760,7 @@
 
 	hlt		[BUGS=ARM,SH]
 
-	hostname=	[KNL] Set the hostname (aka UTS nodename).
+	hostname=	[KNL,EARLY] Set the hostname (aka UTS nodename).
 			Format: <string>
 			This allows setting the system's hostname during early
 			startup. This sets the name returned by gethostname.
@@ -1804,7 +1805,7 @@
 			Documentation/admin-guide/mm/hugetlbpage.rst.
 			Format: size[KMG]
 
-	hugetlb_cma=	[HW,CMA] The size of a CMA area used for allocation
+	hugetlb_cma=	[HW,CMA,EARLY] The size of a CMA area used for allocation
 			of gigantic hugepages. Or using node format, the size
 			of a CMA area per node can be specified.
 			Format: nn[KMGTPE] or (node format)
@@ -1850,9 +1851,10 @@
 				If specified, z/VM IUCV HVC accepts connections
 				from listed z/VM user IDs only.
 
-	hv_nopvspin	[X86,HYPER_V] Disables the paravirt spinlock optimizations
-				      which allow the hypervisor to 'idle' the
-				      guest on lock contention.
+	hv_nopvspin	[X86,HYPER_V,EARLY]
+			Disables the paravirt spinlock optimizations
+			which allow the hypervisor to 'idle' the guest
+			on lock contention.
 
 	i2c_bus=	[HW]	Override the default board specific I2C bus speed
 				or register an additional I2C bus that is not
@@ -1917,7 +1919,7 @@
 			Format: <io>[,<membase>[,<icn_id>[,<icn_id2>]]]
 
 
-	idle=		[X86]
+	idle=		[X86,EARLY]
 			Format: idle=poll, idle=halt, idle=nomwait
 			Poll forces a polling idle loop that can slightly
 			improve the performance of waking up a idle CPU, but
@@ -1973,7 +1975,7 @@
 			mode generally follows that for the NaN encoding,
 			except where unsupported by hardware.
 
-	ignore_loglevel	[KNL]
+	ignore_loglevel	[KNL,EARLY]
 			Ignore loglevel setting - this will print /all/
 			kernel messages to the console. Useful for debugging.
 			We also add it as printk module parameter, so users
@@ -2091,21 +2093,21 @@
 			unpacking being completed before device_ and
 			late_ initcalls.
 
-	initrd=		[BOOT] Specify the location of the initial ramdisk
+	initrd=		[BOOT,EARLY] Specify the location of the initial ramdisk
 
-	initrdmem=	[KNL] Specify a physical address and size from which to
+	initrdmem=	[KNL,EARLY] Specify a physical address and size from which to
 			load the initrd. If an initrd is compiled in or
 			specified in the bootparams, it takes priority over this
 			setting.
 			Format: ss[KMG],nn[KMG]
 			Default is 0, 0
 
-	init_on_alloc=	[MM] Fill newly allocated pages and heap objects with
+	init_on_alloc=	[MM,EARLY] Fill newly allocated pages and heap objects with
 			zeroes.
 			Format: 0 | 1
 			Default set by CONFIG_INIT_ON_ALLOC_DEFAULT_ON.
 
-	init_on_free=	[MM] Fill freed pages and heap objects with zeroes.
+	init_on_free=	[MM,EARLY] Fill freed pages and heap objects with zeroes.
 			Format: 0 | 1
 			Default set by CONFIG_INIT_ON_FREE_DEFAULT_ON.
 
@@ -2161,7 +2163,7 @@
 			0	disables intel_idle and fall back on acpi_idle.
 			1 to 9	specify maximum depth of C-state.
 
-	intel_pstate=	[X86]
+	intel_pstate=	[X86,EARLY]
 			disable
 			  Do not enable intel_pstate as the default
 			  scaling driver for the supported processors
@@ -2205,7 +2207,7 @@
 			  Allow per-logical-CPU P-State performance control limits using
 			  cpufreq sysfs interface
 
-	intremap=	[X86-64, Intel-IOMMU]
+	intremap=	[X86-64,Intel-IOMMU,EARLY]
 			on	enable Interrupt Remapping (default)
 			off	disable Interrupt Remapping
 			nosid	disable Source ID checking
@@ -2217,7 +2219,7 @@
 		strict	regions from userspace.
 		relaxed
 
-	iommu=		[X86]
+	iommu=		[X86,EARLY]
 		off
 		force
 		noforce
@@ -2232,7 +2234,7 @@
 		nobypass	[PPC/POWERNV]
 			Disable IOMMU bypass, using IOMMU for PCI devices.
 
-	iommu.forcedac=	[ARM64, X86] Control IOVA allocation for PCI devices.
+	iommu.forcedac=	[ARM64,X86,EARLY] Control IOVA allocation for PCI devices.
 			Format: { "0" | "1" }
 			0 - Try to allocate a 32-bit DMA address first, before
 			  falling back to the full range if needed.
@@ -2240,7 +2242,7 @@
 			  forcing Dual Address Cycle for PCI cards supporting
 			  greater than 32-bit addressing.
 
-	iommu.strict=	[ARM64, X86, S390] Configure TLB invalidation behaviour
+	iommu.strict=	[ARM64,X86,S390,EARLY] Configure TLB invalidation behaviour
 			Format: { "0" | "1" }
 			0 - Lazy mode.
 			  Request that DMA unmap operations use deferred
@@ -2256,7 +2258,7 @@
 			legacy driver-specific options takes precedence.
 
 	iommu.passthrough=
-			[ARM64, X86] Configure DMA to bypass the IOMMU by default.
+			[ARM64,X86,EARLY] Configure DMA to bypass the IOMMU by default.
 			Format: { "0" | "1" }
 			0 - Use IOMMU translation for DMA.
 			1 - Bypass the IOMMU for DMA.
@@ -2266,7 +2268,7 @@
 			See comment before marvel_specify_io7 in
 			arch/alpha/kernel/core_marvel.c.
 
-	io_delay=	[X86] I/O delay method
+	io_delay=	[X86,EARLY] I/O delay method
 		0x80
 			Standard port 0x80 based delay
 		0xed
@@ -2279,28 +2281,28 @@
 	ip=		[IP_PNP]
 			See Documentation/admin-guide/nfs/nfsroot.rst.
 
-	ipcmni_extend	[KNL] Extend the maximum number of unique System V
+	ipcmni_extend	[KNL,EARLY] Extend the maximum number of unique System V
 			IPC identifiers from 32,768 to 16,777,216.
 
 	irqaffinity=	[SMP] Set the default irq affinity mask
 			The argument is a cpu list, as described above.
 
 	irqchip.gicv2_force_probe=
-			[ARM, ARM64]
+			[ARM,ARM64,EARLY]
 			Format: <bool>
 			Force the kernel to look for the second 4kB page
 			of a GICv2 controller even if the memory range
 			exposed by the device tree is too small.
 
 	irqchip.gicv3_nolpi=
-			[ARM, ARM64]
+			[ARM,ARM64,EARLY]
 			Force the kernel to ignore the availability of
 			LPIs (and by consequence ITSs). Intended for system
 			that use the kernel as a bootloader, and thus want
 			to let secondary kernels in charge of setting up
 			LPIs.
 
-	irqchip.gicv3_pseudo_nmi= [ARM64]
+	irqchip.gicv3_pseudo_nmi= [ARM64,EARLY]
 			Enables support for pseudo-NMIs in the kernel. This
 			requires the kernel to be built with
 			CONFIG_ARM64_PSEUDO_NMI.
@@ -2445,7 +2447,7 @@
 			parameter KASAN will print report only for the first
 			invalid access.
 
-	keep_bootcon	[KNL]
+	keep_bootcon	[KNL,EARLY]
 			Do not unregister boot console at start. This is only
 			useful for debugging when something happens in the window
 			between unregistering the boot console and initializing
@@ -2453,7 +2455,7 @@
 
 	keepinitrd	[HW,ARM] See retain_initrd.
 
-	kernelcore=	[KNL,X86,IA-64,PPC]
+	kernelcore=	[KNL,X86,IA-64,PPC,EARLY]
 			Format: nn[KMGTPE] | nn% | "mirror"
 			This parameter specifies the amount of memory usable by
 			the kernel for non-movable allocations.  The requested
@@ -2478,7 +2480,7 @@
 			for Movable pages.  "nn[KMGTPE]", "nn%", and "mirror"
 			are exclusive, so you cannot specify multiple forms.
 
-	kgdbdbgp=	[KGDB,HW] kgdb over EHCI usb debug port.
+	kgdbdbgp=	[KGDB,HW,EARLY] kgdb over EHCI usb debug port.
 			Format: <Controller#>[,poll interval]
 			The controller # is the number of the ehci usb debug
 			port as it is probed via PCI.  The poll interval is
@@ -2499,7 +2501,7 @@
 			 kms, kbd format: kms,kbd
 			 kms, kbd and serial format: kms,kbd,<ser_dev>[,baud]
 
-	kgdboc_earlycon=	[KGDB,HW]
+	kgdboc_earlycon=	[KGDB,HW,EARLY]
 			If the boot console provides the ability to read
 			characters and can work in polling mode, you can use
 			this parameter to tell kgdb to use it as a backend
@@ -2514,14 +2516,14 @@
 			blank and the first boot console that implements
 			read() will be picked.
 
-	kgdbwait	[KGDB] Stop kernel execution and enter the
+	kgdbwait	[KGDB,EARLY] Stop kernel execution and enter the
 			kernel debugger at the earliest opportunity.
 
 	kmac=		[MIPS] Korina ethernet MAC address.
 			Configure the RouterBoard 532 series on-chip
 			Ethernet adapter MAC address.
 
-	kmemleak=	[KNL] Boot-time kmemleak enable/disable
+	kmemleak=	[KNL,EARLY] Boot-time kmemleak enable/disable
 			Valid arguments: on, off
 			Default: on
 			Built with CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF=y,
@@ -2540,8 +2542,8 @@
 			See also Documentation/trace/kprobetrace.rst "Kernel
 			Boot Parameter" section.
 
-	kpti=		[ARM64] Control page table isolation of user
-			and kernel address spaces.
+	kpti=		[ARM64,EARLY] Control page table isolation of
+			user and kernel address spaces.
 			Default: enabled on cores which need mitigation.
 			0: force disabled
 			1: force enabled
@@ -2618,7 +2620,8 @@
 			for NPT.
 
 	kvm-arm.mode=
-			[KVM,ARM] Select one of KVM/arm64's modes of operation.
+			[KVM,ARM,EARLY] Select one of KVM/arm64's modes of
+			operation.
 
 			none: Forcefully disable KVM.
 
@@ -2638,22 +2641,22 @@
 			used with extreme caution.
 
 	kvm-arm.vgic_v3_group0_trap=
-			[KVM,ARM] Trap guest accesses to GICv3 group-0
+			[KVM,ARM,EARLY] Trap guest accesses to GICv3 group-0
 			system registers
 
 	kvm-arm.vgic_v3_group1_trap=
-			[KVM,ARM] Trap guest accesses to GICv3 group-1
+			[KVM,ARM,EARLY] Trap guest accesses to GICv3 group-1
 			system registers
 
 	kvm-arm.vgic_v3_common_trap=
-			[KVM,ARM] Trap guest accesses to GICv3 common
+			[KVM,ARM,EARLY] Trap guest accesses to GICv3 common
 			system registers
 
 	kvm-arm.vgic_v4_enable=
-			[KVM,ARM] Allow use of GICv4 for direct injection of
-			LPIs.
+			[KVM,ARM,EARLY] Allow use of GICv4 for direct
+			injection of LPIs.
 
-	kvm_cma_resv_ratio=n [PPC]
+	kvm_cma_resv_ratio=n [PPC,EARLY]
 			Reserves given percentage from system memory area for
 			contiguous memory allocation for KVM hash pagetable
 			allocation.
@@ -2706,7 +2709,7 @@
 			(enabled). Disable by KVM if hardware lacks support
 			for it.
 
-	l1d_flush=	[X86,INTEL]
+	l1d_flush=	[X86,INTEL,EARLY]
 			Control mitigation for L1D based snooping vulnerability.
 
 			Certain CPUs are vulnerable to an exploit against CPU
@@ -2723,7 +2726,7 @@
 
 			on         - enable the interface for the mitigation
 
-	l1tf=           [X86] Control mitigation of the L1TF vulnerability on
+	l1tf=           [X86,EARLY] Control mitigation of the L1TF vulnerability on
 			      affected CPUs
 
 			The kernel PTE inversion protection is unconditionally
@@ -2792,7 +2795,7 @@
 
 	l3cr=		[PPC]
 
-	lapic		[X86-32,APIC] Enable the local APIC even if BIOS
+	lapic		[X86-32,APIC,EARLY] Enable the local APIC even if BIOS
 			disabled it.
 
 	lapic=		[X86,APIC] Do not use TSC deadline
@@ -2800,7 +2803,7 @@
 			back to the programmable timer unit in the LAPIC.
 			Format: notscdeadline
 
-	lapic_timer_c2_ok	[X86,APIC] trust the local apic timer
+	lapic_timer_c2_ok	[X86,APIC,EARLY] trust the local apic timer
 			in C2 power state.
 
 	libata.dma=	[LIBATA] DMA control
@@ -2924,7 +2927,7 @@
 	lockd.nlm_udpport=M	[NFS] Assign UDP port.
 			Format: <integer>
 
-	lockdown=	[SECURITY]
+	lockdown=	[SECURITY,EARLY]
 			{ integrity | confidentiality }
 			Enable the kernel lockdown feature. If set to
 			integrity, kernel features that allow userland to
@@ -3031,7 +3034,8 @@
 	logibm.irq=	[HW,MOUSE] Logitech Bus Mouse Driver
 			Format: <irq>
 
-	loglevel=	All Kernel Messages with a loglevel smaller than the
+	loglevel=	[KNL,EARLY]
+			All Kernel Messages with a loglevel smaller than the
 			console loglevel will be printed to the console. It can
 			also be changed with klogd or other programs. The
 			loglevels are defined as follows:
@@ -3045,13 +3049,15 @@
 			6 (KERN_INFO)		informational
 			7 (KERN_DEBUG)		debug-level messages
 
-	log_buf_len=n[KMG]	Sets the size of the printk ring buffer,
-			in bytes.  n must be a power of two and greater
-			than the minimal size. The minimal size is defined
-			by LOG_BUF_SHIFT kernel config parameter. There is
-			also CONFIG_LOG_CPU_MAX_BUF_SHIFT config parameter
-			that allows to increase the default size depending on
-			the number of CPUs. See init/Kconfig for more details.
+	log_buf_len=n[KMG] [KNL,EARLY]
+			Sets the size of the printk ring buffer, in bytes.
+			n must be a power of two and greater than the
+			minimal size. The minimal size is defined by
+			LOG_BUF_SHIFT kernel config parameter. There
+			is also CONFIG_LOG_CPU_MAX_BUF_SHIFT config
+			parameter that allows to increase the default size
+			depending on the number of CPUs. See init/Kconfig
+			for more details.
 
 	logo.nologo	[FB] Disables display of the built-in Linux logo.
 			This may be used to provide more screen space for
@@ -3109,7 +3115,7 @@
 	max_addr=nn[KMG]	[KNL,BOOT,IA-64] All physical memory greater
 			than or equal to this physical address is ignored.
 
-	maxcpus=	[SMP] Maximum number of processors that	an SMP kernel
+	maxcpus=	[SMP,EARLY] Maximum number of processors that an SMP kernel
 			will bring up during bootup.  maxcpus=n : n >= 0 limits
 			the kernel to bring up 'n' processors. Surely after
 			bootup you can bring up the other plugged cpu by executing
@@ -3136,7 +3142,7 @@
 			Format: <first>,<last>
 			Specifies range of consoles to be captured by the MDA.
 
-	mds=		[X86,INTEL]
+	mds=		[X86,INTEL,EARLY]
 			Control mitigation for the Micro-architectural Data
 			Sampling (MDS) vulnerability.
 
@@ -3168,11 +3174,12 @@
 
 			For details see: Documentation/admin-guide/hw-vuln/mds.rst
 
-	mem=nn[KMG]	[HEXAGON] Set the memory size.
+	mem=nn[KMG]	[HEXAGON,EARLY] Set the memory size.
 			Must be specified, otherwise memory size will be 0.
 
-	mem=nn[KMG]	[KNL,BOOT] Force usage of a specific amount of memory
-			Amount of memory to be used in cases as follows:
+	mem=nn[KMG]	[KNL,BOOT,EARLY] Force usage of a specific amount
+			of memory Amount of memory to be used in cases
+			as follows:
 
 			1 for test;
 			2 when the kernel is not able to see the whole system memory;
@@ -3196,8 +3203,8 @@
 			if system memory of hypervisor is not sufficient.
 
 	mem=nn[KMG]@ss[KMG]
-			[ARM,MIPS] - override the memory layout reported by
-			firmware.
+			[ARM,MIPS,EARLY] - override the memory layout
+			reported by firmware.
 			Define a memory region of size nn[KMG] starting at
 			ss[KMG].
 			Multiple different regions can be specified with
@@ -3206,7 +3213,7 @@
 	mem=nopentium	[BUGS=X86-32] Disable usage of 4MB pages for kernel
 			memory.
 
-	memblock=debug	[KNL] Enable memblock debug messages.
+	memblock=debug	[KNL,EARLY] Enable memblock debug messages.
 
 	memchunk=nn[KMG]
 			[KNL,SH] Allow user to override the default size for
@@ -3220,14 +3227,14 @@
 			option.
 			See Documentation/admin-guide/mm/memory-hotplug.rst.
 
-	memmap=exactmap	[KNL,X86] Enable setting of an exact
+	memmap=exactmap	[KNL,X86,EARLY] Enable setting of an exact
 			E820 memory map, as specified by the user.
 			Such memmap=exactmap lines can be constructed based on
 			BIOS output or other requirements. See the memmap=nn@ss
 			option description.
 
 	memmap=nn[KMG]@ss[KMG]
-			[KNL, X86, MIPS, XTENSA] Force usage of a specific region of memory.
+			[KNL, X86,MIPS,XTENSA,EARLY] Force usage of a specific region of memory.
 			Region of memory to be used is from ss to ss+nn.
 			If @ss[KMG] is omitted, it is equivalent to mem=nn[KMG],
 			which limits max address to nn[KMG].
@@ -3237,11 +3244,11 @@
 				memmap=100M@2G,100M#3G,1G!1024G
 
 	memmap=nn[KMG]#ss[KMG]
-			[KNL,ACPI] Mark specific memory as ACPI data.
+			[KNL,ACPI,EARLY] Mark specific memory as ACPI data.
 			Region of memory to be marked is from ss to ss+nn.
 
 	memmap=nn[KMG]$ss[KMG]
-			[KNL,ACPI] Mark specific memory as reserved.
+			[KNL,ACPI,EARLY] Mark specific memory as reserved.
 			Region of memory to be reserved is from ss to ss+nn.
 			Example: Exclude memory from 0x18690000-0x1869ffff
 			         memmap=64K$0x18690000
@@ -3251,14 +3258,14 @@
 			like Grub2, otherwise '$' and the following number
 			will be eaten.
 
-	memmap=nn[KMG]!ss[KMG]
+	memmap=nn[KMG]!ss[KMG,EARLY]
 			[KNL,X86] Mark specific memory as protected.
 			Region of memory to be used, from ss to ss+nn.
 			The memory region may be marked as e820 type 12 (0xc)
 			and is NVDIMM or ADR memory.
 
 	memmap=<size>%<offset>-<oldtype>+<newtype>
-			[KNL,ACPI] Convert memory within the specified region
+			[KNL,ACPI,EARLY] Convert memory within the specified region
 			from <oldtype> to <newtype>. If "-<oldtype>" is left
 			out, the whole region will be marked as <newtype>,
 			even if previously unavailable. If "+<newtype>" is left
@@ -3266,7 +3273,7 @@
 			specified as e820 types, e.g., 1 = RAM, 2 = reserved,
 			3 = ACPI, 12 = PRAM.
 
-	memory_corruption_check=0/1 [X86]
+	memory_corruption_check=0/1 [X86,EARLY]
 			Some BIOSes seem to corrupt the first 64k of
 			memory when doing things like suspend/resume.
 			Setting this option will scan the memory
@@ -3278,13 +3285,13 @@
 			affects the same memory, you can use memmap=
 			to prevent the kernel from using that memory.
 
-	memory_corruption_check_size=size [X86]
+	memory_corruption_check_size=size [X86,EARLY]
 			By default it checks for corruption in the low
 			64k, making this memory unavailable for normal
 			use.  Use this parameter to scan for
 			corruption in more or less memory.
 
-	memory_corruption_check_period=seconds [X86]
+	memory_corruption_check_period=seconds [X86,EARLY]
 			By default it checks for corruption every 60
 			seconds.  Use this parameter to check at some
 			other rate.  0 disables periodic checking.
@@ -3308,7 +3315,7 @@
 			Note that even when enabled, there are a few cases where
 			the feature is not effective.
 
-	memtest=	[KNL,X86,ARM,M68K,PPC,RISCV] Enable memtest
+	memtest=	[KNL,X86,ARM,M68K,PPC,RISCV,EARLY] Enable memtest
 			Format: <integer>
 			default : 0 <disable>
 			Specifies the number of memtest passes to be
@@ -3376,7 +3383,7 @@
 			https://repo.or.cz/w/linux-2.6/mini2440.git
 
 	mitigations=
-			[X86,PPC,S390,ARM64] Control optional mitigations for
+			[X86,PPC,S390,ARM64,EARLY] Control optional mitigations for
 			CPU vulnerabilities.  This is a set of curated,
 			arch-independent options, each of which is an
 			aggregation of existing arch-specific options.
@@ -3429,7 +3436,7 @@
 					       retbleed=auto,nosmt [X86]
 
 	mminit_loglevel=
-			[KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this
+			[KNL,EARLY] When CONFIG_DEBUG_MEMORY_INIT is set, this
 			parameter allows control of the logging verbosity for
 			the additional memory initialisation checks. A value
 			of 0 disables mminit logging and a level of 4 will
@@ -3437,7 +3444,7 @@
 			so loglevel=8 may also need to be specified.
 
 	mmio_stale_data=
-			[X86,INTEL] Control mitigation for the Processor
+			[X86,INTEL,EARLY] Control mitigation for the Processor
 			MMIO Stale Data vulnerabilities.
 
 			Processor MMIO Stale Data is a class of
@@ -3512,7 +3519,7 @@
 	mousedev.yres=	[MOUSE] Vertical screen resolution, used for devices
 			reporting absolute coordinates, such as tablets
 
-	movablecore=	[KNL,X86,IA-64,PPC]
+	movablecore=	[KNL,X86,IA-64,PPC,EARLY]
 			Format: nn[KMGTPE] | nn%
 			This parameter is the complement to kernelcore=, it
 			specifies the amount of memory used for migratable
@@ -3523,7 +3530,7 @@
 			that the amount of memory usable for all allocations
 			is not too small.
 
-	movable_node	[KNL] Boot-time switch to make hotplugable memory
+	movable_node	[KNL,EARLY] Boot-time switch to make hotplugable memory
 			NUMA nodes to be movable. This means that the memory
 			of such nodes will be usable only for movable
 			allocations which rules out almost all kernel
@@ -3547,21 +3554,21 @@
 			[HW] Make the MicroTouch USB driver use raw coordinates
 			('y', default) or cooked coordinates ('n')
 
-	mtrr=debug	[X86]
+	mtrr=debug	[X86,EARLY]
 			Enable printing debug information related to MTRR
 			registers at boot time.
 
-	mtrr_chunk_size=nn[KMG] [X86]
+	mtrr_chunk_size=nn[KMG,X86,EARLY]
 			used for mtrr cleanup. It is largest continuous chunk
 			that could hold holes aka. UC entries.
 
-	mtrr_gran_size=nn[KMG] [X86]
+	mtrr_gran_size=nn[KMG,X86,EARLY]
 			Used for mtrr cleanup. It is granularity of mtrr block.
 			Default is 1.
 			Large value could prevent small alignment from
 			using up MTRRs.
 
-	mtrr_spare_reg_nr=n [X86]
+	mtrr_spare_reg_nr=n [X86,EARLY]
 			Format: <integer>
 			Range: 0,7 : spare reg number
 			Default : 1
@@ -3747,10 +3754,10 @@
 			emulation library even if a 387 maths coprocessor
 			is present.
 
-	no4lvl		[RISCV] Disable 4-level and 5-level paging modes. Forces
-			kernel to use 3-level paging instead.
+	no4lvl		[RISCV,EARLY] Disable 4-level and 5-level paging modes.
+			Forces kernel to use 3-level paging instead.
 
-	no5lvl		[X86-64,RISCV] Disable 5-level paging mode. Forces
+	no5lvl		[X86-64,RISCV,EARLY] Disable 5-level paging mode. Forces
 			kernel to use 4-level paging instead.
 
 	noaliencache	[MM, NUMA, SLAB] Disables the allocation of alien
@@ -3759,15 +3766,15 @@
 
 	noalign		[KNL,ARM]
 
-	noaltinstr	[S390] Disables alternative instructions patching
-			(CPU alternatives feature).
+	noaltinstr	[S390,EARLY] Disables alternative instructions
+			patching (CPU alternatives feature).
 
-	noapic		[SMP,APIC] Tells the kernel to not make use of any
+	noapic		[SMP,APIC,EARLY] Tells the kernel to not make use of any
 			IOAPICs that may be present in the system.
 
 	noautogroup	Disable scheduler automatic task group creation.
 
-	nocache		[ARM]
+	nocache		[ARM,EARLY]
 
 	no_console_suspend
 			[HW] Never suspend the console
@@ -3785,13 +3792,13 @@
 			turn on/off it dynamically.
 
 	no_debug_objects
-			[KNL] Disable object debugging
+			[KNL,EARLY] Disable object debugging
 
 	nodsp		[SH] Disable hardware DSP at boot time.
 
-	noefi		Disable EFI runtime services support.
+	noefi		[EFI,EARLY] Disable EFI runtime services support.
 
-	no_entry_flush  [PPC] Don't flush the L1-D cache when entering the kernel.
+	no_entry_flush  [PPC,EARLY] Don't flush the L1-D cache when entering the kernel.
 
 	noexec		[IA-64]
 
@@ -3822,6 +3829,7 @@
 			real-time systems.
 
 	no_hash_pointers
+			[KNL,EARLY]
 			Force pointers printed to the console or buffers to be
 			unhashed.  By default, when a pointer is printed via %p
 			format string, that pointer is "hashed", i.e. obscured
@@ -3846,9 +3854,9 @@
 			the impact of the sleep instructions. This is also
 			useful when using JTAG debugger.
 
-	nohugeiomap	[KNL,X86,PPC,ARM64] Disable kernel huge I/O mappings.
+	nohugeiomap	[KNL,X86,PPC,ARM64,EARLY] Disable kernel huge I/O mappings.
 
-	nohugevmalloc	[KNL,X86,PPC,ARM64] Disable kernel huge vmalloc mappings.
+	nohugevmalloc	[KNL,X86,PPC,ARM64,EARLY] Disable kernel huge vmalloc mappings.
 
 	nohz=		[KNL] Boottime enable/disable dynamic ticks
 			Valid arguments: on, off
@@ -3870,13 +3878,13 @@
 	noinitrd	[RAM] Tells the kernel not to load any configured
 			initial RAM disk.
 
-	nointremap	[X86-64, Intel-IOMMU] Do not enable interrupt
+	nointremap	[X86-64,Intel-IOMMU,EARLY] Do not enable interrupt
 			remapping.
 			[Deprecated - use intremap=off]
 
 	nointroute	[IA-64]
 
-	noinvpcid	[X86] Disable the INVPCID cpu feature.
+	noinvpcid	[X86,EARLY] Disable the INVPCID cpu feature.
 
 	noiotrap	[SH] Disables trapped I/O port accesses.
 
@@ -3887,19 +3895,19 @@
 
 	nojitter	[IA-64] Disables jitter checking for ITC timers.
 
-	nokaslr		[KNL]
+	nokaslr		[KNL,EARLY]
 			When CONFIG_RANDOMIZE_BASE is set, this disables
 			kernel and module base offset ASLR (Address Space
 			Layout Randomization).
 
-	no-kvmapf	[X86,KVM] Disable paravirtualized asynchronous page
+	no-kvmapf	[X86,KVM,EARLY] Disable paravirtualized asynchronous page
 			fault handling.
 
-	no-kvmclock	[X86,KVM] Disable paravirtualized KVM clock driver
+	no-kvmclock	[X86,KVM,EARLY] Disable paravirtualized KVM clock driver
 
-	nolapic		[X86-32,APIC] Do not enable or use the local APIC.
+	nolapic		[X86-32,APIC,EARLY] Do not enable or use the local APIC.
 
-	nolapic_timer	[X86-32,APIC] Do not use the local APIC timer.
+	nolapic_timer	[X86-32,APIC,EARLY] Do not use the local APIC timer.
 
 	nomca		[IA-64] Disable machine check abort handling
 
@@ -3924,23 +3932,23 @@
 			shutdown the other cpus.  Instead use the REBOOT_VECTOR
 			irq.
 
-	nopat		[X86] Disable PAT (page attribute table extension of
+	nopat		[X86,EARLY] Disable PAT (page attribute table extension of
 			pagetables) support.
 
-	nopcid		[X86-64] Disable the PCID cpu feature.
+	nopcid		[X86-64,EARLY] Disable the PCID cpu feature.
 
 	nopku		[X86] Disable Memory Protection Keys CPU feature found
 			in some Intel CPUs.
 
-	nopti		[X86-64]
+	nopti		[X86-64,EARLY]
 			Equivalent to pti=off
 
-	nopv=		[X86,XEN,KVM,HYPER_V,VMWARE]
+	nopv=		[X86,XEN,KVM,HYPER_V,VMWARE,EARLY]
 			Disables the PV optimizations forcing the guest to run
 			as generic guest with no PV drivers. Currently support
 			XEN HVM, KVM, HYPER_V and VMWARE guest.
 
-	nopvspin	[X86,XEN,KVM]
+	nopvspin	[X86,XEN,KVM,EARLY]
 			Disables the qspinlock slow path using PV optimizations
 			which allow the hypervisor to 'idle' the guest on lock
 			contention.
@@ -3960,20 +3968,20 @@
 			This is required for the Braillex ib80-piezo Braille
 			reader made by F.H. Papenmeier (Germany).
 
-	nosgx		[X86-64,SGX] Disables Intel SGX kernel support.
+	nosgx		[X86-64,SGX,EARLY] Disables Intel SGX kernel support.
 
-	nosmap		[PPC]
+	nosmap		[PPC,EARLY]
 			Disable SMAP (Supervisor Mode Access Prevention)
 			even if it is supported by processor.
 
-	nosmep		[PPC64s]
+	nosmep		[PPC64s,EARLY]
 			Disable SMEP (Supervisor Mode Execution Prevention)
 			even if it is supported by processor.
 
-	nosmp		[SMP] Tells an SMP kernel to act as a UP kernel,
+	nosmp		[SMP,EARLY] Tells an SMP kernel to act as a UP kernel,
 			and disable the IO APIC.  legacy for "maxcpus=0".
 
-	nosmt		[KNL,MIPS,PPC,S390] Disable symmetric multithreading (SMT).
+	nosmt		[KNL,MIPS,PPC,S390,EARLY] Disable symmetric multithreading (SMT).
 			Equivalent to smt=1.
 
 			[KNL,X86,PPC] Disable symmetric multithreading (SMT).
@@ -3983,22 +3991,23 @@
 	nosoftlockup	[KNL] Disable the soft-lockup detector.
 
 	nospec_store_bypass_disable
-			[HW] Disable all mitigations for the Speculative Store Bypass vulnerability
+			[HW,EARLY] Disable all mitigations for the Speculative
+			Store Bypass vulnerability
 
-	nospectre_bhb	[ARM64] Disable all mitigations for Spectre-BHB (branch
+	nospectre_bhb	[ARM64,EARLY] Disable all mitigations for Spectre-BHB (branch
 			history injection) vulnerability. System may allow data leaks
 			with this option.
 
-	nospectre_v1	[X86,PPC] Disable mitigations for Spectre Variant 1
+	nospectre_v1	[X86,PPC,EARLY] Disable mitigations for Spectre Variant 1
 			(bounds check bypass). With this option data leaks are
 			possible in the system.
 
-	nospectre_v2	[X86,PPC_E500,ARM64] Disable all mitigations for
-			the Spectre variant 2 (indirect branch prediction)
-			vulnerability. System may allow data leaks with this
-			option.
+	nospectre_v2	[X86,PPC_E500,ARM64,EARLY] Disable all mitigations
+			for the Spectre variant 2 (indirect branch
+			prediction) vulnerability. System may allow data
+			leaks with this option.
 
-	no-steal-acc	[X86,PV_OPS,ARM64,PPC/PSERIES,RISCV] Disable
+	no-steal-acc	[X86,PV_OPS,ARM64,PPC/PSERIES,RISCV,EARLY] Disable
 			paravirtualized steal time accounting. steal time is
 			computed, but won't influence scheduler behaviour
 
@@ -4008,7 +4017,7 @@
 			broken timer IRQ sources.
 
 	no_uaccess_flush
-	                [PPC] Don't flush the L1-D cache after accessing user data.
+	                [PPC,EARLY] Don't flush the L1-D cache after accessing user data.
 
 	novmcoredd	[KNL,KDUMP]
 			Disable device dump. Device dump allows drivers to
@@ -4022,15 +4031,15 @@
 			is set.
 
 	no-vmw-sched-clock
-			[X86,PV_OPS] Disable paravirtualized VMware scheduler
-			clock and use the default one.
+			[X86,PV_OPS,EARLY] Disable paravirtualized VMware
+			scheduler clock and use the default one.
 
 	nowatchdog	[KNL] Disable both lockup detectors, i.e.
 			soft-lockup and NMI watchdog (hard-lockup).
 
-	nowb		[ARM]
+	nowb		[ARM,EARLY]
 
-	nox2apic	[X86-64,APIC] Do not enable x2APIC mode.
+	nox2apic	[X86-64,APIC,EARLY] Do not enable x2APIC mode.
 
 			NOTE: this parameter will be ignored on systems with the
 			LEGACY_XAPIC_DISABLED bit set in the
@@ -4068,7 +4077,7 @@
 			purges which is reported from either PAL_VM_SUMMARY or
 			SAL PALO.
 
-	nr_cpus=	[SMP] Maximum number of processors that	an SMP kernel
+	nr_cpus=	[SMP,EARLY] Maximum number of processors that an SMP kernel
 			could support.  nr_cpus=n : n >= 1 limits the kernel to
 			support 'n' processors. It could be larger than the
 			number of already plugged CPU during bootup, later in
@@ -4079,8 +4088,9 @@
 
 	nr_uarts=	[SERIAL] maximum number of UARTs to be registered.
 
-	numa=off 	[KNL, ARM64, PPC, RISCV, SPARC, X86] Disable NUMA, Only
-			set up a single NUMA node spanning all memory.
+	numa=off 	[KNL, ARM64, PPC, RISCV, SPARC, X86, EARLY]
+			Disable NUMA, Only set up a single NUMA node
+			spanning all memory.
 
 	numa_balancing=	[KNL,ARM64,PPC,RISCV,S390,X86] Enable or disable automatic
 			NUMA balancing.
@@ -4091,7 +4101,7 @@
 			This can be set from sysctl after boot.
 			See Documentation/admin-guide/sysctl/vm.rst for details.
 
-	ohci1394_dma=early	[HW] enable debugging via the ohci1394 driver.
+	ohci1394_dma=early	[HW,EARLY] enable debugging via the ohci1394 driver.
 			See Documentation/core-api/debugging-via-ohci1394.rst for more
 			info.
 
@@ -4117,7 +4127,8 @@
 				   Once locked, the boundary cannot be changed.
 				   1 indicates lock status, 0 indicates unlock status.
 
-	oops=panic	Always panic on oopses. Default is to just kill the
+	oops=panic	[KNL,EARLY]
+			Always panic on oopses. Default is to just kill the
 			process, but there is a small probability of
 			deadlocking the machine.
 			This will also cause panics on machine check exceptions.
@@ -4133,13 +4144,13 @@
 			can be read from sysfs at:
 			/sys/module/page_alloc/parameters/shuffle.
 
-	page_owner=	[KNL] Boot-time page_owner enabling option.
+	page_owner=	[KNL,EARLY] Boot-time page_owner enabling option.
 			Storage of the information about who allocated
 			each page is disabled in default. With this switch,
 			we can turn it on.
 			on: enable the feature
 
-	page_poison=	[KNL] Boot-time parameter changing the state of
+	page_poison=	[KNL,EARLY] Boot-time parameter changing the state of
 			poisoning on the buddy allocator, available with
 			CONFIG_PAGE_POISONING=y.
 			off: turn off poisoning (default)
@@ -4157,7 +4168,8 @@
 			timeout < 0: reboot immediately
 			Format: <timeout>
 
-	panic_on_taint=	Bitmask for conditionally calling panic() in add_taint()
+	panic_on_taint=	[KNL,EARLY]
+			Bitmask for conditionally calling panic() in add_taint()
 			Format: <hex>[,nousertaint]
 			Hexadecimal bitmask representing the set of TAINT flags
 			that will cause the kernel to panic when add_taint() is
@@ -4313,7 +4325,7 @@
 
 	pcbit=		[HW,ISDN]
 
-	pci=option[,option...]	[PCI] various PCI subsystem options.
+	pci=option[,option...]	[PCI,EARLY] various PCI subsystem options.
 
 				Some options herein operate on a specific device
 				or a set of devices (<pci_dev>). These are
@@ -4582,7 +4594,8 @@
 			Format: { 0 | 1 }
 			See arch/parisc/kernel/pdc_chassis.c
 
-	percpu_alloc=	Select which percpu first chunk allocator to use.
+	percpu_alloc=	[MM,EARLY]
+			Select which percpu first chunk allocator to use.
 			Currently supported values are "embed" and "page".
 			Archs may support subset or none of the	selections.
 			See comments in mm/percpu.c for details on each
@@ -4651,12 +4664,12 @@
 			execution priority.
 
 	ppc_strict_facility_enable
-			[PPC] This option catches any kernel floating point,
+			[PPC,ENABLE] This option catches any kernel floating point,
 			Altivec, VSX and SPE outside of regions specifically
 			allowed (eg kernel_enable_fpu()/kernel_disable_fpu()).
 			There is some performance impact when enabling this.
 
-	ppc_tm=		[PPC]
+	ppc_tm=		[PPC,EARLY]
 			Format: {"off"}
 			Disable Hardware Transactional Memory
 
@@ -4766,7 +4779,7 @@
 			[KNL] Number of legacy pty's. Overwrites compiled-in
 			default number.
 
-	quiet		[KNL] Disable most log messages
+	quiet		[KNL,EARLY] Disable most log messages
 
 	r128=		[HW,DRM]
 
@@ -4783,17 +4796,17 @@
 	ramdisk_start=	[RAM] RAM disk image start address
 
 	random.trust_cpu=off
-			[KNL] Disable trusting the use of the CPU's
+			[KNL,EARLY] Disable trusting the use of the CPU's
 			random number generator (if available) to
 			initialize the kernel's RNG.
 
 	random.trust_bootloader=off
-			[KNL] Disable trusting the use of the a seed
+			[KNL,EARLY] Disable trusting the use of the a seed
 			passed by the bootloader (if available) to
 			initialize the kernel's RNG.
 
 	randomize_kstack_offset=
-			[KNL] Enable or disable kernel stack offset
+			[KNL,EARLY] Enable or disable kernel stack offset
 			randomization, which provides roughly 5 bits of
 			entropy, frustrating memory corruption attacks
 			that depend on stack address determinism or
@@ -5484,7 +5497,7 @@
 			Run specified binary instead of /init from the ramdisk,
 			used for early userspace startup. See initrd.
 
-	rdrand=		[X86]
+	rdrand=		[X86,EARLY]
 			force - Override the decision by the kernel to hide the
 				advertisement of RDRAND support (this affects
 				certain AMD processors because of buggy BIOS
@@ -5580,7 +5593,7 @@
 			them.  If <base> is less than 0x10000, the region
 			is assumed to be I/O ports; otherwise it is memory.
 
-	reservetop=	[X86-32]
+	reservetop=	[X86-32,EARLY]
 			Format: nn[KMG]
 			Reserves a hole at the top of the kernel virtual
 			address space.
@@ -5665,7 +5678,7 @@
 			[KNL] Disable ring 3 MONITOR/MWAIT feature on supported
 			CPUs.
 
-	riscv_isa_fallback [RISCV]
+	riscv_isa_fallback [RISCV,EARLY]
 			When CONFIG_RISCV_ISA_FALLBACK is not enabled, permit
 			falling back to detecting extension support by parsing
 			"riscv,isa" property on devicetree systems when the
@@ -5674,13 +5687,14 @@
 
 	ro		[KNL] Mount root device read-only on boot
 
-	rodata=		[KNL]
+	rodata=		[KNL,EARLY]
 		on	Mark read-only kernel memory as read-only (default).
 		off	Leave read-only kernel memory writable for debugging.
 		full	Mark read-only kernel memory and aliases as read-only
 		        [arm64]
 
 	rockchip.usb_uart
+			[EARLY]
 			Enable the uart passthrough on the designated usb port
 			on Rockchip SoCs. When active, the signals of the
 			debug-uart get routed to the D+ and D- pins of the usb
@@ -5741,7 +5755,7 @@
 	sa1100ir	[NET]
 			See drivers/net/irda/sa1100_ir.c.
 
-	sched_verbose	[KNL] Enables verbose scheduler debug messages.
+	sched_verbose	[KNL,EARLY] Enables verbose scheduler debug messages.
 
 	schedstats=	[KNL,X86] Enable or disable scheduled statistics.
 			Allowed values are enable and disable. This feature
@@ -5856,7 +5870,7 @@
 			non-zero "wait" parameter.  See weight_single
 			and weight_many.
 
-	skew_tick=	[KNL] Offset the periodic timer tick per cpu to mitigate
+	skew_tick=	[KNL,EARLY] Offset the periodic timer tick per cpu to mitigate
 			xtime_lock contention on larger systems, and/or RCU lock
 			contention on all systems with CONFIG_MAXSMP set.
 			Format: { "0" | "1" }
@@ -5987,10 +6001,10 @@
 				1: Fast pin select (default)
 				2: ATC IRMode
 
-	smt=		[KNL,MIPS,S390] Set the maximum number of threads (logical
-			CPUs) to use per physical CPU on systems capable of
-			symmetric multithreading (SMT). Will be capped to the
-			actual hardware limit.
+	smt=		[KNL,MIPS,S390,EARLY] Set the maximum number of threads
+			(logical CPUs) to use per physical CPU on systems
+			capable of symmetric multithreading (SMT). Will
+			be capped to the actual hardware limit.
 			Format: <integer>
 			Default: -1 (no limit)
 
@@ -6012,7 +6026,7 @@
 	sonypi.*=	[HW] Sony Programmable I/O Control Device driver
 			See Documentation/admin-guide/laptops/sonypi.rst
 
-	spectre_v2=	[X86] Control mitigation of Spectre variant 2
+	spectre_v2=	[X86,EARLY] Control mitigation of Spectre variant 2
 			(indirect branch speculation) vulnerability.
 			The default operation protects the kernel from
 			user space attacks.
@@ -6092,7 +6106,7 @@
 			spectre_v2_user=auto.
 
 	spec_rstack_overflow=
-			[X86] Control RAS overflow mitigation on AMD Zen CPUs
+			[X86,EARLY] Control RAS overflow mitigation on AMD Zen CPUs
 
 			off		- Disable mitigation
 			microcode	- Enable microcode mitigation only
@@ -6103,7 +6117,7 @@
 					  (cloud-specific mitigation)
 
 	spec_store_bypass_disable=
-			[HW] Control Speculative Store Bypass (SSB) Disable mitigation
+			[HW,EARLY] Control Speculative Store Bypass (SSB) Disable mitigation
 			(Speculative Store Bypass vulnerability)
 
 			Certain CPUs are vulnerable to an exploit against a
@@ -6199,7 +6213,7 @@
 			#DB exception for bus lock is triggered only when
 			CPL > 0.
 
-	srbds=		[X86,INTEL]
+	srbds=		[X86,INTEL,EARLY]
 			Control the Special Register Buffer Data Sampling
 			(SRBDS) mitigation.
 
@@ -6286,7 +6300,7 @@
 			srcutree.convert_to_big must have the 0x10 bit
 			set for contention-based conversions to occur.
 
-	ssbd=		[ARM64,HW]
+	ssbd=		[ARM64,HW,EARLY]
 			Speculative Store Bypass Disable control
 
 			On CPUs that are vulnerable to the Speculative
@@ -6310,7 +6324,7 @@
 			growing up) the main stack are reserved for no other
 			mapping. Default value is 256 pages.
 
-	stack_depot_disable= [KNL]
+	stack_depot_disable= [KNL,EARLY]
 			Setting this to true through kernel command line will
 			disable the stack depot thereby saving the static memory
 			consumed by the stack hash table. By default this is set
@@ -6349,12 +6363,12 @@
 			be used to filter out binaries which have
 			not yet been made aware of AT_MINSIGSTKSZ.
 
-	stress_hpt	[PPC]
+	stress_hpt	[PPC,EARLY]
 			Limits the number of kernel HPT entries in the hash
 			page table to increase the rate of hash page table
 			faults on kernel addresses.
 
-	stress_slb	[PPC]
+	stress_slb	[PPC,EARLY]
 			Limits the number of kernel SLB entries, and flushes
 			them frequently to increase the rate of SLB faults
 			on kernel addresses.
@@ -6414,7 +6428,7 @@
 			This parameter controls use of the Protected
 			Execution Facility on pSeries.
 
-	swiotlb=	[ARM,IA-64,PPC,MIPS,X86]
+	swiotlb=	[ARM,IA-64,PPC,MIPS,X86,EARLY]
 			Format: { <int> [,<int>] | force | noforce }
 			<int> -- Number of I/O TLB slabs
 			<int> -- Second integer after comma. Number of swiotlb
@@ -6424,7 +6438,7 @@
 			         wouldn't be automatically used by the kernel
 			noforce -- Never use bounce buffers (for debugging)
 
-	switches=	[HW,M68k]
+	switches=	[HW,M68k,EARLY]
 
 	sysctl.*=	[KNL]
 			Set a sysctl parameter, right before loading the init
@@ -6483,11 +6497,11 @@
 			<deci-seconds>: poll all this frequency
 			0: no polling (default)
 
-	threadirqs	[KNL]
+	threadirqs	[KNL,EARLY]
 			Force threading of all interrupt handlers except those
 			marked explicitly IRQF_NO_THREAD.
 
-	topology=	[S390]
+	topology=	[S390,EARLY]
 			Format: {off | on}
 			Specify if the kernel should make use of the cpu
 			topology information if the hardware supports this.
@@ -6728,7 +6742,7 @@
 			can be overridden by a later tsc=nowatchdog.  A console
 			message will flag any such suppression or overriding.
 
-	tsc_early_khz=  [X86] Skip early TSC calibration and use the given
+	tsc_early_khz=  [X86,EARLY] Skip early TSC calibration and use the given
 			value instead. Useful when the early TSC frequency discovery
 			procedure is not reliable, such as on overclocked systems
 			with CPUID.16h support and partial CPUID.15h support.
@@ -6763,7 +6777,7 @@
 			See Documentation/admin-guide/hw-vuln/tsx_async_abort.rst
 			for more details.
 
-	tsx_async_abort= [X86,INTEL] Control mitigation for the TSX Async
+	tsx_async_abort= [X86,INTEL,EARLY] Control mitigation for the TSX Async
 			Abort (TAA) vulnerability.
 
 			Similar to Micro-architectural Data Sampling (MDS)
@@ -6829,7 +6843,7 @@
 	unknown_nmi_panic
 			[X86] Cause panic on unknown NMI.
 
-	unwind_debug	[X86-64]
+	unwind_debug	[X86-64,EARLY]
 			Enable unwinder debug output.  This can be
 			useful for debugging certain unwinder error
 			conditions, including corrupt stacks and
@@ -7019,7 +7033,7 @@
 			Example: user_debug=31
 
 	userpte=
-			[X86] Flags controlling user PTE allocations.
+			[X86,EARLY] Flags controlling user PTE allocations.
 
 				nohigh = do not allocate PTE pages in
 					HIGHMEM regardless of setting
@@ -7048,7 +7062,7 @@
 	vector=		[IA-64,SMP]
 			vector=percpu: enable percpu vector domain
 
-	video=		[FB] Frame buffer configuration
+	video=		[FB,EARLY] Frame buffer configuration
 			See Documentation/fb/modedb.rst.
 
 	video.brightness_switch_enabled= [ACPI]
@@ -7096,13 +7110,13 @@
 			  P	Enable page structure init time poisoning
 			  -	Disable all of the above options
 
-	vmalloc=nn[KMG]	[KNL,BOOT] Forces the vmalloc area to have an exact
-			size of <nn>. This can be used to increase the
-			minimum size (128MB on x86). It can also be used to
-			decrease the size and leave more room for directly
-			mapped kernel RAM.
+	vmalloc=nn[KMG]	[KNL,BOOT,EARLY] Forces the vmalloc area to have an
+			exact size of <nn>. This can be used to increase
+			the minimum size (128MB on x86). It can also be
+			used to decrease the size and leave more room
+			for directly mapped kernel RAM.
 
-	vmcp_cma=nn[MG]	[KNL,S390]
+	vmcp_cma=nn[MG]	[KNL,S390,EARLY]
 			Sets the memory size reserved for contiguous memory
 			allocations for the vmcp device driver.
 
@@ -7115,7 +7129,7 @@
 	vmpoff=		[KNL,S390] Perform z/VM CP command after power off.
 			Format: <command>
 
-	vsyscall=	[X86-64]
+	vsyscall=	[X86-64,EARLY]
 			Controls the behavior of vsyscalls (i.e. calls to
 			fixed addresses of 0xffffffffff600x00 from legacy
 			code).  Most statically-linked binaries and older
@@ -7263,13 +7277,13 @@
 			When enabled, memory and cache locality will be
 			impacted.
 
-	writecombine=	[LOONGARCH] Control the MAT (Memory Access Type) of
-			ioremap_wc().
+	writecombine=	[LOONGARCH,EARLY] Control the MAT (Memory Access
+			Type) of ioremap_wc().
 
 			on   - Enable writecombine, use WUC for ioremap_wc()
 			off  - Disable writecombine, use SUC for ioremap_wc()
 
-	x2apic_phys	[X86-64,APIC] Use x2apic physical mode instead of
+	x2apic_phys	[X86-64,APIC,EARLY] Use x2apic physical mode instead of
 			default x2apic cluster mode on platforms
 			supporting x2apic.
 
@@ -7280,7 +7294,7 @@
 			save/restore/migration must be enabled to handle larger
 			domains.
 
-	xen_emul_unplug=		[HW,X86,XEN]
+	xen_emul_unplug=		[HW,X86,XEN,EARLY]
 			Unplug Xen emulated devices
 			Format: [unplug0,][unplug1]
 			ide-disks -- unplug primary master IDE devices
@@ -7292,17 +7306,17 @@
 				the unplug protocol
 			never -- do not unplug even if version check succeeds
 
-	xen_legacy_crash	[X86,XEN]
+	xen_legacy_crash	[X86,XEN,EARLY]
 			Crash from Xen panic notifier, without executing late
 			panic() code such as dumping handler.
 
-	xen_msr_safe=	[X86,XEN]
+	xen_msr_safe=	[X86,XEN,EARLY]
 			Format: <bool>
 			Select whether to always use non-faulting (safe) MSR
 			access functions when running as Xen PV guest. The
 			default value is controlled by CONFIG_XEN_PV_MSR_SAFE.
 
-	xen_nopvspin	[X86,XEN]
+	xen_nopvspin	[X86,XEN,EARLY]
 			Disables the qspinlock slowpath using Xen PV optimizations.
 			This parameter is obsoleted by "nopvspin" parameter, which
 			has equivalent effect for XEN platform.
@@ -7314,7 +7328,7 @@
 			has equivalent effect for XEN platform.
 
 	xen_no_vector_callback
-			[KNL,X86,XEN] Disable the vector callback for Xen
+			[KNL,X86,XEN,EARLY] Disable the vector callback for Xen
 			event channel interrupts.
 
 	xen_scrub_pages=	[XEN]
@@ -7323,7 +7337,7 @@
 			with /sys/devices/system/xen_memory/xen_memory0/scrub_pages.
 			Default value controlled with CONFIG_XEN_SCRUB_PAGES_DEFAULT.
 
-	xen_timer_slop=	[X86-64,XEN]
+	xen_timer_slop=	[X86-64,XEN,EARLY]
 			Set the timer slop (in nanoseconds) for the virtual Xen
 			timers (default is 100000). This adjusts the minimum
 			delta of virtualized Xen timers, where lower values
@@ -7376,7 +7390,7 @@
 			host controller quirks. Meaning of each bit can be
 			consulted in header drivers/usb/host/xhci.h.
 
-	xmon		[PPC]
+	xmon		[PPC,EARLY]
 			Format: { early | on | rw | ro | off }
 			Controls if xmon debugger is enabled. Default is off.
 			Passing only "xmon" is equivalent to "xmon=early".

From b42cdd2cbe2b3f677b3eb484c6c6bb4fa10e269c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 4 Dec 2023 09:33:29 -0800
Subject: [PATCH 198/707] rcu-tasks: Repair RCU Tasks Trace quiescence check

The context-switch-time check for RCU Tasks Trace quiescence expects
current->trc_reader_special.b.need_qs to be zero, and if so, updates
it to TRC_NEED_QS_CHECKED.  This is backwards, because if this value
is zero, there is no RCU Tasks Trace grace period in flight, an thus
no need for a quiescent state.  Instead, when a grace period starts,
this field is set to TRC_NEED_QS.

This commit therefore changes the check from zero to TRC_NEED_QS.

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Tested-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/rcupdate.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 0746b1b0b6639d..16f519914415eb 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -184,9 +184,9 @@ void rcu_tasks_trace_qs_blkd(struct task_struct *t);
 	do {									\
 		int ___rttq_nesting = READ_ONCE((t)->trc_reader_nesting);	\
 										\
-		if (likely(!READ_ONCE((t)->trc_reader_special.b.need_qs)) &&	\
+		if (unlikely(READ_ONCE((t)->trc_reader_special.b.need_qs) == TRC_NEED_QS) &&	\
 		    likely(!___rttq_nesting)) {					\
-			rcu_trc_cmpxchg_need_qs((t), 0,	TRC_NEED_QS_CHECKED);	\
+			rcu_trc_cmpxchg_need_qs((t), TRC_NEED_QS, TRC_NEED_QS_CHECKED);	\
 		} else if (___rttq_nesting && ___rttq_nesting != INT_MIN &&	\
 			   !READ_ONCE((t)->trc_reader_special.b.blocked)) {	\
 			rcu_tasks_trace_qs_blkd(t);				\

From 43824241662ac61e90a023f908f267e6b137626b Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 15 Nov 2023 14:11:26 -0500
Subject: [PATCH 199/707] rcu: Rename jiffies_till_flush to jiffies_lazy_flush

The variable name jiffies_till_flush is too generic and therefore:

* It may shadow a global variable
* It doesn't tell on what it operates

Make the name more precise, along with the related APIs.

Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcu.h       |  8 ++++----
 kernel/rcu/rcuscale.c  |  6 +++---
 kernel/rcu/tree_nocb.h | 22 +++++++++++-----------
 3 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index f94f65877f2b68..dcfb666f24993f 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -543,11 +543,11 @@ enum rcutorture_type {
 };
 
 #if defined(CONFIG_RCU_LAZY)
-unsigned long rcu_lazy_get_jiffies_till_flush(void);
-void rcu_lazy_set_jiffies_till_flush(unsigned long j);
+unsigned long rcu_get_jiffies_lazy_flush(void);
+void rcu_set_jiffies_lazy_flush(unsigned long j);
 #else
-static inline unsigned long rcu_lazy_get_jiffies_till_flush(void) { return 0; }
-static inline void rcu_lazy_set_jiffies_till_flush(unsigned long j) { }
+static inline unsigned long rcu_get_jiffies_lazy_flush(void) { return 0; }
+static inline void rcu_set_jiffies_lazy_flush(unsigned long j) { }
 #endif
 
 #if defined(CONFIG_TREE_RCU)
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index ffdb30495e3cc3..8db4fedaaa1eb7 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -764,9 +764,9 @@ kfree_scale_init(void)
 
 	if (kfree_by_call_rcu) {
 		/* do a test to check the timeout. */
-		orig_jif = rcu_lazy_get_jiffies_till_flush();
+		orig_jif = rcu_get_jiffies_lazy_flush();
 
-		rcu_lazy_set_jiffies_till_flush(2 * HZ);
+		rcu_set_jiffies_lazy_flush(2 * HZ);
 		rcu_barrier();
 
 		jif_start = jiffies;
@@ -775,7 +775,7 @@ kfree_scale_init(void)
 
 		smp_cond_load_relaxed(&rcu_lazy_test1_cb_called, VAL == 1);
 
-		rcu_lazy_set_jiffies_till_flush(orig_jif);
+		rcu_set_jiffies_lazy_flush(orig_jif);
 
 		if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start < 2 * HZ)) {
 			pr_alert("ERROR: call_rcu() CBs are not being lazy as expected!\n");
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 4efbf7333d4e16..aecef51166c7e2 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -256,6 +256,7 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
 	return __wake_nocb_gp(rdp_gp, rdp, force, flags);
 }
 
+#ifdef CONFIG_RCU_LAZY
 /*
  * LAZY_FLUSH_JIFFIES decides the maximum amount of time that
  * can elapse before lazy callbacks are flushed. Lazy callbacks
@@ -264,21 +265,20 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
  * left unsubmitted to RCU after those many jiffies.
  */
 #define LAZY_FLUSH_JIFFIES (10 * HZ)
-static unsigned long jiffies_till_flush = LAZY_FLUSH_JIFFIES;
+static unsigned long jiffies_lazy_flush = LAZY_FLUSH_JIFFIES;
 
-#ifdef CONFIG_RCU_LAZY
 // To be called only from test code.
-void rcu_lazy_set_jiffies_till_flush(unsigned long jif)
+void rcu_set_jiffies_lazy_flush(unsigned long jif)
 {
-	jiffies_till_flush = jif;
+	jiffies_lazy_flush = jif;
 }
-EXPORT_SYMBOL(rcu_lazy_set_jiffies_till_flush);
+EXPORT_SYMBOL(rcu_set_jiffies_lazy_flush);
 
-unsigned long rcu_lazy_get_jiffies_till_flush(void)
+unsigned long rcu_get_jiffies_lazy_flush(void)
 {
-	return jiffies_till_flush;
+	return jiffies_lazy_flush;
 }
-EXPORT_SYMBOL(rcu_lazy_get_jiffies_till_flush);
+EXPORT_SYMBOL(rcu_get_jiffies_lazy_flush);
 #endif
 
 /*
@@ -299,7 +299,7 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
 	 */
 	if (waketype == RCU_NOCB_WAKE_LAZY &&
 	    rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) {
-		mod_timer(&rdp_gp->nocb_timer, jiffies + jiffies_till_flush);
+		mod_timer(&rdp_gp->nocb_timer, jiffies + rcu_get_jiffies_lazy_flush());
 		WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
 	} else if (waketype == RCU_NOCB_WAKE_BYPASS) {
 		mod_timer(&rdp_gp->nocb_timer, jiffies + 2);
@@ -482,7 +482,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 	// flush ->nocb_bypass to ->cblist.
 	if ((ncbs && !bypass_is_lazy && j != READ_ONCE(rdp->nocb_bypass_first)) ||
 	    (ncbs &&  bypass_is_lazy &&
-	     (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush))) ||
+	     (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + rcu_get_jiffies_lazy_flush()))) ||
 	    ncbs >= qhimark) {
 		rcu_nocb_lock(rdp);
 		*was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
@@ -723,7 +723,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
 		lazy_ncbs = READ_ONCE(rdp->lazy_len);
 
 		if (bypass_ncbs && (lazy_ncbs == bypass_ncbs) &&
-		    (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush) ||
+		    (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + rcu_get_jiffies_lazy_flush()) ||
 		     bypass_ncbs > 2 * qhimark)) {
 			flush_bypass = true;
 		} else if (bypass_ncbs && (lazy_ncbs != bypass_ncbs) &&

From 573e094763eaccc6f4d58fea420ce8b890edbcbb Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 15 Nov 2023 14:11:27 -0500
Subject: [PATCH 200/707] rcu/nocb: Remove needless LOAD-ACQUIRE

The LOAD-ACQUIRE access performed on rdp->nocb_cb_sleep advertizes
ordering callback execution against grace period completion. However
this is contradicted by the following:

* This LOAD-ACQUIRE doesn't pair with anything. The only counterpart
  barrier that can be found is the smp_mb() placed after callbacks
  advancing in nocb_gp_wait(). However the barrier is placed _after_
  ->nocb_cb_sleep write.

* Callbacks can be concurrently advanced between the LOAD-ACQUIRE on
  ->nocb_cb_sleep and the call to rcu_segcblist_extract_done_cbs() in
  rcu_do_batch(), making any ordering based on ->nocb_cb_sleep broken.

* Both rcu_segcblist_extract_done_cbs() and rcu_advance_cbs() are called
  under the nocb_lock, the latter hereby providing already the desired
  ACQUIRE semantics.

Therefore it is safe to access ->nocb_cb_sleep with a simple compiler
barrier.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_nocb.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index aecef51166c7e2..eb27878d46f1fe 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -933,8 +933,7 @@ static void nocb_cb_wait(struct rcu_data *rdp)
 		swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
 						    nocb_cb_wait_cond(rdp));
 
-		// VVV Ensure CB invocation follows _sleep test.
-		if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^
+		if (READ_ONCE(rdp->nocb_cb_sleep)) {
 			WARN_ON(signal_pending(current));
 			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
 		}

From e1173cfcd991870c0782f5b8c6ea5e9ffed66771 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 15 Nov 2023 14:11:28 -0500
Subject: [PATCH 201/707] rcu/nocb: Remove needless full barrier after callback
 advancing

A full barrier is issued from nocb_gp_wait() upon callbacks advancing
to order grace period completion with callbacks execution.

However these two events are already ordered by the
smp_mb__after_unlock_lock() barrier within the call to
raw_spin_lock_rcu_node() that is necessary for callbacks advancing to
happen.

The following litmus test shows the kind of guarantee that this barrier
provides:

	C smp_mb__after_unlock_lock

	{}

	// rcu_gp_cleanup()
	P0(spinlock_t *rnp_lock, int *gpnum)
	{
		// Grace period cleanup increase gp sequence number
		spin_lock(rnp_lock);
		WRITE_ONCE(*gpnum, 1);
		spin_unlock(rnp_lock);
	}

	// nocb_gp_wait()
	P1(spinlock_t *rnp_lock, spinlock_t *nocb_lock, int *gpnum, int *cb_ready)
	{
		int r1;

		// Call rcu_advance_cbs() from nocb_gp_wait()
		spin_lock(nocb_lock);
		spin_lock(rnp_lock);
		smp_mb__after_unlock_lock();
		r1 = READ_ONCE(*gpnum);
		WRITE_ONCE(*cb_ready, 1);
		spin_unlock(rnp_lock);
		spin_unlock(nocb_lock);
	}

	// nocb_cb_wait()
	P2(spinlock_t *nocb_lock, int *cb_ready, int *cb_executed)
	{
		int r2;

		// rcu_do_batch() -> rcu_segcblist_extract_done_cbs()
		spin_lock(nocb_lock);
		r2 = READ_ONCE(*cb_ready);
		spin_unlock(nocb_lock);

		// Actual callback execution
		WRITE_ONCE(*cb_executed, 1);
	}

	P3(int *cb_executed, int *gpnum)
	{
		int r3;

		WRITE_ONCE(*cb_executed, 2);
		smp_mb();
		r3 = READ_ONCE(*gpnum);
	}

	exists (1:r1=1 /\ 2:r2=1 /\ cb_executed=2 /\ 3:r3=0) (* Bad outcome. *)

Here the bad outcome only occurs if the smp_mb__after_unlock_lock() is
removed. This barrier orders the grace period completion against
callbacks advancing and even later callbacks invocation, thanks to the
opportunistic propagation via the ->nocb_lock to nocb_cb_wait().

Therefore the smp_mb() placed after callbacks advancing can be safely
removed.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c      | 6 ++++++
 kernel/rcu/tree_nocb.h | 1 -
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index b2bccfd37c383d..d540d210e5c71a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2145,6 +2145,12 @@ static void rcu_do_batch(struct rcu_data *rdp)
 	 * Extract the list of ready callbacks, disabling IRQs to prevent
 	 * races with call_rcu() from interrupt handlers.  Leave the
 	 * callback counts, as rcu_barrier() needs to be conservative.
+	 *
+	 * Callbacks execution is fully ordered against preceding grace period
+	 * completion (materialized by rnp->gp_seq update) thanks to the
+	 * smp_mb__after_unlock_lock() upon node locking required for callbacks
+	 * advancing. In NOCB mode this ordering is then further relayed through
+	 * the nocb locking that protects both callbacks advancing and extraction.
 	 */
 	rcu_nocb_lock_irqsave(rdp, flags);
 	WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index eb27878d46f1fe..d82f96a66600cb 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -779,7 +779,6 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
 		if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
 			needwake = rdp->nocb_cb_sleep;
 			WRITE_ONCE(rdp->nocb_cb_sleep, false);
-			smp_mb(); /* CB invocation -after- GP end. */
 		} else {
 			needwake = false;
 		}

From 8fc521f2a25ad3ca51d30ee0ecd0a693d2f9c6fc Mon Sep 17 00:00:00 2001
From: Qais Yousef <qyousef@layalina.io>
Date: Sun, 3 Dec 2023 01:12:52 +0000
Subject: [PATCH 202/707] rcu: Provide a boot time parameter to control lazy
 RCU

To allow more flexible arrangements while still provide a single kernel
for distros, provide a boot time parameter to enable/disable lazy RCU.

Specify:

	rcutree.enable_rcu_lazy=[y|1|n|0]

Which also requires

	rcu_nocbs=all

at boot time to enable/disable lazy RCU.

To disable it by default at build time when CONFIG_RCU_LAZY=y, the new
CONFIG_RCU_LAZY_DEFAULT_OFF can be used.

Signed-off-by: Qais Yousef (Google) <qyousef@layalina.io>
Tested-by: Andrea Righi <andrea.righi@canonical.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/admin-guide/kernel-parameters.txt |  5 +++++
 kernel/rcu/Kconfig                              | 13 +++++++++++++
 kernel/rcu/tree.c                               |  7 ++++++-
 3 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 4839f2919fdfa6..94314d0eb3019b 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5047,6 +5047,11 @@
 			this kernel boot parameter, forcibly setting it
 			to zero.
 
+	rcutree.enable_rcu_lazy= [KNL]
+			To save power, batch RCU callbacks and flush after
+			delay, memory pressure or callback list growing too
+			big.
+
 	rcuscale.gp_async= [KNL]
 			Measure performance of asynchronous
 			grace-period primitives such as call_rcu().
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index bdd7eadb33d8fe..e7d2dd2675931f 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -314,6 +314,19 @@ config RCU_LAZY
 	  To save power, batch RCU callbacks and flush after delay, memory
 	  pressure, or callback list growing too big.
 
+	  Requires rcu_nocbs=all to be set.
+
+	  Use rcutree.enable_rcu_lazy=0 to turn it off at boot time.
+
+config RCU_LAZY_DEFAULT_OFF
+	bool "Turn RCU lazy invocation off by default"
+	depends on RCU_LAZY
+	default n
+	help
+	  Allows building the kernel with CONFIG_RCU_LAZY=y yet keep it default
+	  off. Boot time param rcutree.enable_rcu_lazy=1 can be used to switch
+	  it back on.
+
 config RCU_DOUBLE_CHECK_CB_TIME
 	bool "RCU callback-batch backup time check"
 	depends on RCU_EXPERT
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index d540d210e5c71a..49980323417606 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2759,6 +2759,9 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in)
 }
 
 #ifdef CONFIG_RCU_LAZY
+static bool enable_rcu_lazy __read_mostly = !IS_ENABLED(CONFIG_RCU_LAZY_DEFAULT_OFF);
+module_param(enable_rcu_lazy, bool, 0444);
+
 /**
  * call_rcu_hurry() - Queue RCU callback for invocation after grace period, and
  * flush all lazy callbacks (including the new one) to the main ->cblist while
@@ -2784,6 +2787,8 @@ void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
 	__call_rcu_common(head, func, false);
 }
 EXPORT_SYMBOL_GPL(call_rcu_hurry);
+#else
+#define enable_rcu_lazy		false
 #endif
 
 /**
@@ -2832,7 +2837,7 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry);
  */
 void call_rcu(struct rcu_head *head, rcu_callback_t func)
 {
-	__call_rcu_common(head, func, IS_ENABLED(CONFIG_RCU_LAZY));
+	__call_rcu_common(head, func, enable_rcu_lazy);
 }
 EXPORT_SYMBOL_GPL(call_rcu);
 

From f17ad8bd109b904a9368a5fa85d8fa18b7c3e704 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 4 Dec 2023 20:34:58 -0800
Subject: [PATCH 203/707] context_tracking: Fix kerneldoc headers for
 __ct_user_{enter,exit}()

Document the "state" parameter of both of these functions.

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202312041922.YZCcEPYD-lkp@intel.com/
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/context_tracking.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 6ef0b35fc28c5a..70ae70d0382337 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -458,6 +458,8 @@ static __always_inline void context_tracking_recursion_exit(void)
  * __ct_user_enter - Inform the context tracking that the CPU is going
  *		     to enter user or guest space mode.
  *
+ * @state: userspace context-tracking state to enter.
+ *
  * This function must be called right before we switch from the kernel
  * to user or guest space, when it's guaranteed the remaining kernel
  * instructions to execute won't use any RCU read side critical section
@@ -595,6 +597,8 @@ NOKPROBE_SYMBOL(user_enter_callable);
  * __ct_user_exit - Inform the context tracking that the CPU is
  *		    exiting user or guest mode and entering the kernel.
  *
+ * @state: userspace context-tracking state being exited from.
+ *
  * This function must be called after we entered the kernel from user or
  * guest space before any use of RCU read side critical section. This
  * potentially include any high level kernel code like syscalls, exceptions,

From de95870fd0fb9355904531693de92e01b27e688c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 11 Dec 2023 11:55:17 -0800
Subject: [PATCH 204/707] doc: Clarify use of slab constructors and
 SLAB_TYPESAFE_BY_RCU

This commit explicitly states that you should initialize any locks to
be used by readers in your SLAB_TYPESAFE_BY_RCU constructor.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/RCU/whatisRCU.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst
index 246ce0d0b4d116..872ac665223fbd 100644
--- a/Documentation/RCU/whatisRCU.rst
+++ b/Documentation/RCU/whatisRCU.rst
@@ -963,8 +963,8 @@ unfortunately any spinlock in a ``SLAB_TYPESAFE_BY_RCU`` object must be
 initialized after each and every call to kmem_cache_alloc(), which renders
 reference-free spinlock acquisition completely unsafe.  Therefore, when
 using ``SLAB_TYPESAFE_BY_RCU``, make proper use of a reference counter.
-(Those willing to use a kmem_cache constructor may also use locking,
-including cache-friendly sequence locking.)
+(Those willing to initialize their locks in a kmem_cache constructor
+may also use locking, including cache-friendly sequence locking.)
 
 With traditional reference counting -- such as that implemented by the
 kref library in Linux -- there is typically code that runs when the last

From 18cfea5d54482c8ba35a2011db5685100455862b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 13 Dec 2023 09:49:20 -0800
Subject: [PATCH 205/707] doc: Update checklist.rst discussion of callback
 execution

This commit completes the list of call_rcu*() functions that are not
guaranteed to have their callbacks executing on the same CPU.  While in
the area, fix an unrelated typo.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/RCU/checklist.rst | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst
index addd5c1547a420..3e6407de231c99 100644
--- a/Documentation/RCU/checklist.rst
+++ b/Documentation/RCU/checklist.rst
@@ -383,16 +383,17 @@ over a rather long period of time, but improvements are always welcome!
 	must use whatever locking or other synchronization is required
 	to safely access and/or modify that data structure.
 
-	Do not assume that RCU callbacks will be executed on the same
-	CPU that executed the corresponding call_rcu() or call_srcu().
-	For example, if a given CPU goes offline while having an RCU
-	callback pending, then that RCU callback will execute on some
-	surviving CPU.	(If this was not the case, a self-spawning RCU
-	callback would prevent the victim CPU from ever going offline.)
-	Furthermore, CPUs designated by rcu_nocbs= might well *always*
-	have their RCU callbacks executed on some other CPUs, in fact,
-	for some  real-time workloads, this is the whole point of using
-	the rcu_nocbs= kernel boot parameter.
+	Do not assume that RCU callbacks will be executed on
+	the same CPU that executed the corresponding call_rcu(),
+	call_srcu(), call_rcu_tasks(), call_rcu_tasks_rude(), or
+	call_rcu_tasks_trace().  For example, if a given CPU goes offline
+	while having an RCU callback pending, then that RCU callback
+	will execute on some surviving CPU.  (If this was not the case,
+	a self-spawning RCU callback would prevent the victim CPU from
+	ever going offline.)  Furthermore, CPUs designated by rcu_nocbs=
+	might well *always* have their RCU callbacks executed on some
+	other CPUs, in fact, for some  real-time workloads, this is the
+	whole point of using the rcu_nocbs= kernel boot parameter.
 
 	In addition, do not assume that callbacks queued in a given order
 	will be invoked in that order, even if they all are queued on the

From ff0f64285a9e6d3e5390ba16252ec8c410577fc3 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Tue, 19 Dec 2023 00:19:14 +0100
Subject: [PATCH 206/707] hrtimer: Report offline hrtimer enqueue

The hrtimers migration on CPU-down hotplug process has been moved
earlier, before the CPU actually goes to die. This leaves a small window
of opportunity to queue an hrtimer in a blind spot, leaving it ignored.

For example a practical case has been reported with RCU waking up a
SCHED_FIFO task right before the CPUHP_AP_IDLE_DEAD stage, queuing that
way a sched/rt timer to the local offline CPU.

Make sure such situations never go unnoticed and warn when that happens.

[ paulmck: Apply Stephen Rothwell feedback. ]

Reported-by: Paul E. McKenney <paulmck@kernel.org>
Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier")
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/hrtimer.h | 4 +++-
 kernel/time/hrtimer.c   | 3 +++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 87e3bedf8eb003..991c83e929b456 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -151,6 +151,7 @@ enum  hrtimer_base_type {
  * @hang_detected:	The last hrtimer interrupt detected a hang
  * @softirq_activated:	displays, if the softirq is raised - update of softirq
  *			related settings is not required then.
+ * @online:		CPU is online from an hrtimers viewpoint
  * @nr_events:		Total number of hrtimer interrupt events
  * @nr_retries:		Total number of hrtimer interrupt retries
  * @nr_hangs:		Total number of hrtimer interrupt hangs
@@ -179,7 +180,8 @@ struct hrtimer_cpu_base {
 	unsigned int			hres_active		: 1,
 					in_hrtirq		: 1,
 					hang_detected		: 1,
-					softirq_activated       : 1;
+					softirq_activated       : 1,
+					online			: 1;
 #ifdef CONFIG_HIGH_RES_TIMERS
 	unsigned int			nr_events;
 	unsigned short			nr_retries;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 760793998cdd70..edb0f821dceaa1 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1085,6 +1085,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
 			   enum hrtimer_mode mode)
 {
 	debug_activate(timer, mode);
+	WARN_ON_ONCE(!base->cpu_base->online);
 
 	base->cpu_base->active_bases |= 1 << base->index;
 
@@ -2183,6 +2184,7 @@ int hrtimers_prepare_cpu(unsigned int cpu)
 	cpu_base->softirq_next_timer = NULL;
 	cpu_base->expires_next = KTIME_MAX;
 	cpu_base->softirq_expires_next = KTIME_MAX;
+	cpu_base->online = 1;
 	hrtimer_cpu_base_init_expiry_lock(cpu_base);
 	return 0;
 }
@@ -2250,6 +2252,7 @@ int hrtimers_cpu_dying(unsigned int dying_cpu)
 	smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);
 
 	raw_spin_unlock(&new_base->lock);
+	old_base->online = 0;
 	raw_spin_unlock(&old_base->lock);
 
 	return 0;

From bd4f7f10c9028148395beec6d79e42f5c98568c0 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Tue, 19 Dec 2023 00:19:16 +0100
Subject: [PATCH 207/707] rcu/exp: Remove full barrier upon main thread wakeup

When an expedited grace period is ending, care must be taken so that all
the quiescent states propagated up to the root are correctly ordered
against the wake up of the main expedited grace period workqueue.

This ordering is already carried through the root rnp locking augmented
by an smp_mb__after_unlock_lock() barrier.

Therefore the explicit smp_mb() placed before the wake up is not needed
and can be removed.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_exp.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 2ac440bc7e10bc..014ddf672165d3 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -198,10 +198,9 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp,
 		}
 		if (rnp->parent == NULL) {
 			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-			if (wake) {
-				smp_mb(); /* EGP done before wake_up(). */
+			if (wake)
 				swake_up_one_online(&rcu_state.expedited_wq);
-			}
+
 			break;
 		}
 		mask = rnp->grpmask;

From f5b859c4a2fe44c0ece7aad9dd2e3d3b7fd0385c Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Wed, 27 Dec 2023 12:47:38 -0500
Subject: [PATCH 208/707] srcu: Improve comments about acceleration leak

The comments added in commit 1ef990c4b36b ("srcu: No need to
advance/accelerate if no callback enqueued") are a bit confusing.
The comments are describing a scenario for code that was moved and is
no longer the way it was (snapshot after advancing). Improve the code
comments to reflect this and also document why acceleration can never
fail.

Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Neeraj Upadhyay <neeraj.iitr10@gmail.com>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/srcutree.c | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 0351a4e83529e3..e4d673fc30f42f 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -1234,11 +1234,20 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
 	if (rhp)
 		rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
 	/*
-	 * The snapshot for acceleration must be taken _before_ the read of the
-	 * current gp sequence used for advancing, otherwise advancing may fail
-	 * and acceleration may then fail too.
+	 * It's crucial to capture the snapshot 's' for acceleration before
+	 * reading the current gp_seq that is used for advancing. This is
+	 * essential because if the acceleration snapshot is taken after a
+	 * failed advancement attempt, there's a risk that a grace period may
+	 * conclude and a new one may start in the interim. If the snapshot is
+	 * captured after this sequence of events, the acceleration snapshot 's'
+	 * could be excessively advanced, leading to acceleration failure.
+	 * In such a scenario, an 'acceleration leak' can occur, where new
+	 * callbacks become indefinitely stuck in the RCU_NEXT_TAIL segment.
+	 * Also note that encountering advancing failures is a normal
+	 * occurrence when the grace period for RCU_WAIT_TAIL is in progress.
 	 *
-	 * This could happen if:
+	 * To see this, consider the following events which occur if
+	 * rcu_seq_snap() were to be called after advance:
 	 *
 	 *  1) The RCU_WAIT_TAIL segment has callbacks (gp_num = X + 4) and the
 	 *     RCU_NEXT_READY_TAIL also has callbacks (gp_num = X + 8).
@@ -1264,6 +1273,13 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
 	if (rhp) {
 		rcu_segcblist_advance(&sdp->srcu_cblist,
 				      rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
+		/*
+		 * Acceleration can never fail because the base current gp_seq
+		 * used for acceleration is <= the value of gp_seq used for
+		 * advancing. This means that RCU_NEXT_TAIL segment will
+		 * always be able to be emptied by the acceleration into the
+		 * RCU_NEXT_READY_TAIL or RCU_WAIT_TAIL segments.
+		 */
 		WARN_ON_ONCE(!rcu_segcblist_accelerate(&sdp->srcu_cblist, s));
 	}
 	if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {

From 296088f9b5a9a98bd84b19f71b12600c7bfa2c05 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 2 Jan 2024 15:55:12 -0800
Subject: [PATCH 209/707] tsc: Check for sockets instead of CPUs to make code
 match comment

The unsynchronized_tsc() eventually checks num_possible_cpus(), and
if the system is non-Intel and the number of possible CPUs is greater
than one, assumes that TSCs are unsynchronized.  This despite the
comment saying "assume multi socket systems are not synchronized",
that is, socket rather than CPU.  This behavior was preserved by
commit 8fbbc4b45ce3 ("x86: merge tsc_init and clocksource code") and
by the previous relevant commit 7e69f2b1ead2 ("clocksource: Remove the
update callback").

The clocksource drivers were added by commit 5d0cf410e94b ("Time: i386
Clocksource Drivers") back in 2006, and the comment still said "socket"
rather than "CPU".

Therefore, bravely (and perhaps foolishly) make the code match the
comment.

Note that it is possible to bypass both code and comment by booting
with tsc=reliable, but this also disables the clocksource watchdog,
which is undesirable when trust in the TSC is strictly limited.

Reported-by: Zhengxu Chen <zhxchen17@meta.com>
Reported-by: Danielle Costantino <dcostantino@meta.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Feng Tang <feng.tang@intel.com>
Cc: Waiman Long <longman@redhat.com>
Cc: John Stultz <jstultz@google.com>
Cc: <x86@kernel.org>
---
 arch/x86/kernel/tsc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 15f97c0abc9d09..d45084c6a15ed3 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1287,7 +1287,7 @@ int unsynchronized_tsc(void)
 	 */
 	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
 		/* assume multi socket systems are not synchronized: */
-		if (num_possible_cpus() > 1)
+		if (nr_online_nodes > 1)
 			return 1;
 	}
 

From 5dd763ec737d07330f3b9f4cdc78318a96bcc7af Mon Sep 17 00:00:00 2001
From: Zhenhua Huang <quic_zhenhuah@quicinc.com>
Date: Tue, 2 Jan 2024 18:19:37 +0800
Subject: [PATCH 210/707] fs/proc: remove redudant comments from
 /proc/bootconfig

commit 717c7c894d4b ("fs/proc: Add boot loader arguments as comment to
/proc/bootconfig") adds bootloader argument comments into /proc/bootconfig.

/proc/bootconfig shows boot_command_line[] multiple times following
every xbc key value pair, that's duplicated and not necessary.
Remove redundant ones.

Output before and after the fix is like:
key1 = value1
*bootloader argument comments*
key2 = value2
*bootloader argument comments*
key3 = value3
*bootloader argument comments*
...

key1 = value1
key2 = value2
key3 = value3
*bootloader argument comments*
...

Fixes: 717c7c894d4b ("fs/proc: Add boot loader arguments as comment to /proc/bootconfig")
Signed-off-by: Zhenhua Huang <quic_zhenhuah@quicinc.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 fs/proc/bootconfig.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c
index 902b326e1e5607..e5635a6b127b0b 100644
--- a/fs/proc/bootconfig.c
+++ b/fs/proc/bootconfig.c
@@ -62,12 +62,12 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size)
 				break;
 			dst += ret;
 		}
-		if (ret >= 0 && boot_command_line[0]) {
-			ret = snprintf(dst, rest(dst, end), "# Parameters from bootloader:\n# %s\n",
-				       boot_command_line);
-			if (ret > 0)
-				dst += ret;
-		}
+	}
+	if (ret >= 0 && boot_command_line[0]) {
+		ret = snprintf(dst, rest(dst, end), "# Parameters from bootloader:\n# %s\n",
+			       boot_command_line);
+		if (ret > 0)
+			dst += ret;
 	}
 out:
 	kfree(key);

From cf59604abb7e8e7784d105a4476e63ca1ccaa209 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 3 Jan 2024 10:59:25 -0800
Subject: [PATCH 211/707] rcutorture: Suppress rtort_pipe_count warnings until
 after stalls

Currently, if rcu_torture_writer() sees fewer than ten grace periods
having elapsed during a call to stutter_wait() that actually waited,
the rtort_pipe_count warning is emitted.  This has worked well for
a long time.  Except that the rcutorture TREE07 scenario now does a
short-term 14-second RCU CPU stall, which can most definitely case
false-positive rtort_pipe_count warnings.

This commit therefore changes rcu_torture_writer() to compute the
full expected holdoff and stall duration, and to refuse to report any
rtort_pipe_count warnings until after all stalls have completed.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/rcu/rcutorture.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 7567ca8e743ca6..45d6b4c3d199c1 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1368,9 +1368,13 @@ rcu_torture_writer(void *arg)
 	struct rcu_torture *rp;
 	struct rcu_torture *old_rp;
 	static DEFINE_TORTURE_RANDOM(rand);
+	unsigned long stallsdone = jiffies;
 	bool stutter_waited;
 	unsigned long ulo[NUM_ACTIVE_RCU_POLL_OLDSTATE];
 
+	// If a new stall test is added, this must be adjusted.
+	if (stall_cpu_holdoff + stall_gp_kthread + stall_cpu)
+		stallsdone += (stall_cpu_holdoff + stall_gp_kthread + stall_cpu + 60) * HZ;
 	VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
 	if (!can_expedite)
 		pr_alert("%s" TORTURE_FLAG
@@ -1576,11 +1580,11 @@ rcu_torture_writer(void *arg)
 		    !atomic_read(&rcu_fwd_cb_nodelay) &&
 		    !cur_ops->slow_gps &&
 		    !torture_must_stop() &&
-		    boot_ended)
+		    boot_ended &&
+		    time_after(jiffies, stallsdone))
 			for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++)
 				if (list_empty(&rcu_tortures[i].rtort_free) &&
-				    rcu_access_pointer(rcu_torture_current) !=
-				    &rcu_tortures[i]) {
+				    rcu_access_pointer(rcu_torture_current) != &rcu_tortures[i]) {
 					tracing_off();
 					show_rcu_gp_kthreads();
 					WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count);
@@ -2441,7 +2445,8 @@ static struct notifier_block rcu_torture_stall_block = {
 
 /*
  * CPU-stall kthread.  It waits as specified by stall_cpu_holdoff, then
- * induces a CPU stall for the time specified by stall_cpu.
+ * induces a CPU stall for the time specified by stall_cpu.  If a new
+ * stall test is added, stallsdone in rcu_torture_writer() must be adjusted.
  */
 static int rcu_torture_stall(void *args)
 {

From 5d15c90ead66043db4ab79c17e35f658e1a3ea51 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Tue, 9 Jan 2024 23:24:00 +0100
Subject: [PATCH 212/707] rcu/nocb: Make IRQs disablement symmetric

Currently IRQs are disabled on call_rcu() and then depending on the
context:

* If the CPU is in nocb mode:

   - If the callback is enqueued in the bypass list, IRQs are re-enabled
     implictly by rcu_nocb_try_bypass()

   - If the callback is enqueued in the normal list, IRQs are re-enabled
     implicitly by __call_rcu_nocb_wake()

* If the CPU is NOT in nocb mode, IRQs are reenabled explicitly from call_rcu()

This makes the code a bit hard to follow, especially as it interleaves
with nocb locking.

To make the IRQ flags coverage clearer and also in order to prepare for
moving all the nocb enqueue code to its own function, always re-enable
the IRQ flags explicitly from call_rcu().

Reviewed-by: Neeraj Upadhyay (AMD) <neeraj.iitr10@gmail.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c      |  9 ++++++---
 kernel/rcu/tree_nocb.h | 20 +++++++++-----------
 2 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 49980323417606..91b2eb772e861f 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2735,8 +2735,10 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in)
 	}
 
 	check_cb_ovld(rdp);
-	if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy))
+	if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy)) {
+		local_irq_restore(flags);
 		return; // Enqueued onto ->nocb_bypass, so just leave.
+	}
 	// If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
 	rcu_segcblist_enqueue(&rdp->cblist, head);
 	if (__is_kvfree_rcu_offset((unsigned long)func))
@@ -2754,8 +2756,8 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in)
 		__call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */
 	} else {
 		__call_rcu_core(rdp, head, flags);
-		local_irq_restore(flags);
 	}
+	local_irq_restore(flags);
 }
 
 #ifdef CONFIG_RCU_LAZY
@@ -4651,8 +4653,9 @@ void rcutree_migrate_callbacks(int cpu)
 		__call_rcu_nocb_wake(my_rdp, true, flags);
 	} else {
 		rcu_nocb_unlock(my_rdp); /* irqs remain disabled. */
-		raw_spin_unlock_irqrestore_rcu_node(my_rnp, flags);
+		raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */
 	}
+	local_irq_restore(flags);
 	if (needwake)
 		rcu_gp_kthread_wake();
 	lockdep_assert_irqs_enabled();
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index d82f96a66600cb..06c8ff85850ccb 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -532,9 +532,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 	// 2. Both of these conditions are met:
 	//    a. The bypass list previously had only lazy CBs, and:
 	//    b. The new CB is non-lazy.
-	if (ncbs && (!bypass_is_lazy || lazy)) {
-		local_irq_restore(flags);
-	} else {
+	if (!ncbs || (bypass_is_lazy && !lazy)) {
 		// No-CBs GP kthread might be indefinitely asleep, if so, wake.
 		rcu_nocb_lock(rdp); // Rare during call_rcu() flood.
 		if (!rcu_segcblist_pend_cbs(&rdp->cblist)) {
@@ -544,7 +542,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 		} else {
 			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
 					    TPS("FirstBQnoWake"));
-			rcu_nocb_unlock_irqrestore(rdp, flags);
+			rcu_nocb_unlock(rdp);
 		}
 	}
 	return true; // Callback already enqueued.
@@ -570,7 +568,7 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
 	// If we are being polled or there is no kthread, just leave.
 	t = READ_ONCE(rdp->nocb_gp_kthread);
 	if (rcu_nocb_poll || !t) {
-		rcu_nocb_unlock_irqrestore(rdp, flags);
+		rcu_nocb_unlock(rdp);
 		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
 				    TPS("WakeNotPoll"));
 		return;
@@ -583,17 +581,17 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
 		rdp->qlen_last_fqs_check = len;
 		// Only lazy CBs in bypass list
 		if (lazy_len && bypass_len == lazy_len) {
-			rcu_nocb_unlock_irqrestore(rdp, flags);
+			rcu_nocb_unlock(rdp);
 			wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY,
 					   TPS("WakeLazy"));
 		} else if (!irqs_disabled_flags(flags)) {
 			/* ... if queue was empty ... */
-			rcu_nocb_unlock_irqrestore(rdp, flags);
+			rcu_nocb_unlock(rdp);
 			wake_nocb_gp(rdp, false);
 			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
 					    TPS("WakeEmpty"));
 		} else {
-			rcu_nocb_unlock_irqrestore(rdp, flags);
+			rcu_nocb_unlock(rdp);
 			wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
 					   TPS("WakeEmptyIsDeferred"));
 		}
@@ -611,15 +609,15 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
 		if ((rdp->nocb_cb_sleep ||
 		     !rcu_segcblist_ready_cbs(&rdp->cblist)) &&
 		    !timer_pending(&rdp->nocb_timer)) {
-			rcu_nocb_unlock_irqrestore(rdp, flags);
+			rcu_nocb_unlock(rdp);
 			wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
 					   TPS("WakeOvfIsDeferred"));
 		} else {
-			rcu_nocb_unlock_irqrestore(rdp, flags);
+			rcu_nocb_unlock(rdp);
 			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
 		}
 	} else {
-		rcu_nocb_unlock_irqrestore(rdp, flags);
+		rcu_nocb_unlock(rdp);
 		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
 	}
 }

From c1856a43a68ada47d1285d4c7223d3bca94e3f44 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Tue, 9 Jan 2024 23:24:01 +0100
Subject: [PATCH 213/707] rcu/nocb: Re-arrange call_rcu() NOCB specific code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently the call_rcu() function interleaves NOCB and !NOCB enqueue
code in a complicated way such that:

* The bypass enqueue code may or may not have enqueued and may or may
  not have locked the ->nocb_lock. Everything that follows is in a
  Schrödinger locking state for the unwary reviewer's eyes.

* The was_alldone is always set but only used in NOCB related code.

* The NOCB wake up is distantly related to the locking hopefully
  performed by the bypass enqueue code that did not enqueue on the
  bypass list.

Unconfuse the whole and gather NOCB and !NOCB specific enqueue code to
their own functions.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c      | 44 +++++++++++++++++++-----------------------
 kernel/rcu/tree.h      |  9 ++++-----
 kernel/rcu/tree_nocb.h | 18 ++++++++++++++---
 3 files changed, 39 insertions(+), 32 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 91b2eb772e861f..de5796ce024fec 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2597,12 +2597,26 @@ static int __init rcu_spawn_core_kthreads(void)
 	return 0;
 }
 
+static void rcutree_enqueue(struct rcu_data *rdp, struct rcu_head *head, rcu_callback_t func)
+{
+	rcu_segcblist_enqueue(&rdp->cblist, head);
+	if (__is_kvfree_rcu_offset((unsigned long)func))
+		trace_rcu_kvfree_callback(rcu_state.name, head,
+					 (unsigned long)func,
+					 rcu_segcblist_n_cbs(&rdp->cblist));
+	else
+		trace_rcu_callback(rcu_state.name, head,
+				   rcu_segcblist_n_cbs(&rdp->cblist));
+	trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued"));
+}
+
 /*
  * Handle any core-RCU processing required by a call_rcu() invocation.
  */
-static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
-			    unsigned long flags)
+static void call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
+			  rcu_callback_t func, unsigned long flags)
 {
+	rcutree_enqueue(rdp, head, func);
 	/*
 	 * If called from an extended quiescent state, invoke the RCU
 	 * core in order to force a re-evaluation of RCU's idleness.
@@ -2698,7 +2712,6 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in)
 	unsigned long flags;
 	bool lazy;
 	struct rcu_data *rdp;
-	bool was_alldone;
 
 	/* Misaligned rcu_head! */
 	WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1));
@@ -2735,28 +2748,11 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in)
 	}
 
 	check_cb_ovld(rdp);
-	if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy)) {
-		local_irq_restore(flags);
-		return; // Enqueued onto ->nocb_bypass, so just leave.
-	}
-	// If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
-	rcu_segcblist_enqueue(&rdp->cblist, head);
-	if (__is_kvfree_rcu_offset((unsigned long)func))
-		trace_rcu_kvfree_callback(rcu_state.name, head,
-					 (unsigned long)func,
-					 rcu_segcblist_n_cbs(&rdp->cblist));
-	else
-		trace_rcu_callback(rcu_state.name, head,
-				   rcu_segcblist_n_cbs(&rdp->cblist));
-
-	trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued"));
 
-	/* Go handle any RCU core processing required. */
-	if (unlikely(rcu_rdp_is_offloaded(rdp))) {
-		__call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */
-	} else {
-		__call_rcu_core(rdp, head, flags);
-	}
+	if (unlikely(rcu_rdp_is_offloaded(rdp)))
+		call_rcu_nocb(rdp, head, func, flags, lazy);
+	else
+		call_rcu_core(rdp, head, func, flags);
 	local_irq_restore(flags);
 }
 
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index e9821a8422dbe7..bf478da89a8f33 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -467,11 +467,10 @@ static void rcu_init_one_nocb(struct rcu_node *rnp);
 static bool wake_nocb_gp(struct rcu_data *rdp, bool force);
 static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 				  unsigned long j, bool lazy);
-static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-				bool *was_alldone, unsigned long flags,
-				bool lazy);
-static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
-				 unsigned long flags);
+static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head,
+			  rcu_callback_t func, unsigned long flags, bool lazy);
+static void __maybe_unused __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
+						unsigned long flags);
 static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level);
 static bool do_nocb_deferred_wakeup(struct rcu_data *rdp);
 static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 06c8ff85850ccb..5fd47ea6d20eaa 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -622,6 +622,18 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
 	}
 }
 
+static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head,
+			  rcu_callback_t func, unsigned long flags, bool lazy)
+{
+	bool was_alldone;
+
+	if (!rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy)) {
+		/* Not enqueued on bypass but locked, do regular enqueue */
+		rcutree_enqueue(rdp, head, func);
+		__call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */
+	}
+}
+
 static int nocb_gp_toggle_rdp(struct rcu_data *rdp,
 			       bool *wake_state)
 {
@@ -1764,10 +1776,10 @@ static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 	return true;
 }
 
-static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-				bool *was_alldone, unsigned long flags, bool lazy)
+static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head,
+			  rcu_callback_t func, unsigned long flags, bool lazy)
 {
-	return false;
+	WARN_ON_ONCE(1);  /* Should be dead code! */
 }
 
 static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,

From 862890ed53fb1da491874f1cab49f4b45c2151bc Mon Sep 17 00:00:00 2001
From: Zqiang <qiang.zhang1211@gmail.com>
Date: Wed, 10 Jan 2024 16:11:28 +0800
Subject: [PATCH 214/707] rcu/nocb: Fix WARN_ON_ONCE() in the
 rcu_nocb_bypass_lock()

For the kernels built with CONFIG_RCU_NOCB_CPU_DEFAULT_ALL=y and
CONFIG_RCU_LAZY=y, the following scenarios will trigger WARN_ON_ONCE()
in the rcu_nocb_bypass_lock() and rcu_nocb_wait_contended() functions:

        CPU2                                               CPU11
kthread
rcu_nocb_cb_kthread                                       ksys_write
rcu_do_batch                                              vfs_write
rcu_torture_timer_cb                                      proc_sys_write
__kmem_cache_free                                         proc_sys_call_handler
kmemleak_free                                             drop_caches_sysctl_handler
delete_object_full                                        drop_slab
__delete_object                                           shrink_slab
put_object                                                lazy_rcu_shrink_scan
call_rcu                                                  rcu_nocb_flush_bypass
__call_rcu_commn                                            rcu_nocb_bypass_lock
                                                            raw_spin_trylock(&rdp->nocb_bypass_lock) fail
                                                            atomic_inc(&rdp->nocb_lock_contended);
rcu_nocb_wait_contended                                     WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
 WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended))                                          |
                            |_ _ _ _ _ _ _ _ _ _same rdp and rdp->cpu != 11_ _ _ _ _ _ _ _ _ __|

Reproduce this bug with "echo 3 > /proc/sys/vm/drop_caches".

This commit therefore uses rcu_nocb_try_flush_bypass() instead of
rcu_nocb_flush_bypass() in lazy_rcu_shrink_scan().  If the nocb_bypass
queue is being flushed, then rcu_nocb_try_flush_bypass will return
directly.

Signed-off-by: Zqiang <qiang.zhang1211@gmail.com>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_nocb.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 5fd47ea6d20eaa..54971afc3a9b25 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -1391,7 +1391,7 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 			rcu_nocb_unlock_irqrestore(rdp, flags);
 			continue;
 		}
-		WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false));
+		rcu_nocb_try_flush_bypass(rdp, jiffies);
 		rcu_nocb_unlock_irqrestore(rdp, flags);
 		wake_nocb_gp(rdp, false);
 		sc->nr_to_scan -= _count;

From 35a2b37256083fa5302f87a106261cdb4a11a3a9 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 12 Jan 2024 16:46:15 +0100
Subject: [PATCH 215/707] rcu/exp: Fix RCU expedited parallel grace period
 kworker allocation failure recovery

Under CONFIG_RCU_EXP_KTHREAD=y, the nodes initialization for expedited
grace periods is queued to a kworker. However if the allocation of that
kworker failed, the nodes initialization is performed synchronously by
the caller instead.

Now the check for kworker initialization failure relies on the kworker
pointer to be NULL while its value might actually encapsulate an
allocation failure error.

Make sure to handle this case.

Reviewed-by: Kalesh Singh <kaleshsingh@google.com>
Fixes: 9621fbee44df ("rcu: Move expedited grace period (GP) work to RT kthread_worker")
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index de5796ce024fec..65d730a2b492e5 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4759,6 +4759,7 @@ static void __init rcu_start_exp_gp_kworkers(void)
 	rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name);
 	if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) {
 		pr_err("Failed to create %s!\n", par_gp_kworker_name);
+		rcu_exp_par_gp_kworker = NULL;
 		kthread_destroy_worker(rcu_exp_gp_kworker);
 		return;
 	}

From f3a491f0ecde531df2a3dd75684565e959a16f5a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 12 Jan 2024 16:46:16 +0100
Subject: [PATCH 216/707] rcu/exp: Handle RCU expedited grace period kworker
 allocation failure

Just like is done for the kworker performing nodes initialization,
gracefully handle the possible allocation failure of the RCU expedited
grace period main kworker.

While at it perform a rename of the related checking functions to better
reflect the expedited specifics.

Reviewed-by: Kalesh Singh <kaleshsingh@google.com>
Fixes: 9621fbee44df ("rcu: Move expedited grace period (GP) work to RT kthread_worker")
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c     |  2 ++
 kernel/rcu/tree_exp.h | 25 +++++++++++++++++++------
 2 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 65d730a2b492e5..3777fd305f2ef5 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4753,6 +4753,7 @@ static void __init rcu_start_exp_gp_kworkers(void)
 	rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name);
 	if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) {
 		pr_err("Failed to create %s!\n", gp_kworker_name);
+		rcu_exp_gp_kworker = NULL;
 		return;
 	}
 
@@ -4761,6 +4762,7 @@ static void __init rcu_start_exp_gp_kworkers(void)
 		pr_err("Failed to create %s!\n", par_gp_kworker_name);
 		rcu_exp_par_gp_kworker = NULL;
 		kthread_destroy_worker(rcu_exp_gp_kworker);
+		rcu_exp_gp_kworker = NULL;
 		return;
 	}
 
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 014ddf672165d3..6123a60d9a4d7c 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -427,7 +427,12 @@ static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp)
 	__sync_rcu_exp_select_node_cpus(rewp);
 }
 
-static inline bool rcu_gp_par_worker_started(void)
+static inline bool rcu_exp_worker_started(void)
+{
+	return !!READ_ONCE(rcu_exp_gp_kworker);
+}
+
+static inline bool rcu_exp_par_worker_started(void)
 {
 	return !!READ_ONCE(rcu_exp_par_gp_kworker);
 }
@@ -477,7 +482,12 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
 	__sync_rcu_exp_select_node_cpus(rewp);
 }
 
-static inline bool rcu_gp_par_worker_started(void)
+static inline bool rcu_exp_worker_started(void)
+{
+	return !!READ_ONCE(rcu_gp_wq);
+}
+
+static inline bool rcu_exp_par_worker_started(void)
 {
 	return !!READ_ONCE(rcu_par_gp_wq);
 }
@@ -540,7 +550,7 @@ static void sync_rcu_exp_select_cpus(void)
 		rnp->exp_need_flush = false;
 		if (!READ_ONCE(rnp->expmask))
 			continue; /* Avoid early boot non-existent wq. */
-		if (!rcu_gp_par_worker_started() ||
+		if (!rcu_exp_par_worker_started() ||
 		    rcu_scheduler_active != RCU_SCHEDULER_RUNNING ||
 		    rcu_is_last_leaf_node(rnp)) {
 			/* No worker started yet or last leaf, do direct call. */
@@ -955,7 +965,7 @@ static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp)
  */
 void synchronize_rcu_expedited(void)
 {
-	bool boottime = (rcu_scheduler_active == RCU_SCHEDULER_INIT);
+	bool use_worker;
 	unsigned long flags;
 	struct rcu_exp_work rew;
 	struct rcu_node *rnp;
@@ -966,6 +976,9 @@ void synchronize_rcu_expedited(void)
 			 lock_is_held(&rcu_sched_lock_map),
 			 "Illegal synchronize_rcu_expedited() in RCU read-side critical section");
 
+	use_worker = (rcu_scheduler_active != RCU_SCHEDULER_INIT) &&
+		      rcu_exp_worker_started();
+
 	/* Is the state is such that the call is a grace period? */
 	if (rcu_blocking_is_gp()) {
 		// Note well that this code runs with !PREEMPT && !SMP.
@@ -995,7 +1008,7 @@ void synchronize_rcu_expedited(void)
 		return;  /* Someone else did our work for us. */
 
 	/* Ensure that load happens before action based on it. */
-	if (unlikely(boottime)) {
+	if (unlikely(!use_worker)) {
 		/* Direct call during scheduler init and early_initcalls(). */
 		rcu_exp_sel_wait_wake(s);
 	} else {
@@ -1013,7 +1026,7 @@ void synchronize_rcu_expedited(void)
 	/* Let the next expedited grace period start. */
 	mutex_unlock(&rcu_state.exp_mutex);
 
-	if (likely(!boottime))
+	if (likely(use_worker))
 		synchronize_rcu_expedited_destroy_work(&rew);
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);

From 7eebf1849f6e455861215c5710c8d55eb581a4cc Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 12 Jan 2024 16:46:17 +0100
Subject: [PATCH 217/707] rcu: s/boost_kthread_mutex/kthread_mutex

This mutex is currently protecting per node boost kthreads creation and
affinity setting across CPU hotplug operations.

Since the expedited kworkers will soon be split per node as well, they
will be subject to the same concurrency constraints against hotplug.

Therefore their creation and affinity tuning operations will be grouped
with those of boost kthreads and then rely on the same mutex.

To prepare for that, generalize its name.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c        |  2 +-
 kernel/rcu/tree.h        |  2 +-
 kernel/rcu/tree_plugin.h | 10 +++++-----
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 3777fd305f2ef5..0bf6971895194b 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4928,7 +4928,7 @@ static void __init rcu_init_one(void)
 			init_waitqueue_head(&rnp->exp_wq[2]);
 			init_waitqueue_head(&rnp->exp_wq[3]);
 			spin_lock_init(&rnp->exp_lock);
-			mutex_init(&rnp->boost_kthread_mutex);
+			mutex_init(&rnp->kthread_mutex);
 			raw_spin_lock_init(&rnp->exp_poll_lock);
 			rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED;
 			INIT_WORK(&rnp->exp_poll_wq, sync_rcu_do_polled_gp);
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index bf478da89a8f33..adf8609f27d038 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -113,7 +113,7 @@ struct rcu_node {
 				/*  side effect, not as a lock. */
 	unsigned long boost_time;
 				/* When to start boosting (jiffies). */
-	struct mutex boost_kthread_mutex;
+	struct mutex kthread_mutex;
 				/* Exclusion for thread spawning and affinity */
 				/*  manipulation. */
 	struct task_struct *boost_kthread_task;
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 41021080ad258d..0d307674915c60 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1195,7 +1195,7 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
 	struct sched_param sp;
 	struct task_struct *t;
 
-	mutex_lock(&rnp->boost_kthread_mutex);
+	mutex_lock(&rnp->kthread_mutex);
 	if (rnp->boost_kthread_task || !rcu_scheduler_fully_active)
 		goto out;
 
@@ -1212,7 +1212,7 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
 	wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
 
  out:
-	mutex_unlock(&rnp->boost_kthread_mutex);
+	mutex_unlock(&rnp->kthread_mutex);
 }
 
 /*
@@ -1224,7 +1224,7 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
  * no outgoing CPU.  If there are no CPUs left in the affinity set,
  * this function allows the kthread to execute on any CPU.
  *
- * Any future concurrent calls are serialized via ->boost_kthread_mutex.
+ * Any future concurrent calls are serialized via ->kthread_mutex.
  */
 static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
 {
@@ -1237,7 +1237,7 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
 		return;
 	if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
 		return;
-	mutex_lock(&rnp->boost_kthread_mutex);
+	mutex_lock(&rnp->kthread_mutex);
 	mask = rcu_rnp_online_cpus(rnp);
 	for_each_leaf_node_possible_cpu(rnp, cpu)
 		if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
@@ -1250,7 +1250,7 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
 			cpumask_clear_cpu(outgoingcpu, cm);
 	}
 	set_cpus_allowed_ptr(t, cm);
-	mutex_unlock(&rnp->boost_kthread_mutex);
+	mutex_unlock(&rnp->kthread_mutex);
 	free_cpumask_var(cm);
 }
 

From a7658d497dfc48b8b35ffe97a599f891aa8d0d75 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 12 Jan 2024 16:46:18 +0100
Subject: [PATCH 218/707] rcu/exp: Move expedited kthread worker creation
 functions above rcutree_prepare_cpu()

The expedited kthread worker performing the per node initialization is
going to be split into per node kthreads. As such, the future per node
kthread creation will need to be called from CPU hotplug callbacks
instead of an initcall, right beside the per node boost kthread
creation.

To prepare for that, move the kthread worker creation above
rcutree_prepare_cpu() as a first step to make the review smoother for
the upcoming modifications.

No intended functional change.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 96 +++++++++++++++++++++++------------------------
 1 file changed, 48 insertions(+), 48 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 0bf6971895194b..9fc71dd712d6e1 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4403,6 +4403,54 @@ rcu_boot_init_percpu_data(int cpu)
 	rcu_boot_init_nocb_percpu_data(rdp);
 }
 
+#ifdef CONFIG_RCU_EXP_KTHREAD
+struct kthread_worker *rcu_exp_gp_kworker;
+struct kthread_worker *rcu_exp_par_gp_kworker;
+
+static void __init rcu_start_exp_gp_kworkers(void)
+{
+	const char *par_gp_kworker_name = "rcu_exp_par_gp_kthread_worker";
+	const char *gp_kworker_name = "rcu_exp_gp_kthread_worker";
+	struct sched_param param = { .sched_priority = kthread_prio };
+
+	rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name);
+	if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) {
+		pr_err("Failed to create %s!\n", gp_kworker_name);
+		rcu_exp_gp_kworker = NULL;
+		return;
+	}
+
+	rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name);
+	if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) {
+		pr_err("Failed to create %s!\n", par_gp_kworker_name);
+		rcu_exp_par_gp_kworker = NULL;
+		kthread_destroy_worker(rcu_exp_gp_kworker);
+		rcu_exp_gp_kworker = NULL;
+		return;
+	}
+
+	sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, &param);
+	sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO,
+				   &param);
+}
+
+static inline void rcu_alloc_par_gp_wq(void)
+{
+}
+#else /* !CONFIG_RCU_EXP_KTHREAD */
+struct workqueue_struct *rcu_par_gp_wq;
+
+static void __init rcu_start_exp_gp_kworkers(void)
+{
+}
+
+static inline void rcu_alloc_par_gp_wq(void)
+{
+	rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
+	WARN_ON(!rcu_par_gp_wq);
+}
+#endif /* CONFIG_RCU_EXP_KTHREAD */
+
 /*
  * Invoked early in the CPU-online process, when pretty much all services
  * are available.  The incoming CPU is not present.
@@ -4740,54 +4788,6 @@ static int rcu_pm_notify(struct notifier_block *self,
 	return NOTIFY_OK;
 }
 
-#ifdef CONFIG_RCU_EXP_KTHREAD
-struct kthread_worker *rcu_exp_gp_kworker;
-struct kthread_worker *rcu_exp_par_gp_kworker;
-
-static void __init rcu_start_exp_gp_kworkers(void)
-{
-	const char *par_gp_kworker_name = "rcu_exp_par_gp_kthread_worker";
-	const char *gp_kworker_name = "rcu_exp_gp_kthread_worker";
-	struct sched_param param = { .sched_priority = kthread_prio };
-
-	rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name);
-	if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) {
-		pr_err("Failed to create %s!\n", gp_kworker_name);
-		rcu_exp_gp_kworker = NULL;
-		return;
-	}
-
-	rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name);
-	if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) {
-		pr_err("Failed to create %s!\n", par_gp_kworker_name);
-		rcu_exp_par_gp_kworker = NULL;
-		kthread_destroy_worker(rcu_exp_gp_kworker);
-		rcu_exp_gp_kworker = NULL;
-		return;
-	}
-
-	sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, &param);
-	sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO,
-				   &param);
-}
-
-static inline void rcu_alloc_par_gp_wq(void)
-{
-}
-#else /* !CONFIG_RCU_EXP_KTHREAD */
-struct workqueue_struct *rcu_par_gp_wq;
-
-static void __init rcu_start_exp_gp_kworkers(void)
-{
-}
-
-static inline void rcu_alloc_par_gp_wq(void)
-{
-	rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
-	WARN_ON(!rcu_par_gp_wq);
-}
-#endif /* CONFIG_RCU_EXP_KTHREAD */
-
 /*
  * Spawn the kthreads that handle RCU's grace periods.
  */

From de84f732f404d30643ddcb55c6aad45c88ea10bc Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 12 Jan 2024 16:46:19 +0100
Subject: [PATCH 219/707] rcu/exp: Make parallel exp gp kworker per rcu node

When CONFIG_RCU_EXP_KTHREAD=n, the expedited grace period per node
initialization is performed in parallel via workqueues (one work per
node).

However in CONFIG_RCU_EXP_KTHREAD=y, this per node initialization is
performed by a single kworker serializing each node initialization (one
work for all nodes).

The second part is certainly less scalable and efficient beyond a single
leaf node.

To improve this, expand this single kworker into per-node kworkers. This
new layout is eventually intended to remove the workqueues based
implementation since it will essentially now become duplicate code.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcu.h         |  1 -
 kernel/rcu/tree.c        | 61 +++++++++++++++++++++++++++-------------
 kernel/rcu/tree.h        |  3 ++
 kernel/rcu/tree_exp.h    | 10 +++----
 kernel/rcu/tree_plugin.h | 10 ++-----
 5 files changed, 52 insertions(+), 33 deletions(-)

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index dcfb666f24993f..4bc8cd6d461e64 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -625,7 +625,6 @@ void rcu_force_quiescent_state(void);
 extern struct workqueue_struct *rcu_gp_wq;
 #ifdef CONFIG_RCU_EXP_KTHREAD
 extern struct kthread_worker *rcu_exp_gp_kworker;
-extern struct kthread_worker *rcu_exp_par_gp_kworker;
 #else /* !CONFIG_RCU_EXP_KTHREAD */
 extern struct workqueue_struct *rcu_par_gp_wq;
 #endif /* CONFIG_RCU_EXP_KTHREAD */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 9fc71dd712d6e1..5371f8fa0ee21b 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4405,33 +4405,39 @@ rcu_boot_init_percpu_data(int cpu)
 
 #ifdef CONFIG_RCU_EXP_KTHREAD
 struct kthread_worker *rcu_exp_gp_kworker;
-struct kthread_worker *rcu_exp_par_gp_kworker;
 
-static void __init rcu_start_exp_gp_kworkers(void)
+static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp)
 {
-	const char *par_gp_kworker_name = "rcu_exp_par_gp_kthread_worker";
-	const char *gp_kworker_name = "rcu_exp_gp_kthread_worker";
+	struct kthread_worker *kworker;
+	const char *name = "rcu_exp_par_gp_kthread_worker/%d";
 	struct sched_param param = { .sched_priority = kthread_prio };
+	int rnp_index = rnp - rcu_get_root();
 
-	rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name);
-	if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) {
-		pr_err("Failed to create %s!\n", gp_kworker_name);
-		rcu_exp_gp_kworker = NULL;
+	if (rnp->exp_kworker)
+		return;
+
+	kworker = kthread_create_worker(0, name, rnp_index);
+	if (IS_ERR_OR_NULL(kworker)) {
+		pr_err("Failed to create par gp kworker on %d/%d\n",
+		       rnp->grplo, rnp->grphi);
 		return;
 	}
+	WRITE_ONCE(rnp->exp_kworker, kworker);
+	sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param);
+}
 
-	rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name);
-	if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) {
-		pr_err("Failed to create %s!\n", par_gp_kworker_name);
-		rcu_exp_par_gp_kworker = NULL;
-		kthread_destroy_worker(rcu_exp_gp_kworker);
+static void __init rcu_start_exp_gp_kworker(void)
+{
+	const char *name = "rcu_exp_gp_kthread_worker";
+	struct sched_param param = { .sched_priority = kthread_prio };
+
+	rcu_exp_gp_kworker = kthread_create_worker(0, name);
+	if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) {
+		pr_err("Failed to create %s!\n", name);
 		rcu_exp_gp_kworker = NULL;
 		return;
 	}
-
 	sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, &param);
-	sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO,
-				   &param);
 }
 
 static inline void rcu_alloc_par_gp_wq(void)
@@ -4440,7 +4446,11 @@ static inline void rcu_alloc_par_gp_wq(void)
 #else /* !CONFIG_RCU_EXP_KTHREAD */
 struct workqueue_struct *rcu_par_gp_wq;
 
-static void __init rcu_start_exp_gp_kworkers(void)
+static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp)
+{
+}
+
+static void __init rcu_start_exp_gp_kworker(void)
 {
 }
 
@@ -4451,6 +4461,17 @@ static inline void rcu_alloc_par_gp_wq(void)
 }
 #endif /* CONFIG_RCU_EXP_KTHREAD */
 
+static void rcu_spawn_rnp_kthreads(struct rcu_node *rnp)
+{
+	if ((IS_ENABLED(CONFIG_RCU_EXP_KTHREAD) ||
+	     IS_ENABLED(CONFIG_RCU_BOOST)) && rcu_scheduler_fully_active) {
+		mutex_lock(&rnp->kthread_mutex);
+		rcu_spawn_one_boost_kthread(rnp);
+		rcu_spawn_exp_par_gp_kworker(rnp);
+		mutex_unlock(&rnp->kthread_mutex);
+	}
+}
+
 /*
  * Invoked early in the CPU-online process, when pretty much all services
  * are available.  The incoming CPU is not present.
@@ -4499,7 +4520,7 @@ int rcutree_prepare_cpu(unsigned int cpu)
 	rdp->rcu_iw_gp_seq = rdp->gp_seq - 1;
 	trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-	rcu_spawn_one_boost_kthread(rnp);
+	rcu_spawn_rnp_kthreads(rnp);
 	rcu_spawn_cpu_nocb_kthread(cpu);
 	WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + 1);
 
@@ -4822,10 +4843,10 @@ static int __init rcu_spawn_gp_kthread(void)
 	 * due to rcu_scheduler_fully_active.
 	 */
 	rcu_spawn_cpu_nocb_kthread(smp_processor_id());
-	rcu_spawn_one_boost_kthread(rdp->mynode);
+	rcu_spawn_rnp_kthreads(rdp->mynode);
 	rcu_spawn_core_kthreads();
 	/* Create kthread worker for expedited GPs */
-	rcu_start_exp_gp_kworkers();
+	rcu_start_exp_gp_kworker();
 	return 0;
 }
 early_initcall(rcu_spawn_gp_kthread);
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index adf8609f27d038..a0c02d631f448e 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -72,6 +72,9 @@ struct rcu_node {
 				/* Online CPUs for next expedited GP. */
 				/*  Any CPU that has ever been online will */
 				/*  have its bit set. */
+	struct kthread_worker *exp_kworker;
+				/* Workers performing per node expedited GP */
+				/* initialization. */
 	unsigned long cbovldmask;
 				/* CPUs experiencing callback overload. */
 	unsigned long ffmask;	/* Fully functional CPUs. */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 6123a60d9a4d7c..0318a8a062d5c0 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -432,9 +432,9 @@ static inline bool rcu_exp_worker_started(void)
 	return !!READ_ONCE(rcu_exp_gp_kworker);
 }
 
-static inline bool rcu_exp_par_worker_started(void)
+static inline bool rcu_exp_par_worker_started(struct rcu_node *rnp)
 {
-	return !!READ_ONCE(rcu_exp_par_gp_kworker);
+	return !!READ_ONCE(rnp->exp_kworker);
 }
 
 static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp)
@@ -445,7 +445,7 @@ static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp)
 	 * another work item on the same kthread worker can result in
 	 * deadlock.
 	 */
-	kthread_queue_work(rcu_exp_par_gp_kworker, &rnp->rew.rew_work);
+	kthread_queue_work(READ_ONCE(rnp->exp_kworker), &rnp->rew.rew_work);
 }
 
 static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp)
@@ -487,7 +487,7 @@ static inline bool rcu_exp_worker_started(void)
 	return !!READ_ONCE(rcu_gp_wq);
 }
 
-static inline bool rcu_exp_par_worker_started(void)
+static inline bool rcu_exp_par_worker_started(struct rcu_node *rnp)
 {
 	return !!READ_ONCE(rcu_par_gp_wq);
 }
@@ -550,7 +550,7 @@ static void sync_rcu_exp_select_cpus(void)
 		rnp->exp_need_flush = false;
 		if (!READ_ONCE(rnp->expmask))
 			continue; /* Avoid early boot non-existent wq. */
-		if (!rcu_exp_par_worker_started() ||
+		if (!rcu_exp_par_worker_started(rnp) ||
 		    rcu_scheduler_active != RCU_SCHEDULER_RUNNING ||
 		    rcu_is_last_leaf_node(rnp)) {
 			/* No worker started yet or last leaf, do direct call. */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 0d307674915c60..09bdd36ca9ffcc 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1195,14 +1195,13 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
 	struct sched_param sp;
 	struct task_struct *t;
 
-	mutex_lock(&rnp->kthread_mutex);
-	if (rnp->boost_kthread_task || !rcu_scheduler_fully_active)
-		goto out;
+	if (rnp->boost_kthread_task)
+		return;
 
 	t = kthread_create(rcu_boost_kthread, (void *)rnp,
 			   "rcub/%d", rnp_index);
 	if (WARN_ON_ONCE(IS_ERR(t)))
-		goto out;
+		return;
 
 	raw_spin_lock_irqsave_rcu_node(rnp, flags);
 	rnp->boost_kthread_task = t;
@@ -1210,9 +1209,6 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
 	sp.sched_priority = kthread_prio;
 	sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
 	wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
-
- out:
-	mutex_unlock(&rnp->kthread_mutex);
 }
 
 /*

From a48606b1f7c8d1dbc53c0a980b1303798e5b8177 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 12 Jan 2024 16:46:20 +0100
Subject: [PATCH 220/707] rcu/exp: Handle parallel exp gp kworkers affinity

Affine the parallel expedited gp kworkers to their respective RCU node
in order to make them close to the cache their are playing with.

This reuses the boost kthreads machinery that probe into CPU hotplug
operations such that the kthreads become/stay affine to their respective
node as soon/long as they contain online CPUs. Otherwise and if the
current CPU going down was the last online on the leaf node, the related
kthread is affine to the housekeeping CPUs.

In the long run, this affinity VS CPU hotplug operation game should
probably be implemented at the generic kthread level.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c        | 79 +++++++++++++++++++++++++++++++++++++---
 kernel/rcu/tree_plugin.h | 42 ++-------------------
 2 files changed, 78 insertions(+), 43 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 5371f8fa0ee21b..40bfc58f18213d 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -145,7 +145,7 @@ static int rcu_scheduler_fully_active __read_mostly;
 
 static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
 			      unsigned long gps, unsigned long flags);
-static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
+static struct task_struct *rcu_boost_task(struct rcu_node *rnp);
 static void invoke_rcu_core(void);
 static void rcu_report_exp_rdp(struct rcu_data *rdp);
 static void sync_sched_exp_online_cleanup(int cpu);
@@ -4426,6 +4426,16 @@ static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp)
 	sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param);
 }
 
+static struct task_struct *rcu_exp_par_gp_task(struct rcu_node *rnp)
+{
+	struct kthread_worker *kworker = READ_ONCE(rnp->exp_kworker);
+
+	if (!kworker)
+		return NULL;
+
+	return kworker->task;
+}
+
 static void __init rcu_start_exp_gp_kworker(void)
 {
 	const char *name = "rcu_exp_gp_kthread_worker";
@@ -4450,6 +4460,11 @@ static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp)
 {
 }
 
+static struct task_struct *rcu_exp_par_gp_task(struct rcu_node *rnp)
+{
+	return NULL;
+}
+
 static void __init rcu_start_exp_gp_kworker(void)
 {
 }
@@ -4528,13 +4543,67 @@ int rcutree_prepare_cpu(unsigned int cpu)
 }
 
 /*
- * Update RCU priority boot kthread affinity for CPU-hotplug changes.
+ * Update kthreads affinity during CPU-hotplug changes.
+ *
+ * Set the per-rcu_node kthread's affinity to cover all CPUs that are
+ * served by the rcu_node in question.  The CPU hotplug lock is still
+ * held, so the value of rnp->qsmaskinit will be stable.
+ *
+ * We don't include outgoingcpu in the affinity set, use -1 if there is
+ * no outgoing CPU.  If there are no CPUs left in the affinity set,
+ * this function allows the kthread to execute on any CPU.
+ *
+ * Any future concurrent calls are serialized via ->kthread_mutex.
  */
-static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
+static void rcutree_affinity_setting(unsigned int cpu, int outgoingcpu)
 {
-	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+	cpumask_var_t cm;
+	unsigned long mask;
+	struct rcu_data *rdp;
+	struct rcu_node *rnp;
+	struct task_struct *task_boost, *task_exp;
+
+	if (!IS_ENABLED(CONFIG_RCU_EXP_KTHREAD) && !IS_ENABLED(CONFIG_RCU_BOOST))
+		return;
+
+	rdp = per_cpu_ptr(&rcu_data, cpu);
+	rnp = rdp->mynode;
+
+	task_boost = rcu_boost_task(rnp);
+	task_exp = rcu_exp_par_gp_task(rnp);
+
+	/*
+	 * If CPU is the boot one, those tasks are created later from early
+	 * initcall since kthreadd must be created first.
+	 */
+	if (!task_boost && !task_exp)
+		return;
+
+	if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
+		return;
+
+	mutex_lock(&rnp->kthread_mutex);
+	mask = rcu_rnp_online_cpus(rnp);
+	for_each_leaf_node_possible_cpu(rnp, cpu)
+		if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
+		    cpu != outgoingcpu)
+			cpumask_set_cpu(cpu, cm);
+	cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU));
+	if (cpumask_empty(cm)) {
+		cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU));
+		if (outgoingcpu >= 0)
+			cpumask_clear_cpu(outgoingcpu, cm);
+	}
+
+	if (task_exp)
+		set_cpus_allowed_ptr(task_exp, cm);
+
+	if (task_boost)
+		set_cpus_allowed_ptr(task_boost, cm);
+
+	mutex_unlock(&rnp->kthread_mutex);
 
-	rcu_boost_kthread_setaffinity(rdp->mynode, outgoing);
+	free_cpumask_var(cm);
 }
 
 /*
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 09bdd36ca9ffcc..08246cca663f0f 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1211,43 +1211,9 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
 	wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
 }
 
-/*
- * Set the per-rcu_node kthread's affinity to cover all CPUs that are
- * served by the rcu_node in question.  The CPU hotplug lock is still
- * held, so the value of rnp->qsmaskinit will be stable.
- *
- * We don't include outgoingcpu in the affinity set, use -1 if there is
- * no outgoing CPU.  If there are no CPUs left in the affinity set,
- * this function allows the kthread to execute on any CPU.
- *
- * Any future concurrent calls are serialized via ->kthread_mutex.
- */
-static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
+static struct task_struct *rcu_boost_task(struct rcu_node *rnp)
 {
-	struct task_struct *t = rnp->boost_kthread_task;
-	unsigned long mask;
-	cpumask_var_t cm;
-	int cpu;
-
-	if (!t)
-		return;
-	if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
-		return;
-	mutex_lock(&rnp->kthread_mutex);
-	mask = rcu_rnp_online_cpus(rnp);
-	for_each_leaf_node_possible_cpu(rnp, cpu)
-		if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
-		    cpu != outgoingcpu)
-			cpumask_set_cpu(cpu, cm);
-	cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU));
-	if (cpumask_empty(cm)) {
-		cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU));
-		if (outgoingcpu >= 0)
-			cpumask_clear_cpu(outgoingcpu, cm);
-	}
-	set_cpus_allowed_ptr(t, cm);
-	mutex_unlock(&rnp->kthread_mutex);
-	free_cpumask_var(cm);
+	return READ_ONCE(rnp->boost_kthread_task);
 }
 
 #else /* #ifdef CONFIG_RCU_BOOST */
@@ -1266,10 +1232,10 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
 {
 }
 
-static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
+static struct task_struct * rcu_boost_task(struct rcu_node *rnp)
 {
+	return NULL;
 }
-
 #endif /* #else #ifdef CONFIG_RCU_BOOST */
 
 /*

From 5c2905e35a3aa9a65e7f651a6a07a2bd2eb9449a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 12 Jan 2024 16:46:21 +0100
Subject: [PATCH 221/707] rcu/exp: Remove rcu_par_gp_wq

TREE04 running on short iterations can produce writer stalls of the
following kind:

 ??? Writer stall state RTWS_EXP_SYNC(4) g3968 f0x0 ->state 0x2 cpu 0
 task:rcu_torture_wri state:D stack:14568 pid:83    ppid:2      flags:0x00004000
 Call Trace:
  <TASK>
  __schedule+0x2de/0x850
  ? trace_event_raw_event_rcu_exp_funnel_lock+0x6d/0xb0
  schedule+0x4f/0x90
  synchronize_rcu_expedited+0x430/0x670
  ? __pfx_autoremove_wake_function+0x10/0x10
  ? __pfx_synchronize_rcu_expedited+0x10/0x10
  do_rtws_sync.constprop.0+0xde/0x230
  rcu_torture_writer+0x4b4/0xcd0
  ? __pfx_rcu_torture_writer+0x10/0x10
  kthread+0xc7/0xf0
  ? __pfx_kthread+0x10/0x10
  ret_from_fork+0x2f/0x50
  ? __pfx_kthread+0x10/0x10
  ret_from_fork_asm+0x1b/0x30
  </TASK>

Waiting for an expedited grace period and polling for an expedited
grace period both are operations that internally rely on the same
workqueue performing necessary asynchronous work.

However, a dependency chain is involved between those two operations,
as depicted below:

       ====== CPU 0 =======                          ====== CPU 1 =======

                                                     synchronize_rcu_expedited()
                                                         exp_funnel_lock()
                                                             mutex_lock(&rcu_state.exp_mutex);
    start_poll_synchronize_rcu_expedited
        queue_work(rcu_gp_wq, &rnp->exp_poll_wq);
                                                         synchronize_rcu_expedited_queue_work()
                                                             queue_work(rcu_gp_wq, &rew->rew_work);
                                                         wait_event() // A, wait for &rew->rew_work completion
                                                         mutex_unlock() // B
    //======> switch to kworker

    sync_rcu_do_polled_gp() {
        synchronize_rcu_expedited()
            exp_funnel_lock()
                mutex_lock(&rcu_state.exp_mutex); // C, wait B
                ....
    } // D

Since workqueues are usually implemented on top of several kworkers
handling the queue concurrently, the above situation wouldn't deadlock
most of the time because A then doesn't depend on D. But in case of
memory stress, a single kworker may end up handling alone all the works
in a serialized way. In that case the above layout becomes a problem
because A then waits for D, closing a circular dependency:

	A -> D -> C -> B -> A

This however only happens when CONFIG_RCU_EXP_KTHREAD=n. Indeed
synchronize_rcu_expedited() is otherwise implemented on top of a kthread
worker while polling still relies on rcu_gp_wq workqueue, breaking the
above circular dependency chain.

Fix this with making expedited grace period to always rely on kthread
worker. The workqueue based implementation is essentially a duplicate
anyway now that the per-node initialization is performed by per-node
kthread workers.

Meanwhile the CONFIG_RCU_EXP_KTHREAD switch is still kept around to
manage the scheduler policy of these kthread workers.

Reported-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Reported-by: Thomas Gleixner <tglx@linutronix.de>
Suggested-by: Joel Fernandes <joel@joelfernandes.org>
Suggested-by: Paul E. McKenney <paulmck@kernel.org>
Suggested-by: Neeraj upadhyay <Neeraj.Upadhyay@amd.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcu.h      |  4 ---
 kernel/rcu/tree.c     | 40 ++++--------------------
 kernel/rcu/tree.h     |  6 +---
 kernel/rcu/tree_exp.h | 73 +------------------------------------------
 4 files changed, 8 insertions(+), 115 deletions(-)

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 4bc8cd6d461e64..4e65a92e528e5c 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -623,11 +623,7 @@ int rcu_get_gp_kthreads_prio(void);
 void rcu_fwd_progress_check(unsigned long j);
 void rcu_force_quiescent_state(void);
 extern struct workqueue_struct *rcu_gp_wq;
-#ifdef CONFIG_RCU_EXP_KTHREAD
 extern struct kthread_worker *rcu_exp_gp_kworker;
-#else /* !CONFIG_RCU_EXP_KTHREAD */
-extern struct workqueue_struct *rcu_par_gp_wq;
-#endif /* CONFIG_RCU_EXP_KTHREAD */
 void rcu_gp_slow_register(atomic_t *rgssp);
 void rcu_gp_slow_unregister(atomic_t *rgssp);
 #endif /* #else #ifdef CONFIG_TINY_RCU */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 40bfc58f18213d..c8980d76f40292 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4403,7 +4403,6 @@ rcu_boot_init_percpu_data(int cpu)
 	rcu_boot_init_nocb_percpu_data(rdp);
 }
 
-#ifdef CONFIG_RCU_EXP_KTHREAD
 struct kthread_worker *rcu_exp_gp_kworker;
 
 static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp)
@@ -4423,7 +4422,9 @@ static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp)
 		return;
 	}
 	WRITE_ONCE(rnp->exp_kworker, kworker);
-	sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param);
+
+	if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD))
+		sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param);
 }
 
 static struct task_struct *rcu_exp_par_gp_task(struct rcu_node *rnp)
@@ -4447,39 +4448,14 @@ static void __init rcu_start_exp_gp_kworker(void)
 		rcu_exp_gp_kworker = NULL;
 		return;
 	}
-	sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, &param);
-}
-
-static inline void rcu_alloc_par_gp_wq(void)
-{
-}
-#else /* !CONFIG_RCU_EXP_KTHREAD */
-struct workqueue_struct *rcu_par_gp_wq;
-
-static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp)
-{
-}
-
-static struct task_struct *rcu_exp_par_gp_task(struct rcu_node *rnp)
-{
-	return NULL;
-}
-
-static void __init rcu_start_exp_gp_kworker(void)
-{
-}
 
-static inline void rcu_alloc_par_gp_wq(void)
-{
-	rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
-	WARN_ON(!rcu_par_gp_wq);
+	if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD))
+		sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, &param);
 }
-#endif /* CONFIG_RCU_EXP_KTHREAD */
 
 static void rcu_spawn_rnp_kthreads(struct rcu_node *rnp)
 {
-	if ((IS_ENABLED(CONFIG_RCU_EXP_KTHREAD) ||
-	     IS_ENABLED(CONFIG_RCU_BOOST)) && rcu_scheduler_fully_active) {
+	if (rcu_scheduler_fully_active) {
 		mutex_lock(&rnp->kthread_mutex);
 		rcu_spawn_one_boost_kthread(rnp);
 		rcu_spawn_exp_par_gp_kworker(rnp);
@@ -4563,9 +4539,6 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoingcpu)
 	struct rcu_node *rnp;
 	struct task_struct *task_boost, *task_exp;
 
-	if (!IS_ENABLED(CONFIG_RCU_EXP_KTHREAD) && !IS_ENABLED(CONFIG_RCU_BOOST))
-		return;
-
 	rdp = per_cpu_ptr(&rcu_data, cpu);
 	rnp = rdp->mynode;
 
@@ -5255,7 +5228,6 @@ void __init rcu_init(void)
 	/* Create workqueue for Tree SRCU and for expedited GPs. */
 	rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
 	WARN_ON(!rcu_gp_wq);
-	rcu_alloc_par_gp_wq();
 
 	/* Fill in default value for rcutree.qovld boot parameter. */
 	/* -After- the rcu_node ->lock fields are initialized! */
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index a0c02d631f448e..df48160b3136dc 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -21,14 +21,10 @@
 
 #include "rcu_segcblist.h"
 
-/* Communicate arguments to a workqueue handler. */
+/* Communicate arguments to a kthread worker handler. */
 struct rcu_exp_work {
 	unsigned long rew_s;
-#ifdef CONFIG_RCU_EXP_KTHREAD
 	struct kthread_work rew_work;
-#else
-	struct work_struct rew_work;
-#endif /* CONFIG_RCU_EXP_KTHREAD */
 };
 
 /* RCU's kthread states for tracing. */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 0318a8a062d5c0..6b83537480b12f 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -418,7 +418,6 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp)
 
 static void rcu_exp_sel_wait_wake(unsigned long s);
 
-#ifdef CONFIG_RCU_EXP_KTHREAD
 static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp)
 {
 	struct rcu_exp_work *rewp =
@@ -470,69 +469,6 @@ static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew
 	kthread_queue_work(rcu_exp_gp_kworker, &rew->rew_work);
 }
 
-static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew)
-{
-}
-#else /* !CONFIG_RCU_EXP_KTHREAD */
-static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
-{
-	struct rcu_exp_work *rewp =
-		container_of(wp, struct rcu_exp_work, rew_work);
-
-	__sync_rcu_exp_select_node_cpus(rewp);
-}
-
-static inline bool rcu_exp_worker_started(void)
-{
-	return !!READ_ONCE(rcu_gp_wq);
-}
-
-static inline bool rcu_exp_par_worker_started(struct rcu_node *rnp)
-{
-	return !!READ_ONCE(rcu_par_gp_wq);
-}
-
-static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp)
-{
-	int cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1);
-
-	INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
-	/* If all offline, queue the work on an unbound CPU. */
-	if (unlikely(cpu > rnp->grphi - rnp->grplo))
-		cpu = WORK_CPU_UNBOUND;
-	else
-		cpu += rnp->grplo;
-	queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work);
-}
-
-static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp)
-{
-	flush_work(&rnp->rew.rew_work);
-}
-
-/*
- * Work-queue handler to drive an expedited grace period forward.
- */
-static void wait_rcu_exp_gp(struct work_struct *wp)
-{
-	struct rcu_exp_work *rewp;
-
-	rewp = container_of(wp, struct rcu_exp_work, rew_work);
-	rcu_exp_sel_wait_wake(rewp->rew_s);
-}
-
-static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew)
-{
-	INIT_WORK_ONSTACK(&rew->rew_work, wait_rcu_exp_gp);
-	queue_work(rcu_gp_wq, &rew->rew_work);
-}
-
-static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew)
-{
-	destroy_work_on_stack(&rew->rew_work);
-}
-#endif /* CONFIG_RCU_EXP_KTHREAD */
-
 /*
  * Select the nodes that the upcoming expedited grace period needs
  * to wait for.
@@ -965,7 +901,6 @@ static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp)
  */
 void synchronize_rcu_expedited(void)
 {
-	bool use_worker;
 	unsigned long flags;
 	struct rcu_exp_work rew;
 	struct rcu_node *rnp;
@@ -976,9 +911,6 @@ void synchronize_rcu_expedited(void)
 			 lock_is_held(&rcu_sched_lock_map),
 			 "Illegal synchronize_rcu_expedited() in RCU read-side critical section");
 
-	use_worker = (rcu_scheduler_active != RCU_SCHEDULER_INIT) &&
-		      rcu_exp_worker_started();
-
 	/* Is the state is such that the call is a grace period? */
 	if (rcu_blocking_is_gp()) {
 		// Note well that this code runs with !PREEMPT && !SMP.
@@ -1008,7 +940,7 @@ void synchronize_rcu_expedited(void)
 		return;  /* Someone else did our work for us. */
 
 	/* Ensure that load happens before action based on it. */
-	if (unlikely(!use_worker)) {
+	if (unlikely((rcu_scheduler_active == RCU_SCHEDULER_INIT) || !rcu_exp_worker_started())) {
 		/* Direct call during scheduler init and early_initcalls(). */
 		rcu_exp_sel_wait_wake(s);
 	} else {
@@ -1025,9 +957,6 @@ void synchronize_rcu_expedited(void)
 
 	/* Let the next expedited grace period start. */
 	mutex_unlock(&rcu_state.exp_mutex);
-
-	if (likely(use_worker))
-		synchronize_rcu_expedited_destroy_work(&rew);
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
 

From fd3b5e371b1b8aa5bb050ba5bc2839feb58d95b9 Mon Sep 17 00:00:00 2001
From: Onkarnath <onkarnath.1@samsung.com>
Date: Thu, 11 Jan 2024 14:57:22 +0530
Subject: [PATCH 222/707] rcu/sync: remove un-used rcu_sync_enter_start
 function

With commit '6a010a49b63a ("cgroup: Make !percpu threadgroup_rwsem
operations optional")' usage of rcu_sync_enter_start is removed.

So this function can also be removed.

In the words of Oleg Nesterov:

	__rcu_sync_enter(wait => false) is a better alternative if
	someone needs rcu_sync_enter_start() again.

Link: https://lore.kernel.org/all/20220725121208.GB28662@redhat.com/
Signed-off-by: Onkarnath <onkarnath.1@samsung.com>
Signed-off-by: Maninder Singh <maninder1.s@samsung.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcu_sync.h |  1 -
 kernel/rcu/sync.c        | 16 ----------------
 2 files changed, 17 deletions(-)

diff --git a/include/linux/rcu_sync.h b/include/linux/rcu_sync.h
index 0027d4c8087c9a..3860dbb9107a21 100644
--- a/include/linux/rcu_sync.h
+++ b/include/linux/rcu_sync.h
@@ -37,7 +37,6 @@ static inline bool rcu_sync_is_idle(struct rcu_sync *rsp)
 }
 
 extern void rcu_sync_init(struct rcu_sync *);
-extern void rcu_sync_enter_start(struct rcu_sync *);
 extern void rcu_sync_enter(struct rcu_sync *);
 extern void rcu_sync_exit(struct rcu_sync *);
 extern void rcu_sync_dtor(struct rcu_sync *);
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index e550f97779b8dc..86df878a2fee8b 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -24,22 +24,6 @@ void rcu_sync_init(struct rcu_sync *rsp)
 	init_waitqueue_head(&rsp->gp_wait);
 }
 
-/**
- * rcu_sync_enter_start - Force readers onto slow path for multiple updates
- * @rsp: Pointer to rcu_sync structure to use for synchronization
- *
- * Must be called after rcu_sync_init() and before first use.
- *
- * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}()
- * pairs turn into NO-OPs.
- */
-void rcu_sync_enter_start(struct rcu_sync *rsp)
-{
-	rsp->gp_count++;
-	rsp->gp_state = GP_PASSED;
-}
-
-
 static void rcu_sync_func(struct rcu_head *rhp);
 
 static void rcu_sync_call(struct rcu_sync *rsp)

From 7b4989d7fbc92d7ff1d317e1293bea4a47aab18a Mon Sep 17 00:00:00 2001
From: Zqiang <qiang.zhang1211@gmail.com>
Date: Wed, 17 Jan 2024 18:26:16 +0800
Subject: [PATCH 223/707] rcu/nocb: Check rdp_gp->nocb_timer in
 __call_rcu_nocb_wake()

Currently, only rdp_gp->nocb_timer is used, for nocb_timer of
no-rdp_gp structure, the timer_pending() is always return false,
this commit therefore need to check rdp_gp->nocb_timer in
__call_rcu_nocb_wake().

Signed-off-by: Zqiang <qiang.zhang1211@gmail.com>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_nocb.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 54971afc3a9b25..3f85577bddd4ef 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -564,6 +564,7 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
 	long lazy_len;
 	long len;
 	struct task_struct *t;
+	struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
 
 	// If we are being polled or there is no kthread, just leave.
 	t = READ_ONCE(rdp->nocb_gp_kthread);
@@ -608,7 +609,7 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
 		smp_mb(); /* Enqueue before timer_pending(). */
 		if ((rdp->nocb_cb_sleep ||
 		     !rcu_segcblist_ready_cbs(&rdp->cblist)) &&
-		    !timer_pending(&rdp->nocb_timer)) {
+		    !timer_pending(&rdp_gp->nocb_timer)) {
 			rcu_nocb_unlock(rdp);
 			wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
 					   TPS("WakeOvfIsDeferred"));

From bc31e6cb27a9334140ff2f0a209d59b08bc0bc8c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Sat, 20 Jan 2024 07:07:08 -0800
Subject: [PATCH 224/707] rcu-tasks: Eliminate deadlocks involving do_exit()
 and RCU tasks

Holding a mutex across synchronize_rcu_tasks() and acquiring
that same mutex in code called from do_exit() after its call to
exit_tasks_rcu_start() but before its call to exit_tasks_rcu_stop()
results in deadlock.  This is by design, because tasks that are far
enough into do_exit() are no longer present on the tasks list, making
it a bit difficult for RCU Tasks to find them, let alone wait on them
to do a voluntary context switch.  However, such deadlocks are becoming
more frequent.  In addition, lockdep currently does not detect such
deadlocks and they can be difficult to reproduce.

In addition, if a task voluntarily context switches during that time
(for example, if it blocks acquiring a mutex), then this task is in an
RCU Tasks quiescent state.  And with some adjustments, RCU Tasks could
just as well take advantage of that fact.

This commit therefore eliminates these deadlock by replacing the
SRCU-based wait for do_exit() completion with per-CPU lists of tasks
currently exiting.  A given task will be on one of these per-CPU lists for
the same period of time that this task would previously have been in the
previous SRCU read-side critical section.  These lists enable RCU Tasks
to find the tasks that have already been removed from the tasks list,
but that must nevertheless be waited upon.

The RCU Tasks grace period gathers any of these do_exit() tasks that it
must wait on, and adds them to the list of holdouts.  Per-CPU locking
and get_task_struct() are used to synchronize addition to and removal
from these lists.

Link: https://lore.kernel.org/all/20240118021842.290665-1-chenzhongjin@huawei.com/

Reported-by: Chen Zhongjin <chenzhongjin@huawei.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/sched.h |  2 +
 init/init_task.c      |  1 +
 kernel/fork.c         |  1 +
 kernel/rcu/tasks.h    | 89 ++++++++++++++++++++++++++++++-------------
 4 files changed, 67 insertions(+), 26 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index cdb8ea53c365ba..4f0e9274da2de4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -858,6 +858,8 @@ struct task_struct {
 	u8				rcu_tasks_idx;
 	int				rcu_tasks_idle_cpu;
 	struct list_head		rcu_tasks_holdout_list;
+	int				rcu_tasks_exit_cpu;
+	struct list_head		rcu_tasks_exit_list;
 #endif /* #ifdef CONFIG_TASKS_RCU */
 
 #ifdef CONFIG_TASKS_TRACE_RCU
diff --git a/init/init_task.c b/init/init_task.c
index 7ecb458eb3da60..4daee6d761c86c 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -147,6 +147,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
 	.rcu_tasks_holdout = false,
 	.rcu_tasks_holdout_list = LIST_HEAD_INIT(init_task.rcu_tasks_holdout_list),
 	.rcu_tasks_idle_cpu = -1,
+	.rcu_tasks_exit_list = LIST_HEAD_INIT(init_task.rcu_tasks_exit_list),
 #endif
 #ifdef CONFIG_TASKS_TRACE_RCU
 	.trc_reader_nesting = 0,
diff --git a/kernel/fork.c b/kernel/fork.c
index 47ff3b35352e0b..3eb86f30e66418 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1975,6 +1975,7 @@ static inline void rcu_copy_process(struct task_struct *p)
 	p->rcu_tasks_holdout = false;
 	INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
 	p->rcu_tasks_idle_cpu = -1;
+	INIT_LIST_HEAD(&p->rcu_tasks_exit_list);
 #endif /* #ifdef CONFIG_TASKS_RCU */
 #ifdef CONFIG_TASKS_TRACE_RCU
 	p->trc_reader_nesting = 0;
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 732ad5b39946a5..bd4a51fd5b1fba 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -32,6 +32,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp);
  * @rtp_irq_work: IRQ work queue for deferred wakeups.
  * @barrier_q_head: RCU callback for barrier operation.
  * @rtp_blkd_tasks: List of tasks blocked as readers.
+ * @rtp_exit_list: List of tasks in the latter portion of do_exit().
  * @cpu: CPU number corresponding to this entry.
  * @rtpp: Pointer to the rcu_tasks structure.
  */
@@ -46,6 +47,7 @@ struct rcu_tasks_percpu {
 	struct irq_work rtp_irq_work;
 	struct rcu_head barrier_q_head;
 	struct list_head rtp_blkd_tasks;
+	struct list_head rtp_exit_list;
 	int cpu;
 	struct rcu_tasks *rtpp;
 };
@@ -144,8 +146,6 @@ static struct rcu_tasks rt_name =							\
 }
 
 #ifdef CONFIG_TASKS_RCU
-/* Track exiting tasks in order to allow them to be waited for. */
-DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu);
 
 /* Report delay in synchronize_srcu() completion in rcu_tasks_postscan(). */
 static void tasks_rcu_exit_srcu_stall(struct timer_list *unused);
@@ -275,6 +275,8 @@ static void cblist_init_generic(struct rcu_tasks *rtp)
 		rtpcp->rtpp = rtp;
 		if (!rtpcp->rtp_blkd_tasks.next)
 			INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks);
+		if (!rtpcp->rtp_exit_list.next)
+			INIT_LIST_HEAD(&rtpcp->rtp_exit_list);
 	}
 
 	pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d.\n", rtp->name,
@@ -851,10 +853,12 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
 //	number of voluntary context switches, and add that task to the
 //	holdout list.
 // rcu_tasks_postscan():
-//	Invoke synchronize_srcu() to ensure that all tasks that were
-//	in the process of exiting (and which thus might not know to
-//	synchronize with this RCU Tasks grace period) have completed
-//	exiting.
+//	Gather per-CPU lists of tasks in do_exit() to ensure that all
+//	tasks that were in the process of exiting (and which thus might
+//	not know to synchronize with this RCU Tasks grace period) have
+//	completed exiting.  The synchronize_rcu() in rcu_tasks_postgp()
+//	will take care of any tasks stuck in the non-preemptible region
+//	of do_exit() following its call to exit_tasks_rcu_stop().
 // check_all_holdout_tasks(), repeatedly until holdout list is empty:
 //	Scans the holdout list, attempting to identify a quiescent state
 //	for each task on the list.  If there is a quiescent state, the
@@ -867,8 +871,10 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
 //	with interrupts disabled.
 //
 // For each exiting task, the exit_tasks_rcu_start() and
-// exit_tasks_rcu_finish() functions begin and end, respectively, the SRCU
-// read-side critical sections waited for by rcu_tasks_postscan().
+// exit_tasks_rcu_finish() functions add and remove, respectively, the
+// current task to a per-CPU list of tasks that rcu_tasks_postscan() must
+// wait on.  This is necessary because rcu_tasks_postscan() must wait on
+// tasks that have already been removed from the global list of tasks.
 //
 // Pre-grace-period update-side code is ordered before the grace
 // via the raw_spin_lock.*rcu_node().  Pre-grace-period read-side code
@@ -932,9 +938,13 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop)
 	}
 }
 
+void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func);
+DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks");
+
 /* Processing between scanning taskslist and draining the holdout list. */
 static void rcu_tasks_postscan(struct list_head *hop)
 {
+	int cpu;
 	int rtsi = READ_ONCE(rcu_task_stall_info);
 
 	if (!IS_ENABLED(CONFIG_TINY_RCU)) {
@@ -948,9 +958,9 @@ static void rcu_tasks_postscan(struct list_head *hop)
 	 * this, divide the fragile exit path part in two intersecting
 	 * read side critical sections:
 	 *
-	 * 1) An _SRCU_ read side starting before calling exit_notify(),
-	 *    which may remove the task from the tasklist, and ending after
-	 *    the final preempt_disable() call in do_exit().
+	 * 1) A task_struct list addition before calling exit_notify(),
+	 *    which may remove the task from the tasklist, with the
+	 *    removal after the final preempt_disable() call in do_exit().
 	 *
 	 * 2) An _RCU_ read side starting with the final preempt_disable()
 	 *    call in do_exit() and ending with the final call to schedule()
@@ -959,7 +969,18 @@ static void rcu_tasks_postscan(struct list_head *hop)
 	 * This handles the part 1). And postgp will handle part 2) with a
 	 * call to synchronize_rcu().
 	 */
-	synchronize_srcu(&tasks_rcu_exit_srcu);
+
+	for_each_possible_cpu(cpu) {
+		unsigned long flags;
+		struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rcu_tasks.rtpcpu, cpu);
+		struct task_struct *t;
+
+		raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+		list_for_each_entry(t, &rtpcp->rtp_exit_list, rcu_tasks_exit_list)
+			if (list_empty(&t->rcu_tasks_holdout_list))
+				rcu_tasks_pertask(t, hop);
+		raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+	}
 
 	if (!IS_ENABLED(CONFIG_TINY_RCU))
 		del_timer_sync(&tasks_rcu_exit_srcu_stall_timer);
@@ -1027,7 +1048,6 @@ static void rcu_tasks_postgp(struct rcu_tasks *rtp)
 	 *
 	 * In addition, this synchronize_rcu() waits for exiting tasks
 	 * to complete their final preempt_disable() region of execution,
-	 * cleaning up after synchronize_srcu(&tasks_rcu_exit_srcu),
 	 * enforcing the whole region before tasklist removal until
 	 * the final schedule() with TASK_DEAD state to be an RCU TASKS
 	 * read side critical section.
@@ -1035,9 +1055,6 @@ static void rcu_tasks_postgp(struct rcu_tasks *rtp)
 	synchronize_rcu();
 }
 
-void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func);
-DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks");
-
 static void tasks_rcu_exit_srcu_stall(struct timer_list *unused)
 {
 #ifndef CONFIG_TINY_RCU
@@ -1147,25 +1164,45 @@ struct task_struct *get_rcu_tasks_gp_kthread(void)
 EXPORT_SYMBOL_GPL(get_rcu_tasks_gp_kthread);
 
 /*
- * Contribute to protect against tasklist scan blind spot while the
- * task is exiting and may be removed from the tasklist. See
- * corresponding synchronize_srcu() for further details.
+ * Protect against tasklist scan blind spot while the task is exiting and
+ * may be removed from the tasklist.  Do this by adding the task to yet
+ * another list.
  */
-void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu)
+void exit_tasks_rcu_start(void)
 {
-	current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu);
+	unsigned long flags;
+	struct rcu_tasks_percpu *rtpcp;
+	struct task_struct *t = current;
+
+	WARN_ON_ONCE(!list_empty(&t->rcu_tasks_exit_list));
+	get_task_struct(t);
+	preempt_disable();
+	rtpcp = this_cpu_ptr(rcu_tasks.rtpcpu);
+	t->rcu_tasks_exit_cpu = smp_processor_id();
+	raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+	if (!rtpcp->rtp_exit_list.next)
+		INIT_LIST_HEAD(&rtpcp->rtp_exit_list);
+	list_add(&t->rcu_tasks_exit_list, &rtpcp->rtp_exit_list);
+	raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+	preempt_enable();
 }
 
 /*
- * Contribute to protect against tasklist scan blind spot while the
- * task is exiting and may be removed from the tasklist. See
- * corresponding synchronize_srcu() for further details.
+ * Remove the task from the "yet another list" because do_exit() is now
+ * non-preemptible, allowing synchronize_rcu() to wait beyond this point.
  */
-void exit_tasks_rcu_stop(void) __releases(&tasks_rcu_exit_srcu)
+void exit_tasks_rcu_stop(void)
 {
+	unsigned long flags;
+	struct rcu_tasks_percpu *rtpcp;
 	struct task_struct *t = current;
 
-	__srcu_read_unlock(&tasks_rcu_exit_srcu, t->rcu_tasks_idx);
+	WARN_ON_ONCE(list_empty(&t->rcu_tasks_exit_list));
+	rtpcp = per_cpu_ptr(rcu_tasks.rtpcpu, t->rcu_tasks_exit_cpu);
+	raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+	list_del_init(&t->rcu_tasks_exit_list);
+	raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+	put_task_struct(t);
 }
 
 /*

From c931eafc5a82082e3b5ff429024af07f4886a8de Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Mon, 22 Jan 2024 09:02:47 -0500
Subject: [PATCH 225/707] Bluetooth: hci_event: Fix handling of
 HCI_EV_IO_CAPA_REQUEST

If we received HCI_EV_IO_CAPA_REQUEST while
HCI_OP_READ_REMOTE_EXT_FEATURES is yet to be responded assume the remote
does support SSP since otherwise this event shouldn't be generated.

Link: https://lore.kernel.org/linux-bluetooth/CABBYNZ+9UdG1cMZVmdtN3U2aS16AKMCyTARZZyFX7xTEDWcMOw@mail.gmail.com/T/#t
Fixes: c7f59461f5a7 ("Bluetooth: Fix a refcnt underflow problem for hci_conn")
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/hci_event.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 6130c969f361a7..a15924db83d9fe 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -5327,9 +5327,12 @@ static void hci_io_capa_request_evt(struct hci_dev *hdev, void *data,
 	hci_dev_lock(hdev);
 
 	conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
-	if (!conn || !hci_conn_ssp_enabled(conn))
+	if (!conn || !hci_dev_test_flag(hdev, HCI_SSP_ENABLED))
 		goto unlock;
 
+	/* Assume remote supports SSP since it has triggered this event */
+	set_bit(HCI_CONN_SSP_ENABLED, &conn->flags);
+
 	hci_conn_hold(conn);
 
 	if (!hci_dev_test_flag(hdev, HCI_MGMT))

From 9434e62334d5030e442c2be1fd96aad8816052a5 Mon Sep 17 00:00:00 2001
From: Kai-Heng Feng <kai.heng.feng@canonical.com>
Date: Thu, 25 Jan 2024 14:50:28 +0800
Subject: [PATCH 226/707] Bluetooth: Enforce validation on max value of
 connection interval

Right now Linux BT stack cannot pass test case "GAP/CONN/CPUP/BV-05-C
'Connection Parameter Update Procedure Invalid Parameters Central
Responder'" in Bluetooth Test Suite revision GAP.TS.p44. [0]

That was revoled by commit c49a8682fc5d ("Bluetooth: validate BLE
connection interval updates"), but later got reverted due to devices
like keyboards and mice may require low connection interval.

So only validate the max value connection interval to pass the Test
Suite, and let devices to request low connection interval if needed.

[0] https://www.bluetooth.org/docman/handlers/DownloadDoc.ashx?doc_id=229869
Fixes: 68d19d7d9957 ("Revert "Bluetooth: validate BLE connection interval updates"")

Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/hci_event.c  | 4 ++++
 net/bluetooth/l2cap_core.c | 8 +++++++-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index a15924db83d9fe..31df5f5b799455 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -6795,6 +6795,10 @@ static void hci_le_remote_conn_param_req_evt(struct hci_dev *hdev, void *data,
 		return send_conn_param_neg_reply(hdev, handle,
 						 HCI_ERROR_UNKNOWN_CONN_ID);
 
+	if (max > hcon->le_conn_max_interval)
+		return send_conn_param_neg_reply(hdev, handle,
+						 HCI_ERROR_INVALID_LL_PARAMS);
+
 	if (hci_check_conn_params(min, max, latency, timeout))
 		return send_conn_param_neg_reply(hdev, handle,
 						 HCI_ERROR_INVALID_LL_PARAMS);
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 60298975d5c456..656f49b299d20d 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -5613,7 +5613,13 @@ static inline int l2cap_conn_param_update_req(struct l2cap_conn *conn,
 
 	memset(&rsp, 0, sizeof(rsp));
 
-	err = hci_check_conn_params(min, max, latency, to_multiplier);
+	if (max > hcon->le_conn_max_interval) {
+		BT_DBG("requested connection interval exceeds current bounds.");
+		err = -EINVAL;
+	} else {
+		err = hci_check_conn_params(min, max, latency, to_multiplier);
+	}
+
 	if (err)
 		rsp.result = cpu_to_le16(L2CAP_CONN_PARAM_REJECTED);
 	else

From 693a94db9e8cff14cce892cba6818bc67ab51ec4 Mon Sep 17 00:00:00 2001
From: Edward Adam Davis <eadavis@qq.com>
Date: Thu, 18 Jan 2024 12:40:34 +0800
Subject: [PATCH 227/707] Bluetooth: btintel: Fix null ptr deref in
 btintel_read_version

If hci_cmd_sync_complete() is triggered and skb is NULL, then
hdev->req_skb is NULL, which will cause this issue.

Reported-and-tested-by: syzbot+830d9e3fa61968246abd@syzkaller.appspotmail.com
Signed-off-by: Edward Adam Davis <eadavis@qq.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 drivers/bluetooth/btintel.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/bluetooth/btintel.c b/drivers/bluetooth/btintel.c
index cdc5c08824a0ad..e5b043d9620730 100644
--- a/drivers/bluetooth/btintel.c
+++ b/drivers/bluetooth/btintel.c
@@ -435,7 +435,7 @@ int btintel_read_version(struct hci_dev *hdev, struct intel_version *ver)
 	struct sk_buff *skb;
 
 	skb = __hci_cmd_sync(hdev, 0xfc05, 0, NULL, HCI_CMD_TIMEOUT);
-	if (IS_ERR(skb)) {
+	if (IS_ERR_OR_NULL(skb)) {
 		bt_dev_err(hdev, "Reading Intel version information failed (%ld)",
 			   PTR_ERR(skb));
 		return PTR_ERR(skb);

From 64692e12507b3efd71b4ff5596c9742d91f1ffe5 Mon Sep 17 00:00:00 2001
From: Zijun Hu <quic_zijuhu@quicinc.com>
Date: Fri, 19 Jan 2024 17:45:30 +0800
Subject: [PATCH 228/707] Bluetooth: qca: Fix wrong event type for patch config
 command

Vendor-specific command patch config has HCI_Command_Complete event as
response, but qca_send_patch_config_cmd() wrongly expects vendor-specific
event for the command, fixed by using right event type.

Btmon log for the vendor-specific command are shown below:
< HCI Command: Vendor (0x3f|0x0000) plen 5
        28 01 00 00 00
> HCI Event: Command Complete (0x0e) plen 5
      Vendor (0x3f|0x0000) ncmd 1
        Status: Success (0x00)
        28

Fixes: 4fac8a7ac80b ("Bluetooth: btqca: sequential validation")
Signed-off-by: Zijun Hu <quic_zijuhu@quicinc.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 drivers/bluetooth/btqca.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/bluetooth/btqca.c b/drivers/bluetooth/btqca.c
index fdb0fae88d1c58..b40b32fa7f1c38 100644
--- a/drivers/bluetooth/btqca.c
+++ b/drivers/bluetooth/btqca.c
@@ -152,7 +152,7 @@ static int qca_send_patch_config_cmd(struct hci_dev *hdev)
 	bt_dev_dbg(hdev, "QCA Patch config");
 
 	skb = __hci_cmd_sync_ev(hdev, EDL_PATCH_CMD_OPCODE, sizeof(cmd),
-				cmd, HCI_EV_VENDOR, HCI_INIT_TIMEOUT);
+				cmd, 0, HCI_INIT_TIMEOUT);
 	if (IS_ERR(skb)) {
 		err = PTR_ERR(skb);
 		bt_dev_err(hdev, "Sending QCA Patch config failed (%d)", err);

From fb09ad63d798533d4a3a338513c1ef1a547955f0 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Fri, 12 Jan 2024 09:11:13 -0800
Subject: [PATCH 229/707] f2fs: remove unnecessary f2fs_put_page in f2fs_rename

[1] changed the below condition, which made f2fs_put_page() voided.
This patch reapplies the AL's resolution in -next from [2].

-       if (S_ISDIR(old_inode->i_mode)) {
+       if (old_is_dir && old_dir != new_dir) {
                old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page);
                if (!old_dir_entry) {
                        if (IS_ERR(old_dir_page))

[1] 7deee77b993a ("f2fs: Avoid reading renamed directory if parent does not change")
[2] https://lore.kernel.org/all/20231220013402.GW1674809@ZenIV/

Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/namei.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index b3bb815fc6aa45..ba11298b78379a 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -1105,14 +1105,11 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
 		iput(whiteout);
 	}
 
-	if (old_is_dir) {
-		if (old_dir_entry)
-			f2fs_set_link(old_inode, old_dir_entry,
-						old_dir_page, new_dir);
-		else
-			f2fs_put_page(old_dir_page, 0);
+	if (old_dir_entry)
+		f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir);
+	if (old_is_dir)
 		f2fs_i_links_write(old_dir, false);
-	}
+
 	if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) {
 		f2fs_add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO);
 		if (S_ISDIR(old_inode->i_mode))

From e2f29120ff1f25b3bd59310b0443ce7f049c5f0a Mon Sep 17 00:00:00 2001
From: Wu Bo <bo.wu@vivo.com>
Date: Thu, 28 Dec 2023 20:25:07 -0700
Subject: [PATCH 230/707] f2fs: check free sections before disable checkpoint

'f2fs_is_checkpoint_ready()' checks free sections. If there is not
enough free sections, most f2fs operations will return -ENOSPC when
checkpoint is disabled.

It would be better to check free sections before disable checkpoint.

Signed-off-by: Wu Bo <bo.wu@vivo.com>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/segment.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 4c8836ded90fc2..f0f12b1eddc8b0 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -906,6 +906,8 @@ int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable)
 	if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK) &&
 		dirty_segments(sbi) > ovp_hole_segs)
 		return -EAGAIN;
+	if (has_not_enough_free_secs(sbi, 0, 0))
+		return -EAGAIN;
 	return 0;
 }
 

From fe2b98bcae7e5120134a3c44051c51fade60fa15 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao@kernel.org>
Date: Sat, 13 Jan 2024 03:41:27 +0800
Subject: [PATCH 231/707] f2fs: compress: fix to guarantee persisting
 compressed blocks by CP

If data block in compressed cluster is not persisted with metadata
during checkpoint, after SPOR, the data may be corrupted, let's
guarantee to write compressed page by checkpoint.

Fixes: 4c8ff7095bef ("f2fs: support data compression")
Reviewed-by: Daeho Jeong <daehojeong@google.com>
Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/compress.c |  4 +++-
 fs/f2fs/data.c     | 17 +++++++++--------
 fs/f2fs/f2fs.h     |  4 +++-
 3 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 531517dac07967..3a8d8a213b4063 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -1418,6 +1418,8 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page)
 	struct f2fs_sb_info *sbi = bio->bi_private;
 	struct compress_io_ctx *cic =
 			(struct compress_io_ctx *)page_private(page);
+	enum count_type type = WB_DATA_TYPE(page,
+				f2fs_is_compressed_page(page));
 	int i;
 
 	if (unlikely(bio->bi_status))
@@ -1425,7 +1427,7 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page)
 
 	f2fs_compress_free_page(page);
 
-	dec_page_count(sbi, F2FS_WB_DATA);
+	dec_page_count(sbi, type);
 
 	if (atomic_dec_return(&cic->pending_pages))
 		return;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 26e317696b3389..d00e92b6c90250 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -48,7 +48,7 @@ void f2fs_destroy_bioset(void)
 	bioset_exit(&f2fs_bioset);
 }
 
-static bool __is_cp_guaranteed(struct page *page)
+bool f2fs_is_cp_guaranteed(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 	struct inode *inode;
@@ -65,8 +65,6 @@ static bool __is_cp_guaranteed(struct page *page)
 			S_ISDIR(inode->i_mode))
 		return true;
 
-	if (f2fs_is_compressed_page(page))
-		return false;
 	if ((S_ISREG(inode->i_mode) && IS_NOQUOTA(inode)) ||
 			page_private_gcing(page))
 		return true;
@@ -338,7 +336,7 @@ static void f2fs_write_end_io(struct bio *bio)
 
 	bio_for_each_segment_all(bvec, bio, iter_all) {
 		struct page *page = bvec->bv_page;
-		enum count_type type = WB_DATA_TYPE(page);
+		enum count_type type = WB_DATA_TYPE(page, false);
 
 		if (page_private_dummy(page)) {
 			clear_page_private_dummy(page);
@@ -762,7 +760,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
 		wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
 
 	inc_page_count(fio->sbi, is_read_io(fio->op) ?
-			__read_io_type(page) : WB_DATA_TYPE(fio->page));
+			__read_io_type(page) : WB_DATA_TYPE(fio->page, false));
 
 	if (is_read_io(bio_op(bio)))
 		f2fs_submit_read_bio(fio->sbi, bio, fio->type);
@@ -973,7 +971,7 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio)
 	if (fio->io_wbc)
 		wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
 
-	inc_page_count(fio->sbi, WB_DATA_TYPE(page));
+	inc_page_count(fio->sbi, WB_DATA_TYPE(page, false));
 
 	*fio->last_block = fio->new_blkaddr;
 	*fio->bio = bio;
@@ -1007,6 +1005,7 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio)
 	enum page_type btype = PAGE_TYPE_OF_BIO(fio->type);
 	struct f2fs_bio_info *io = sbi->write_io[btype] + fio->temp;
 	struct page *bio_page;
+	enum count_type type;
 
 	f2fs_bug_on(sbi, is_read_io(fio->op));
 
@@ -1046,7 +1045,8 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio)
 	/* set submitted = true as a return value */
 	fio->submitted = 1;
 
-	inc_page_count(sbi, WB_DATA_TYPE(bio_page));
+	type = WB_DATA_TYPE(bio_page, fio->compressed_page);
+	inc_page_count(sbi, type);
 
 	if (io->bio &&
 	    (!io_is_mergeable(sbi, io->bio, io, fio, io->last_block_in_bio,
@@ -1059,7 +1059,8 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio)
 		if (F2FS_IO_ALIGNED(sbi) &&
 				(fio->type == DATA || fio->type == NODE) &&
 				fio->new_blkaddr & F2FS_IO_SIZE_MASK(sbi)) {
-			dec_page_count(sbi, WB_DATA_TYPE(bio_page));
+			dec_page_count(sbi, WB_DATA_TYPE(bio_page,
+						fio->compressed_page));
 			fio->retry = 1;
 			goto skip;
 		}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 65294e3b0bef88..50f3d546ded858 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1080,7 +1080,8 @@ struct f2fs_sm_info {
  * f2fs monitors the number of several block types such as on-writeback,
  * dirty dentry blocks, dirty node blocks, and dirty meta blocks.
  */
-#define WB_DATA_TYPE(p)	(__is_cp_guaranteed(p) ? F2FS_WB_CP_DATA : F2FS_WB_DATA)
+#define WB_DATA_TYPE(p, f)			\
+	(f || f2fs_is_cp_guaranteed(p) ? F2FS_WB_CP_DATA : F2FS_WB_DATA)
 enum count_type {
 	F2FS_DIRTY_DENTS,
 	F2FS_DIRTY_DATA,
@@ -3804,6 +3805,7 @@ void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi);
  */
 int __init f2fs_init_bioset(void);
 void f2fs_destroy_bioset(void);
+bool f2fs_is_cp_guaranteed(struct page *page);
 int f2fs_init_bio_entry_cache(void);
 void f2fs_destroy_bio_entry_cache(void);
 void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio,

From bd0dddafd4a51e15349f6dfbaf0f052be3334f25 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao@kernel.org>
Date: Sat, 13 Jan 2024 03:41:28 +0800
Subject: [PATCH 232/707] f2fs: compress: fix to cover normal cluster write
 with cp_rwsem

When we overwrite compressed cluster w/ normal cluster, we should
not unlock cp_rwsem during f2fs_write_raw_pages(), otherwise data
will be corrupted if partial blocks were persisted before CP & SPOR,
due to cluster metadata wasn't updated atomically.

Fixes: 4c8ff7095bef ("f2fs: support data compression")
Reviewed-by: Daeho Jeong <daehojeong@google.com>
Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/compress.c | 27 ++++++++++++++++++---------
 fs/f2fs/data.c     |  3 ++-
 2 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 3a8d8a213b4063..ff26b49c0d71ff 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -1443,12 +1443,14 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page)
 }
 
 static int f2fs_write_raw_pages(struct compress_ctx *cc,
-					int *submitted,
+					int *submitted_p,
 					struct writeback_control *wbc,
 					enum iostat_type io_type)
 {
 	struct address_space *mapping = cc->inode->i_mapping;
-	int _submitted, compr_blocks, ret, i;
+	struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
+	int submitted, compr_blocks, i;
+	int ret = 0;
 
 	compr_blocks = f2fs_compressed_blocks(cc);
 
@@ -1463,6 +1465,10 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc,
 	if (compr_blocks < 0)
 		return compr_blocks;
 
+	/* overwrite compressed cluster w/ normal cluster */
+	if (compr_blocks > 0)
+		f2fs_lock_op(sbi);
+
 	for (i = 0; i < cc->cluster_size; i++) {
 		if (!cc->rpages[i])
 			continue;
@@ -1487,7 +1493,7 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc,
 		if (!clear_page_dirty_for_io(cc->rpages[i]))
 			goto continue_unlock;
 
-		ret = f2fs_write_single_data_page(cc->rpages[i], &_submitted,
+		ret = f2fs_write_single_data_page(cc->rpages[i], &submitted,
 						NULL, NULL, wbc, io_type,
 						compr_blocks, false);
 		if (ret) {
@@ -1495,26 +1501,29 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc,
 				unlock_page(cc->rpages[i]);
 				ret = 0;
 			} else if (ret == -EAGAIN) {
+				ret = 0;
 				/*
 				 * for quota file, just redirty left pages to
 				 * avoid deadlock caused by cluster update race
 				 * from foreground operation.
 				 */
 				if (IS_NOQUOTA(cc->inode))
-					return 0;
-				ret = 0;
+					goto out;
 				f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
 				goto retry_write;
 			}
-			return ret;
+			goto out;
 		}
 
-		*submitted += _submitted;
+		*submitted_p += submitted;
 	}
 
-	f2fs_balance_fs(F2FS_M_SB(mapping), true);
+out:
+	if (compr_blocks > 0)
+		f2fs_unlock_op(sbi);
 
-	return 0;
+	f2fs_balance_fs(sbi, true);
+	return ret;
 }
 
 int f2fs_write_multi_pages(struct compress_ctx *cc,
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index d00e92b6c90250..7a93a99fbd04be 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -2839,7 +2839,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
 		.encrypted_page = NULL,
 		.submitted = 0,
 		.compr_blocks = compr_blocks,
-		.need_lock = LOCK_RETRY,
+		.need_lock = compr_blocks ? LOCK_DONE : LOCK_RETRY,
 		.post_read = f2fs_post_read_required(inode) ? 1 : 0,
 		.io_type = io_type,
 		.io_wbc = wbc,
@@ -2920,6 +2920,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
 	if (err == -EAGAIN) {
 		err = f2fs_do_write_data_page(&fio);
 		if (err == -EAGAIN) {
+			f2fs_bug_on(sbi, compr_blocks);
 			fio.need_lock = LOCK_REQ;
 			err = f2fs_do_write_data_page(&fio);
 		}

From 787f217e7f04b175794390572a8f528cb1840be5 Mon Sep 17 00:00:00 2001
From: Sheng Yong <shengyong@oppo.com>
Date: Sat, 13 Jan 2024 03:41:29 +0800
Subject: [PATCH 233/707] f2fs: compress: fix to check unreleased compressed
 cluster

Compressed cluster may not be released due to we can fail in
release_compress_blocks(), fix to handle reserved compressed
cluster correctly in reserve_compress_blocks().

Fixes: 4c8ff7095bef ("f2fs: support data compression")
Signed-off-by: Sheng Yong <shengyong@oppo.com>
Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/file.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index b58ab1157b7ef2..941e02c0953ccf 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -3624,7 +3624,13 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count)
 				goto next;
 			}
 
-			if (__is_valid_data_blkaddr(blkaddr)) {
+			/*
+			 * compressed cluster was not released due to it
+			 * fails in release_compress_blocks(), so NEW_ADDR
+			 * is a possible case.
+			 */
+			if (blkaddr == NEW_ADDR ||
+				__is_valid_data_blkaddr(blkaddr)) {
 				compr_blocks++;
 				continue;
 			}
@@ -3633,6 +3639,11 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count)
 		}
 
 		reserved = cluster_size - compr_blocks;
+
+		/* for the case all blocks in cluster were reserved */
+		if (reserved == 1)
+			goto next;
+
 		ret = inc_valid_block_count(sbi, dn->inode, &reserved);
 		if (ret)
 			return ret;

From 6d05c8d5997ac3543c87f0a8d9a1b30ec3cc860f Mon Sep 17 00:00:00 2001
From: Chao Yu <chao@kernel.org>
Date: Sat, 13 Jan 2024 03:41:30 +0800
Subject: [PATCH 234/707] f2fs: compress: fix to avoid inconsistence bewteen
 i_blocks and dnode

In reserve_compress_blocks(), we update blkaddrs of dnode in prior to
inc_valid_block_count(), it may cause inconsistent status bewteen
i_blocks and blkaddrs once inc_valid_block_count() fails.

To fix this issue, it needs to reverse their invoking order.

Fixes: c75488fb4d82 ("f2fs: introduce F2FS_IOC_RESERVE_COMPRESS_BLOCKS")
Reviewed-by: Daeho Jeong <daehojeong@google.com>
Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c    |  5 +++--
 fs/f2fs/f2fs.h    |  7 ++++++-
 fs/f2fs/file.c    | 26 ++++++++++++++------------
 fs/f2fs/segment.c |  2 +-
 4 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 7a93a99fbd04be..65fe48bb17d16b 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1219,7 +1219,8 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count)
 
 	if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC)))
 		return -EPERM;
-	if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count))))
+	err = inc_valid_block_count(sbi, dn->inode, &count, true);
+	if (unlikely(err))
 		return err;
 
 	trace_f2fs_reserve_new_blocks(dn->inode, dn->nid,
@@ -1476,7 +1477,7 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type)
 
 	dn->data_blkaddr = f2fs_data_blkaddr(dn);
 	if (dn->data_blkaddr == NULL_ADDR) {
-		err = inc_valid_block_count(sbi, dn->inode, &count);
+		err = inc_valid_block_count(sbi, dn->inode, &count, true);
 		if (unlikely(err))
 			return err;
 	}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 50f3d546ded858..69e71460a9502d 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -2252,7 +2252,7 @@ static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi,
 
 static inline void f2fs_i_blocks_write(struct inode *, block_t, bool, bool);
 static inline int inc_valid_block_count(struct f2fs_sb_info *sbi,
-				 struct inode *inode, blkcnt_t *count)
+				 struct inode *inode, blkcnt_t *count, bool partial)
 {
 	blkcnt_t diff = 0, release = 0;
 	block_t avail_user_block_count;
@@ -2292,6 +2292,11 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi,
 			avail_user_block_count = 0;
 	}
 	if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) {
+		if (!partial) {
+			spin_unlock(&sbi->stat_lock);
+			goto enospc;
+		}
+
 		diff = sbi->total_valid_block_count - avail_user_block_count;
 		if (diff > *count)
 			diff = *count;
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 941e02c0953ccf..1ff1c45e192711 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -3614,14 +3614,16 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count)
 		blkcnt_t reserved;
 		int ret;
 
-		for (i = 0; i < cluster_size; i++, dn->ofs_in_node++) {
-			blkaddr = f2fs_data_blkaddr(dn);
+		for (i = 0; i < cluster_size; i++) {
+			blkaddr = data_blkaddr(dn->inode, dn->node_page,
+						dn->ofs_in_node + i);
 
 			if (i == 0) {
-				if (blkaddr == COMPRESS_ADDR)
-					continue;
-				dn->ofs_in_node += cluster_size;
-				goto next;
+				if (blkaddr != COMPRESS_ADDR) {
+					dn->ofs_in_node += cluster_size;
+					goto next;
+				}
+				continue;
 			}
 
 			/*
@@ -3634,8 +3636,6 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count)
 				compr_blocks++;
 				continue;
 			}
-
-			f2fs_set_data_blkaddr(dn, NEW_ADDR);
 		}
 
 		reserved = cluster_size - compr_blocks;
@@ -3644,12 +3644,14 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count)
 		if (reserved == 1)
 			goto next;
 
-		ret = inc_valid_block_count(sbi, dn->inode, &reserved);
-		if (ret)
+		ret = inc_valid_block_count(sbi, dn->inode, &reserved, false);
+		if (unlikely(ret))
 			return ret;
 
-		if (reserved != cluster_size - compr_blocks)
-			return -ENOSPC;
+		for (i = 0; i < cluster_size; i++, dn->ofs_in_node++) {
+			if (f2fs_data_blkaddr(dn) == NULL_ADDR)
+				f2fs_set_data_blkaddr(dn, NEW_ADDR);
+		}
 
 		f2fs_i_compr_blocks_update(dn->inode, compr_blocks, true);
 
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index f0f12b1eddc8b0..7901ede5811357 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -248,7 +248,7 @@ static int __replace_atomic_write_block(struct inode *inode, pgoff_t index,
 	} else {
 		blkcnt_t count = 1;
 
-		err = inc_valid_block_count(sbi, inode, &count);
+		err = inc_valid_block_count(sbi, inode, &count, true);
 		if (err) {
 			f2fs_put_dnode(&dn);
 			return err;

From ef62f2496f99fd736daef4622ec43175a360bd08 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao@kernel.org>
Date: Sat, 13 Jan 2024 03:41:31 +0800
Subject: [PATCH 235/707] f2fs: fix to remove unnecessary f2fs_bug_on() to
 avoid panic

verify_blkaddr() will trigger panic once we inject fault into
f2fs_is_valid_blkaddr(), fix to remove this unnecessary f2fs_bug_on().

Fixes: 18792e64c86d ("f2fs: support fault injection for f2fs_is_valid_blkaddr()")
Reviewed-by: Daeho Jeong <daehojeong@google.com>
Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/f2fs.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 69e71460a9502d..ab710bb6d8b32e 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -3470,11 +3470,9 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
 static inline void verify_blkaddr(struct f2fs_sb_info *sbi,
 					block_t blkaddr, int type)
 {
-	if (!f2fs_is_valid_blkaddr(sbi, blkaddr, type)) {
+	if (!f2fs_is_valid_blkaddr(sbi, blkaddr, type))
 		f2fs_err(sbi, "invalid blkaddr: %u, type: %d, run fsck to fix.",
 			 blkaddr, type);
-		f2fs_bug_on(sbi, 1);
-	}
 }
 
 static inline bool __is_valid_data_blkaddr(block_t blkaddr)

From e2c2cb1a331fc09cedcb8b395a6b70acf2c33815 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao@kernel.org>
Date: Sat, 13 Jan 2024 03:41:32 +0800
Subject: [PATCH 236/707] f2fs: introduce FAULT_BLKADDR_CONSISTENCE

We will encounter below inconsistent status when FAULT_BLKADDR type
fault injection is on.

Info: checkpoint state = d6 :  nat_bits crc fsck compacted_summary orphan_inodes sudden-power-off
[ASSERT] (fsck_chk_inode_blk:1254)  --> ino: 0x1c100 has i_blocks: 000000c0, but has 191 blocks
[FIX] (fsck_chk_inode_blk:1260)  --> [0x1c100] i_blocks=0x000000c0 -> 0xbf
[FIX] (fsck_chk_inode_blk:1269)  --> [0x1c100] i_compr_blocks=0x00000026 -> 0x27
[ASSERT] (fsck_chk_inode_blk:1254)  --> ino: 0x1cadb has i_blocks: 0000002f, but has 46 blocks
[FIX] (fsck_chk_inode_blk:1260)  --> [0x1cadb] i_blocks=0x0000002f -> 0x2e
[FIX] (fsck_chk_inode_blk:1269)  --> [0x1cadb] i_compr_blocks=0x00000011 -> 0x12
[ASSERT] (fsck_chk_inode_blk:1254)  --> ino: 0x1c62c has i_blocks: 00000002, but has 1 blocks
[FIX] (fsck_chk_inode_blk:1260)  --> [0x1c62c] i_blocks=0x00000002 -> 0x1

After we inject fault into f2fs_is_valid_blkaddr() during truncation,
a) it missed to increase @nr_free or @valid_blocks
b) it can cause in blkaddr leak in truncated dnode
Which may cause inconsistent status.

This patch separates FAULT_BLKADDR_CONSISTENCE from FAULT_BLKADDR,
and rename FAULT_BLKADDR to FAULT_BLKADDR_VALIDITY
so that we can:
a) use FAULT_BLKADDR_CONSISTENCE in f2fs_truncate_data_blocks_range()
to simulate inconsistent issue independently, then it can verify fsck
repair flow.
b) FAULT_BLKADDR_VALIDITY fault will not cause any inconsistent status,
we can just use it to check error path handling in kernel side.

Reviewed-by: Daeho Jeong <daehojeong@google.com>
Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 Documentation/ABI/testing/sysfs-fs-f2fs | 47 +++++++++++++------------
 Documentation/filesystems/f2fs.rst      | 47 +++++++++++++------------
 fs/f2fs/checkpoint.c                    | 19 +++++++---
 fs/f2fs/f2fs.h                          |  5 ++-
 fs/f2fs/file.c                          |  8 +++--
 fs/f2fs/super.c                         | 37 +++++++++----------
 6 files changed, 92 insertions(+), 71 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs
index 99fa87a43926e6..48c135e24eb57d 100644
--- a/Documentation/ABI/testing/sysfs-fs-f2fs
+++ b/Documentation/ABI/testing/sysfs-fs-f2fs
@@ -701,29 +701,30 @@ Description:	Support configuring fault injection type, should be
 		enabled with fault_injection option, fault type value
 		is shown below, it supports single or combined type.
 
-		===================      ===========
-		Type_Name                Type_Value
-		===================      ===========
-		FAULT_KMALLOC            0x000000001
-		FAULT_KVMALLOC           0x000000002
-		FAULT_PAGE_ALLOC         0x000000004
-		FAULT_PAGE_GET           0x000000008
-		FAULT_ALLOC_BIO          0x000000010 (obsolete)
-		FAULT_ALLOC_NID          0x000000020
-		FAULT_ORPHAN             0x000000040
-		FAULT_BLOCK              0x000000080
-		FAULT_DIR_DEPTH          0x000000100
-		FAULT_EVICT_INODE        0x000000200
-		FAULT_TRUNCATE           0x000000400
-		FAULT_READ_IO            0x000000800
-		FAULT_CHECKPOINT         0x000001000
-		FAULT_DISCARD            0x000002000
-		FAULT_WRITE_IO           0x000004000
-		FAULT_SLAB_ALLOC         0x000008000
-		FAULT_DQUOT_INIT         0x000010000
-		FAULT_LOCK_OP            0x000020000
-		FAULT_BLKADDR            0x000040000
-		===================      ===========
+		===========================      ===========
+		Type_Name                        Type_Value
+		===========================      ===========
+		FAULT_KMALLOC                    0x000000001
+		FAULT_KVMALLOC                   0x000000002
+		FAULT_PAGE_ALLOC                 0x000000004
+		FAULT_PAGE_GET                   0x000000008
+		FAULT_ALLOC_BIO                  0x000000010 (obsolete)
+		FAULT_ALLOC_NID                  0x000000020
+		FAULT_ORPHAN                     0x000000040
+		FAULT_BLOCK                      0x000000080
+		FAULT_DIR_DEPTH                  0x000000100
+		FAULT_EVICT_INODE                0x000000200
+		FAULT_TRUNCATE                   0x000000400
+		FAULT_READ_IO                    0x000000800
+		FAULT_CHECKPOINT                 0x000001000
+		FAULT_DISCARD                    0x000002000
+		FAULT_WRITE_IO                   0x000004000
+		FAULT_SLAB_ALLOC                 0x000008000
+		FAULT_DQUOT_INIT                 0x000010000
+		FAULT_LOCK_OP                    0x000020000
+		FAULT_BLKADDR_VALIDITY           0x000040000
+		FAULT_BLKADDR_CONSISTENCE        0x000080000
+		===========================      ===========
 
 What:		/sys/fs/f2fs/<disk>/discard_io_aware_gran
 Date:		January 2023
diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst
index d32c6209685d64..32cbfa864f389b 100644
--- a/Documentation/filesystems/f2fs.rst
+++ b/Documentation/filesystems/f2fs.rst
@@ -184,29 +184,30 @@ fault_type=%d		 Support configuring fault injection type, should be
 			 enabled with fault_injection option, fault type value
 			 is shown below, it supports single or combined type.
 
-			 ===================	  ===========
-			 Type_Name		  Type_Value
-			 ===================	  ===========
-			 FAULT_KMALLOC		  0x000000001
-			 FAULT_KVMALLOC		  0x000000002
-			 FAULT_PAGE_ALLOC	  0x000000004
-			 FAULT_PAGE_GET		  0x000000008
-			 FAULT_ALLOC_BIO	  0x000000010 (obsolete)
-			 FAULT_ALLOC_NID	  0x000000020
-			 FAULT_ORPHAN		  0x000000040
-			 FAULT_BLOCK		  0x000000080
-			 FAULT_DIR_DEPTH	  0x000000100
-			 FAULT_EVICT_INODE	  0x000000200
-			 FAULT_TRUNCATE		  0x000000400
-			 FAULT_READ_IO		  0x000000800
-			 FAULT_CHECKPOINT	  0x000001000
-			 FAULT_DISCARD		  0x000002000
-			 FAULT_WRITE_IO		  0x000004000
-			 FAULT_SLAB_ALLOC	  0x000008000
-			 FAULT_DQUOT_INIT	  0x000010000
-			 FAULT_LOCK_OP		  0x000020000
-			 FAULT_BLKADDR		  0x000040000
-			 ===================	  ===========
+			 ===========================      ===========
+			 Type_Name                        Type_Value
+			 ===========================      ===========
+			 FAULT_KMALLOC                    0x000000001
+			 FAULT_KVMALLOC                   0x000000002
+			 FAULT_PAGE_ALLOC                 0x000000004
+			 FAULT_PAGE_GET                   0x000000008
+			 FAULT_ALLOC_BIO                  0x000000010 (obsolete)
+			 FAULT_ALLOC_NID                  0x000000020
+			 FAULT_ORPHAN                     0x000000040
+			 FAULT_BLOCK                      0x000000080
+			 FAULT_DIR_DEPTH                  0x000000100
+			 FAULT_EVICT_INODE                0x000000200
+			 FAULT_TRUNCATE                   0x000000400
+			 FAULT_READ_IO                    0x000000800
+			 FAULT_CHECKPOINT                 0x000001000
+			 FAULT_DISCARD                    0x000002000
+			 FAULT_WRITE_IO                   0x000004000
+			 FAULT_SLAB_ALLOC                 0x000008000
+			 FAULT_DQUOT_INIT                 0x000010000
+			 FAULT_LOCK_OP                    0x000020000
+			 FAULT_BLKADDR_VALIDITY           0x000040000
+			 FAULT_BLKADDR_CONSISTENCE        0x000080000
+			 ===========================      ===========
 mode=%s			 Control block allocation mode which supports "adaptive"
 			 and "lfs". In "lfs" mode, there should be no random
 			 writes towards main area.
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index b0597a539fc548..b85820e70f5e44 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -170,12 +170,9 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr,
 	return exist;
 }
 
-bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
+static bool __f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
 					block_t blkaddr, int type)
 {
-	if (time_to_inject(sbi, FAULT_BLKADDR))
-		return false;
-
 	switch (type) {
 	case META_NAT:
 		break;
@@ -230,6 +227,20 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
 	return true;
 }
 
+bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
+					block_t blkaddr, int type)
+{
+	if (time_to_inject(sbi, FAULT_BLKADDR_VALIDITY))
+		return false;
+	return __f2fs_is_valid_blkaddr(sbi, blkaddr, type);
+}
+
+bool f2fs_is_valid_blkaddr_raw(struct f2fs_sb_info *sbi,
+					block_t blkaddr, int type)
+{
+	return __f2fs_is_valid_blkaddr(sbi, blkaddr, type);
+}
+
 /*
  * Readahead CP/NAT/SIT/SSA/POR pages
  */
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index ab710bb6d8b32e..4481f68d64181c 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -60,7 +60,8 @@ enum {
 	FAULT_SLAB_ALLOC,
 	FAULT_DQUOT_INIT,
 	FAULT_LOCK_OP,
-	FAULT_BLKADDR,
+	FAULT_BLKADDR_VALIDITY,
+	FAULT_BLKADDR_CONSISTENCE,
 	FAULT_MAX,
 };
 
@@ -3768,6 +3769,8 @@ struct page *f2fs_get_meta_page_retry(struct f2fs_sb_info *sbi, pgoff_t index);
 struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index);
 bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
 					block_t blkaddr, int type);
+bool f2fs_is_valid_blkaddr_raw(struct f2fs_sb_info *sbi,
+					block_t blkaddr, int type);
 int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
 			int type, bool sync);
 void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index,
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 1ff1c45e192711..25b119cf3499d7 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -590,9 +590,13 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count)
 		f2fs_set_data_blkaddr(dn, NULL_ADDR);
 
 		if (__is_valid_data_blkaddr(blkaddr)) {
-			if (!f2fs_is_valid_blkaddr(sbi, blkaddr,
-					DATA_GENERIC_ENHANCE))
+			if (time_to_inject(sbi, FAULT_BLKADDR_CONSISTENCE))
+				continue;
+			if (!f2fs_is_valid_blkaddr_raw(sbi, blkaddr,
+						DATA_GENERIC_ENHANCE)) {
+				f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
 				continue;
+			}
 			if (compressed_cluster)
 				valid_blocks++;
 		}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index d45ab0992ae594..e2c066fbc0fa11 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -44,24 +44,25 @@ static struct kmem_cache *f2fs_inode_cachep;
 #ifdef CONFIG_F2FS_FAULT_INJECTION
 
 const char *f2fs_fault_name[FAULT_MAX] = {
-	[FAULT_KMALLOC]		= "kmalloc",
-	[FAULT_KVMALLOC]	= "kvmalloc",
-	[FAULT_PAGE_ALLOC]	= "page alloc",
-	[FAULT_PAGE_GET]	= "page get",
-	[FAULT_ALLOC_NID]	= "alloc nid",
-	[FAULT_ORPHAN]		= "orphan",
-	[FAULT_BLOCK]		= "no more block",
-	[FAULT_DIR_DEPTH]	= "too big dir depth",
-	[FAULT_EVICT_INODE]	= "evict_inode fail",
-	[FAULT_TRUNCATE]	= "truncate fail",
-	[FAULT_READ_IO]		= "read IO error",
-	[FAULT_CHECKPOINT]	= "checkpoint error",
-	[FAULT_DISCARD]		= "discard error",
-	[FAULT_WRITE_IO]	= "write IO error",
-	[FAULT_SLAB_ALLOC]	= "slab alloc",
-	[FAULT_DQUOT_INIT]	= "dquot initialize",
-	[FAULT_LOCK_OP]		= "lock_op",
-	[FAULT_BLKADDR]		= "invalid blkaddr",
+	[FAULT_KMALLOC]			= "kmalloc",
+	[FAULT_KVMALLOC]		= "kvmalloc",
+	[FAULT_PAGE_ALLOC]		= "page alloc",
+	[FAULT_PAGE_GET]		= "page get",
+	[FAULT_ALLOC_NID]		= "alloc nid",
+	[FAULT_ORPHAN]			= "orphan",
+	[FAULT_BLOCK]			= "no more block",
+	[FAULT_DIR_DEPTH]		= "too big dir depth",
+	[FAULT_EVICT_INODE]		= "evict_inode fail",
+	[FAULT_TRUNCATE]		= "truncate fail",
+	[FAULT_READ_IO]			= "read IO error",
+	[FAULT_CHECKPOINT]		= "checkpoint error",
+	[FAULT_DISCARD]			= "discard error",
+	[FAULT_WRITE_IO]		= "write IO error",
+	[FAULT_SLAB_ALLOC]		= "slab alloc",
+	[FAULT_DQUOT_INIT]		= "dquot initialize",
+	[FAULT_LOCK_OP]			= "lock_op",
+	[FAULT_BLKADDR_VALIDITY]	= "invalid blkaddr",
+	[FAULT_BLKADDR_CONSISTENCE]	= "inconsistent blkaddr",
 };
 
 void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate,

From 38c9ce091a4bd0ff272438131424e98ea0e3906d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Draszik?= <andre.draszik@linaro.org>
Date: Fri, 26 Jan 2024 11:55:16 +0000
Subject: [PATCH 237/707] dt-bindings: samsung: exynos-sysreg: gs101-peric0
 requires a clock
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

... otherwise it won't be accessible.

Update the schema to make this obvious.

Signed-off-by: André Draszik <andre.draszik@linaro.org>
Link: https://lore.kernel.org/r/20240126115517.1751971-1-andre.draszik@linaro.org
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
---
 .../devicetree/bindings/soc/samsung/samsung,exynos-sysreg.yaml   | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/devicetree/bindings/soc/samsung/samsung,exynos-sysreg.yaml b/Documentation/devicetree/bindings/soc/samsung/samsung,exynos-sysreg.yaml
index 1794e3799f2110..33d837ae4f4577 100644
--- a/Documentation/devicetree/bindings/soc/samsung/samsung,exynos-sysreg.yaml
+++ b/Documentation/devicetree/bindings/soc/samsung/samsung,exynos-sysreg.yaml
@@ -72,6 +72,7 @@ allOf:
         compatible:
           contains:
             enum:
+              - google,gs101-peric0-sysreg
               - samsung,exynos850-cmgp-sysreg
               - samsung,exynos850-peri-sysreg
               - samsung,exynos850-sysreg

From 34b2321cc648a246d08cc51e423532eac690ccf1 Mon Sep 17 00:00:00 2001
From: Andreas Larsson <andreas@gaisler.com>
Date: Mon, 15 Jan 2024 16:02:00 +0100
Subject: [PATCH 238/707] MAINTAINERS: Add Andreas Larsson as co-maintainer for
 arch/sparc

Dave has not been very active on arch/sparc for the past two years.
I have been contributing to the SPARC32 port as well as maintaining
out-of-tree SPARC32 patches for LEON3/4/5 (SPARCv8 with CAS support)
since 2012. I am willing to step up as an arch/sparc (co-)maintainer.

For recent discussions on the matter, see [1] and [2].

[1] https://lore.kernel.org/r/20230713075235.2164609-1-u.kleine-koenig@pengutronix.de
[2] https://lore.kernel.org/r/20231209105816.GA1085691@ravnborg.org/

Signed-off-by: Andreas Larsson <andreas@gaisler.com>
Suggested-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Sam Ravnborg <sam@ravnborg.org>
Acked-by: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Jose E. Marchesi <jose.marchesi@oracle.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 8d1052fa6a6924..542ab762be7de4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -20549,6 +20549,7 @@ F:	Documentation/translations/sp_SP/
 
 SPARC + UltraSPARC (sparc/sparc64)
 M:	"David S. Miller" <davem@davemloft.net>
+M:	Andreas Larsson <andreas@gaisler.com>
 L:	sparclinux@vger.kernel.org
 S:	Maintained
 Q:	http://patchwork.ozlabs.org/project/sparclinux/list/

From 34b82a2fb7475aba5adfd3ae9f2c66da7f1979f7 Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Fri, 26 Jan 2024 07:28:53 -0800
Subject: [PATCH 239/707] lkdtm/bugs: In lkdtm_HUNG_TASK() use BUG(), not
 BUG_ON(1)

In commit edb6538da3df ("lkdtm/bugs: Adjust lkdtm_HUNG_TASK() to avoid
tail call optimization") we marked lkdtm_HUNG_TASK() as
__noreturn. The compiler gets unhappy if it thinks a __noreturn
function might return, so there's a BUG_ON(1) at the end. Any human
can see that the function won't return and the compiler can figure
that out too. Except when it can't.

The MIPS architecture defines HAVE_ARCH_BUG_ON and defines its own
version of BUG_ON(). The MIPS version of BUG_ON() is not a macro but
is instead an inline function. Apparently this prevents the compiler
from realizing that the condition to BUG_ON() is constant and that the
function will never return.

Let's change the BUG_ON(1) to just BUG(), which it should have been to
begin with. The only reason I used BUG_ON(1) to begin with was because
I was used to using WARN_ON(1) when writing test code and WARN() and
BUG() are oddly inconsistent in this manner. :-/

Fixes: edb6538da3df ("lkdtm/bugs: Adjust lkdtm_HUNG_TASK() to avoid tail call optimization")
Signed-off-by: Douglas Anderson <dianders@chromium.org>
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202401262204.wUFKRYZF-lkp@intel.com/
Acked-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/r/20240126072852.1.Ib065e528a8620474a72f15baa2feead1f3d89865@changeid
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 drivers/misc/lkdtm/bugs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/misc/lkdtm/bugs.c b/drivers/misc/lkdtm/bugs.c
index d1222d3eda2f19..b92767d6bdd244 100644
--- a/drivers/misc/lkdtm/bugs.c
+++ b/drivers/misc/lkdtm/bugs.c
@@ -298,7 +298,7 @@ static void __noreturn lkdtm_HUNG_TASK(void)
 {
 	set_current_state(TASK_UNINTERRUPTIBLE);
 	schedule();
-	BUG_ON(1);
+	BUG();
 }
 
 static volatile unsigned int huge = INT_MAX - 2;

From 3ef826924303470507ccd6e84d34c901c1ee34da Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 16 Jan 2024 14:14:59 -0700
Subject: [PATCH 240/707] block: move cgroup time handling code into blk.h

In preparation for moving time keeping into blk.h, move the cgroup
related code for timestamps in here too. This will help avoid a circular
dependency, and also moves it into a more appropriate header as this one
is private to the block layer code.

Leave struct bio_issue in blk_types.h as it's a proper time definition.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.h        |  1 +
 block/blk.h               | 42 +++++++++++++++++++++++++++++++++++++++
 include/linux/blk_types.h | 42 ---------------------------------------
 3 files changed, 43 insertions(+), 42 deletions(-)

diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index b927a4a0ad0301..78b74106bf10c5 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -19,6 +19,7 @@
 #include <linux/kthread.h>
 #include <linux/blk-mq.h>
 #include <linux/llist.h>
+#include "blk.h"
 
 struct blkcg_gq;
 struct blkg_policy_data;
diff --git a/block/blk.h b/block/blk.h
index 1ef920f72e0f87..620e3a035da116 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -516,4 +516,46 @@ static inline int req_ref_read(struct request *req)
 	return atomic_read(&req->ref);
 }
 
+/*
+ * From most significant bit:
+ * 1 bit: reserved for other usage, see below
+ * 12 bits: original size of bio
+ * 51 bits: issue time of bio
+ */
+#define BIO_ISSUE_RES_BITS      1
+#define BIO_ISSUE_SIZE_BITS     12
+#define BIO_ISSUE_RES_SHIFT     (64 - BIO_ISSUE_RES_BITS)
+#define BIO_ISSUE_SIZE_SHIFT    (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS)
+#define BIO_ISSUE_TIME_MASK     ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1)
+#define BIO_ISSUE_SIZE_MASK     \
+	(((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT)
+#define BIO_ISSUE_RES_MASK      (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1))
+
+/* Reserved bit for blk-throtl */
+#define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63)
+
+static inline u64 __bio_issue_time(u64 time)
+{
+	return time & BIO_ISSUE_TIME_MASK;
+}
+
+static inline u64 bio_issue_time(struct bio_issue *issue)
+{
+	return __bio_issue_time(issue->value);
+}
+
+static inline sector_t bio_issue_size(struct bio_issue *issue)
+{
+	return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT);
+}
+
+static inline void bio_issue_init(struct bio_issue *issue,
+				       sector_t size)
+{
+	size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1;
+	issue->value = ((issue->value & BIO_ISSUE_RES_MASK) |
+			(ktime_get_ns() & BIO_ISSUE_TIME_MASK) |
+			((u64)size << BIO_ISSUE_SIZE_SHIFT));
+}
+
 #endif /* BLK_INTERNAL_H */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index f288c94374b307..1c07848dea7ec0 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -206,52 +206,10 @@ static inline bool blk_path_error(blk_status_t error)
 	return true;
 }
 
-/*
- * From most significant bit:
- * 1 bit: reserved for other usage, see below
- * 12 bits: original size of bio
- * 51 bits: issue time of bio
- */
-#define BIO_ISSUE_RES_BITS      1
-#define BIO_ISSUE_SIZE_BITS     12
-#define BIO_ISSUE_RES_SHIFT     (64 - BIO_ISSUE_RES_BITS)
-#define BIO_ISSUE_SIZE_SHIFT    (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS)
-#define BIO_ISSUE_TIME_MASK     ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1)
-#define BIO_ISSUE_SIZE_MASK     \
-	(((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT)
-#define BIO_ISSUE_RES_MASK      (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1))
-
-/* Reserved bit for blk-throtl */
-#define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63)
-
 struct bio_issue {
 	u64 value;
 };
 
-static inline u64 __bio_issue_time(u64 time)
-{
-	return time & BIO_ISSUE_TIME_MASK;
-}
-
-static inline u64 bio_issue_time(struct bio_issue *issue)
-{
-	return __bio_issue_time(issue->value);
-}
-
-static inline sector_t bio_issue_size(struct bio_issue *issue)
-{
-	return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT);
-}
-
-static inline void bio_issue_init(struct bio_issue *issue,
-				       sector_t size)
-{
-	size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1;
-	issue->value = ((issue->value & BIO_ISSUE_RES_MASK) |
-			(ktime_get_ns() & BIO_ISSUE_TIME_MASK) |
-			((u64)size << BIO_ISSUE_SIZE_SHIFT));
-}
-
 typedef __u32 __bitwise blk_opf_t;
 
 typedef unsigned int blk_qc_t;

From 29bb740b66cc15175a119cd6fc59076580390116 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 15 Jan 2024 14:45:07 -0700
Subject: [PATCH 241/707] block: add blk_time_get_ns() and blk_time_get()
 helpers

Convert any user of ktime_get_ns() to use blk_time_get_ns(), and
ktime_get() to blk_time_get(), so we have a unified API for querying the
current time in nanoseconds or as ktime.

No functional changes intended, this patch just wraps ktime_get_ns()
and ktime_get() with a block helper.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bfq-cgroup.c    | 14 +++++++-------
 block/bfq-iosched.c   | 28 ++++++++++++++--------------
 block/blk-cgroup.c    |  2 +-
 block/blk-flush.c     |  2 +-
 block/blk-iocost.c    |  8 ++++----
 block/blk-iolatency.c |  6 +++---
 block/blk-mq.c        | 16 ++++++++--------
 block/blk-throttle.c  |  6 +++---
 block/blk-wbt.c       |  6 +++---
 block/blk.h           | 13 ++++++++++++-
 10 files changed, 56 insertions(+), 45 deletions(-)

diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 2c90e5de0acd94..d442ee358fc257 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -127,7 +127,7 @@ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
 	if (!bfqg_stats_waiting(stats))
 		return;
 
-	now = ktime_get_ns();
+	now = blk_time_get_ns();
 	if (now > stats->start_group_wait_time)
 		bfq_stat_add(&stats->group_wait_time,
 			      now - stats->start_group_wait_time);
@@ -144,7 +144,7 @@ static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
 		return;
 	if (bfqg == curr_bfqg)
 		return;
-	stats->start_group_wait_time = ktime_get_ns();
+	stats->start_group_wait_time = blk_time_get_ns();
 	bfqg_stats_mark_waiting(stats);
 }
 
@@ -156,7 +156,7 @@ static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
 	if (!bfqg_stats_empty(stats))
 		return;
 
-	now = ktime_get_ns();
+	now = blk_time_get_ns();
 	if (now > stats->start_empty_time)
 		bfq_stat_add(&stats->empty_time,
 			      now - stats->start_empty_time);
@@ -183,7 +183,7 @@ void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
 	if (bfqg_stats_empty(stats))
 		return;
 
-	stats->start_empty_time = ktime_get_ns();
+	stats->start_empty_time = blk_time_get_ns();
 	bfqg_stats_mark_empty(stats);
 }
 
@@ -192,7 +192,7 @@ void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
 	struct bfqg_stats *stats = &bfqg->stats;
 
 	if (bfqg_stats_idling(stats)) {
-		u64 now = ktime_get_ns();
+		u64 now = blk_time_get_ns();
 
 		if (now > stats->start_idle_time)
 			bfq_stat_add(&stats->idle_time,
@@ -205,7 +205,7 @@ void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
 {
 	struct bfqg_stats *stats = &bfqg->stats;
 
-	stats->start_idle_time = ktime_get_ns();
+	stats->start_idle_time = blk_time_get_ns();
 	bfqg_stats_mark_idling(stats);
 }
 
@@ -242,7 +242,7 @@ void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
 				  u64 io_start_time_ns, blk_opf_t opf)
 {
 	struct bfqg_stats *stats = &bfqg->stats;
-	u64 now = ktime_get_ns();
+	u64 now = blk_time_get_ns();
 
 	if (now > io_start_time_ns)
 		blkg_rwstat_add(&stats->service_time, opf,
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 3cce6de464a7b7..4b88a54a9b76cb 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -1005,7 +1005,7 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq,
 
 	rq = rq_entry_fifo(bfqq->fifo.next);
 
-	if (rq == last || ktime_get_ns() < rq->fifo_time)
+	if (rq == last || blk_time_get_ns() < rq->fifo_time)
 		return NULL;
 
 	bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq);
@@ -1829,7 +1829,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
 		 * bfq_bfqq_update_budg_for_activation for
 		 * details on the usage of the next variable.
 		 */
-		arrived_in_time =  ktime_get_ns() <=
+		arrived_in_time =  blk_time_get_ns() <=
 			bfqq->ttime.last_end_request +
 			bfqd->bfq_slice_idle * 3;
 	unsigned int act_idx = bfq_actuator_index(bfqd, rq->bio);
@@ -2208,7 +2208,7 @@ static void bfq_add_request(struct request *rq)
 	struct request *next_rq, *prev;
 	unsigned int old_wr_coeff = bfqq->wr_coeff;
 	bool interactive = false;
-	u64 now_ns = ktime_get_ns();
+	u64 now_ns = blk_time_get_ns();
 
 	bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
 	bfqq->queued[rq_is_sync(rq)]++;
@@ -2262,7 +2262,7 @@ static void bfq_add_request(struct request *rq)
 		      bfqd->rqs_injected && bfqd->tot_rq_in_driver > 0)) &&
 		    time_is_before_eq_jiffies(bfqq->decrease_time_jif +
 					      msecs_to_jiffies(10))) {
-			bfqd->last_empty_occupied_ns = ktime_get_ns();
+			bfqd->last_empty_occupied_ns = blk_time_get_ns();
 			/*
 			 * Start the state machine for measuring the
 			 * total service time of rq: setting
@@ -3294,7 +3294,7 @@ static void bfq_set_budget_timeout(struct bfq_data *bfqd,
 	else
 		timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
 
-	bfqd->last_budget_start = ktime_get();
+	bfqd->last_budget_start = blk_time_get();
 
 	bfqq->budget_timeout = jiffies +
 		bfqd->bfq_timeout * timeout_coeff;
@@ -3394,7 +3394,7 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd)
 	else if (bfqq->wr_coeff > 1)
 		sl = max_t(u32, sl, 20ULL * NSEC_PER_MSEC);
 
-	bfqd->last_idling_start = ktime_get();
+	bfqd->last_idling_start = blk_time_get();
 	bfqd->last_idling_start_jiffies = jiffies;
 
 	hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
@@ -3433,7 +3433,7 @@ static void bfq_reset_rate_computation(struct bfq_data *bfqd,
 				       struct request *rq)
 {
 	if (rq != NULL) { /* new rq dispatch now, reset accordingly */
-		bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns();
+		bfqd->last_dispatch = bfqd->first_dispatch = blk_time_get_ns();
 		bfqd->peak_rate_samples = 1;
 		bfqd->sequential_samples = 0;
 		bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size =
@@ -3590,7 +3590,7 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
  */
 static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
 {
-	u64 now_ns = ktime_get_ns();
+	u64 now_ns = blk_time_get_ns();
 
 	if (bfqd->peak_rate_samples == 0) { /* first dispatch */
 		bfq_log(bfqd, "update_peak_rate: goto reset, samples %d",
@@ -4162,7 +4162,7 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	if (compensate)
 		delta_ktime = bfqd->last_idling_start;
 	else
-		delta_ktime = ktime_get();
+		delta_ktime = blk_time_get();
 	delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start);
 	delta_usecs = ktime_to_us(delta_ktime);
 
@@ -5591,7 +5591,7 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 			  struct bfq_io_cq *bic, pid_t pid, int is_sync,
 			  unsigned int act_idx)
 {
-	u64 now_ns = ktime_get_ns();
+	u64 now_ns = blk_time_get_ns();
 
 	bfqq->actuator_idx = act_idx;
 	RB_CLEAR_NODE(&bfqq->entity.rb_node);
@@ -5903,7 +5903,7 @@ static void bfq_update_io_thinktime(struct bfq_data *bfqd,
 	 */
 	if (bfqq->dispatched || bfq_bfqq_busy(bfqq))
 		return;
-	elapsed = ktime_get_ns() - bfqq->ttime.last_end_request;
+	elapsed = blk_time_get_ns() - bfqq->ttime.last_end_request;
 	elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle);
 
 	ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8;
@@ -6194,7 +6194,7 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
 	bfq_add_request(rq);
 	idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq);
 
-	rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
+	rq->fifo_time = blk_time_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
 	list_add_tail(&rq->queuelist, &bfqq->fifo);
 
 	bfq_rq_enqueued(bfqd, bfqq, rq);
@@ -6370,7 +6370,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
 		bfq_weights_tree_remove(bfqq);
 	}
 
-	now_ns = ktime_get_ns();
+	now_ns = blk_time_get_ns();
 
 	bfqq->ttime.last_end_request = now_ns;
 
@@ -6585,7 +6585,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
 static void bfq_update_inject_limit(struct bfq_data *bfqd,
 				    struct bfq_queue *bfqq)
 {
-	u64 tot_time_ns = ktime_get_ns() - bfqd->last_empty_occupied_ns;
+	u64 tot_time_ns = blk_time_get_ns() - bfqd->last_empty_occupied_ns;
 	unsigned int old_limit = bfqq->inject_limit;
 
 	if (bfqq->last_serv_time_ns > 0 && bfqd->rqs_injected) {
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index ff93c385ba5afb..bdbb557feb5a0e 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1846,7 +1846,7 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
 {
 	unsigned long pflags;
 	bool clamp;
-	u64 now = ktime_to_ns(ktime_get());
+	u64 now = blk_time_get_ns();
 	u64 exp;
 	u64 delay_nsec = 0;
 	int tok;
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 3f4d41952ef210..b0f314f4bc1493 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -143,7 +143,7 @@ static void blk_account_io_flush(struct request *rq)
 	part_stat_lock();
 	part_stat_inc(part, ios[STAT_FLUSH]);
 	part_stat_add(part, nsecs[STAT_FLUSH],
-		      ktime_get_ns() - rq->start_time_ns);
+		      blk_time_get_ns() - rq->start_time_ns);
 	part_stat_unlock();
 }
 
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index c8beec6d7df086..4b0b483a969353 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -829,7 +829,7 @@ static int ioc_autop_idx(struct ioc *ioc, struct gendisk *disk)
 
 	/* step up/down based on the vrate */
 	vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC);
-	now_ns = ktime_get_ns();
+	now_ns = blk_time_get_ns();
 
 	if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
 		if (!ioc->autop_too_fast_at)
@@ -1044,7 +1044,7 @@ static void ioc_now(struct ioc *ioc, struct ioc_now *now)
 	unsigned seq;
 	u64 vrate;
 
-	now->now_ns = ktime_get();
+	now->now_ns = blk_time_get_ns();
 	now->now = ktime_to_us(now->now_ns);
 	vrate = atomic64_read(&ioc->vtime_rate);
 
@@ -2810,7 +2810,7 @@ static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
 		return;
 	}
 
-	on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
+	on_q_ns = blk_time_get_ns() - rq->alloc_time_ns;
 	rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
 	size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
 
@@ -2893,7 +2893,7 @@ static int blk_iocost_init(struct gendisk *disk)
 	ioc->vtime_base_rate = VTIME_PER_USEC;
 	atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
 	seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock);
-	ioc->period_at = ktime_to_us(ktime_get());
+	ioc->period_at = ktime_to_us(blk_time_get());
 	atomic64_set(&ioc->cur_period, 0);
 	atomic_set(&ioc->hweight_gen, 0);
 
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index c1a6aba1d59e4d..ebb522788d9780 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -609,7 +609,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
 	if (!iolat->blkiolat->enabled)
 		return;
 
-	now = ktime_to_ns(ktime_get());
+	now = blk_time_get_ns();
 	while (blkg && blkg->parent) {
 		iolat = blkg_to_lat(blkg);
 		if (!iolat) {
@@ -661,7 +661,7 @@ static void blkiolatency_timer_fn(struct timer_list *t)
 	struct blk_iolatency *blkiolat = from_timer(blkiolat, t, timer);
 	struct blkcg_gq *blkg;
 	struct cgroup_subsys_state *pos_css;
-	u64 now = ktime_to_ns(ktime_get());
+	u64 now = blk_time_get_ns();
 
 	rcu_read_lock();
 	blkg_for_each_descendant_pre(blkg, pos_css,
@@ -985,7 +985,7 @@ static void iolatency_pd_init(struct blkg_policy_data *pd)
 	struct blkcg_gq *blkg = lat_to_blkg(iolat);
 	struct rq_qos *rqos = iolat_rq_qos(blkg->q);
 	struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
-	u64 now = ktime_to_ns(ktime_get());
+	u64 now = blk_time_get_ns();
 	int cpu;
 
 	if (blk_queue_nonrot(blkg->q))
diff --git a/block/blk-mq.c b/block/blk-mq.c
index aa87fcfda1ecfc..aff9e9492f59e4 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -323,7 +323,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 	RB_CLEAR_NODE(&rq->rb_node);
 	rq->tag = BLK_MQ_NO_TAG;
 	rq->internal_tag = BLK_MQ_NO_TAG;
-	rq->start_time_ns = ktime_get_ns();
+	rq->start_time_ns = blk_time_get_ns();
 	rq->part = NULL;
 	blk_crypto_rq_set_defaults(rq);
 }
@@ -333,7 +333,7 @@ EXPORT_SYMBOL(blk_rq_init);
 static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns)
 {
 	if (blk_mq_need_time_stamp(rq))
-		rq->start_time_ns = ktime_get_ns();
+		rq->start_time_ns = blk_time_get_ns();
 	else
 		rq->start_time_ns = 0;
 
@@ -444,7 +444,7 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
 
 	/* alloc_time includes depth and tag waits */
 	if (blk_queue_rq_alloc_time(q))
-		alloc_time_ns = ktime_get_ns();
+		alloc_time_ns = blk_time_get_ns();
 
 	if (data->cmd_flags & REQ_NOWAIT)
 		data->flags |= BLK_MQ_REQ_NOWAIT;
@@ -629,7 +629,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 
 	/* alloc_time includes depth and tag waits */
 	if (blk_queue_rq_alloc_time(q))
-		alloc_time_ns = ktime_get_ns();
+		alloc_time_ns = blk_time_get_ns();
 
 	/*
 	 * If the tag allocator sleeps we could get an allocation for a
@@ -1042,7 +1042,7 @@ static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
 inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
 {
 	if (blk_mq_need_time_stamp(rq))
-		__blk_mq_end_request_acct(rq, ktime_get_ns());
+		__blk_mq_end_request_acct(rq, blk_time_get_ns());
 
 	blk_mq_finish_request(rq);
 
@@ -1085,7 +1085,7 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob)
 	u64 now = 0;
 
 	if (iob->need_ts)
-		now = ktime_get_ns();
+		now = blk_time_get_ns();
 
 	while ((rq = rq_list_pop(&iob->req_list)) != NULL) {
 		prefetch(rq->bio);
@@ -1255,7 +1255,7 @@ void blk_mq_start_request(struct request *rq)
 
 	if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags) &&
 	    !blk_rq_is_passthrough(rq)) {
-		rq->io_start_time_ns = ktime_get_ns();
+		rq->io_start_time_ns = blk_time_get_ns();
 		rq->stats_sectors = blk_rq_sectors(rq);
 		rq->rq_flags |= RQF_STATS;
 		rq_qos_issue(q, rq);
@@ -3107,7 +3107,7 @@ blk_status_t blk_insert_cloned_request(struct request *rq)
 	blk_mq_run_dispatch_ops(q,
 			ret = blk_mq_request_issue_directly(rq, true));
 	if (ret)
-		blk_account_io_done(rq, ktime_get_ns());
+		blk_account_io_done(rq, blk_time_get_ns());
 	return ret;
 }
 EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 16f5766620a410..da9dc1f793c3b7 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1815,7 +1815,7 @@ static bool throtl_tg_is_idle(struct throtl_grp *tg)
 	time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold);
 	ret = tg->latency_target == DFL_LATENCY_TARGET ||
 	      tg->idletime_threshold == DFL_IDLE_THRESHOLD ||
-	      (ktime_get_ns() >> 10) - tg->last_finish_time > time ||
+	      (blk_time_get_ns() >> 10) - tg->last_finish_time > time ||
 	      tg->avg_idletime > tg->idletime_threshold ||
 	      (tg->latency_target && tg->bio_cnt &&
 		tg->bad_bio_cnt * 5 < tg->bio_cnt);
@@ -2060,7 +2060,7 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg)
 	if (last_finish_time == 0)
 		return;
 
-	now = ktime_get_ns() >> 10;
+	now = blk_time_get_ns() >> 10;
 	if (now <= last_finish_time ||
 	    last_finish_time == tg->checked_last_finish_time)
 		return;
@@ -2327,7 +2327,7 @@ void blk_throtl_bio_endio(struct bio *bio)
 	if (!tg->td->limit_valid[LIMIT_LOW])
 		return;
 
-	finish_time_ns = ktime_get_ns();
+	finish_time_ns = blk_time_get_ns();
 	tg->last_finish_time = finish_time_ns >> 10;
 
 	start_time = bio_issue_time(&bio->bi_issue) >> 10;
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 5ba3cd574eacbd..8cb53bf4c7b152 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -29,6 +29,7 @@
 #include "blk-wbt.h"
 #include "blk-rq-qos.h"
 #include "elevator.h"
+#include "blk.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/wbt.h>
@@ -274,13 +275,12 @@ static inline bool stat_sample_valid(struct blk_rq_stat *stat)
 
 static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
 {
-	u64 now, issue = READ_ONCE(rwb->sync_issue);
+	u64 issue = READ_ONCE(rwb->sync_issue);
 
 	if (!issue || !rwb->sync_cookie)
 		return 0;
 
-	now = ktime_to_ns(ktime_get());
-	return now - issue;
+	return blk_time_get_ns() - issue;
 }
 
 static inline unsigned int wbt_inflight(struct rq_wb *rwb)
diff --git a/block/blk.h b/block/blk.h
index 620e3a035da116..79ae533cdf026f 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -4,6 +4,7 @@
 
 #include <linux/blk-crypto.h>
 #include <linux/memblock.h>	/* for max_pfn/max_low_pfn */
+#include <linux/timekeeping.h>
 #include <xen/xen.h>
 #include "blk-crypto-internal.h"
 
@@ -516,6 +517,16 @@ static inline int req_ref_read(struct request *req)
 	return atomic_read(&req->ref);
 }
 
+static inline u64 blk_time_get_ns(void)
+{
+	return ktime_get_ns();
+}
+
+static inline ktime_t blk_time_get(void)
+{
+	return ns_to_ktime(blk_time_get_ns());
+}
+
 /*
  * From most significant bit:
  * 1 bit: reserved for other usage, see below
@@ -554,7 +565,7 @@ static inline void bio_issue_init(struct bio_issue *issue,
 {
 	size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1;
 	issue->value = ((issue->value & BIO_ISSUE_RES_MASK) |
-			(ktime_get_ns() & BIO_ISSUE_TIME_MASK) |
+			(blk_time_get_ns() & BIO_ISSUE_TIME_MASK) |
 			((u64)size << BIO_ISSUE_SIZE_SHIFT));
 }
 

From 1ff288ab5cf42de6b12cbcdb1d0db1ea73723656 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 15 Jan 2024 14:46:03 -0700
Subject: [PATCH 242/707] block: cache current nsec time in struct blk_plug

Querying the current time is the most costly thing we do in the block
layer per IO, and depending on kernel config settings, we may do it
many times per IO.

None of the callers actually need nsec granularity. Take advantage of
that by caching the current time in the plug, with the assumption here
being that any time checking will be temporally close enough that the
slight loss of precision doesn't matter.

If the block plug gets flushed, eg on preempt or schedule out, then
we invalidate the cached clock.

On a basic peak IOPS test case with iostats enabled, this changes
the performance from:

IOPS=108.41M, BW=52.93GiB/s, IOS/call=31/31
IOPS=108.43M, BW=52.94GiB/s, IOS/call=32/32
IOPS=108.29M, BW=52.88GiB/s, IOS/call=31/32
IOPS=108.35M, BW=52.91GiB/s, IOS/call=32/32
IOPS=108.42M, BW=52.94GiB/s, IOS/call=31/31
IOPS=108.40M, BW=52.93GiB/s, IOS/call=32/32
IOPS=108.31M, BW=52.89GiB/s, IOS/call=32/31

to

IOPS=118.79M, BW=58.00GiB/s, IOS/call=31/32
IOPS=118.62M, BW=57.92GiB/s, IOS/call=31/31
IOPS=118.80M, BW=58.01GiB/s, IOS/call=32/31
IOPS=118.78M, BW=58.00GiB/s, IOS/call=32/32
IOPS=118.69M, BW=57.95GiB/s, IOS/call=32/31
IOPS=118.62M, BW=57.92GiB/s, IOS/call=32/31
IOPS=118.63M, BW=57.92GiB/s, IOS/call=31/32

which is more than a 9% improvement in performance. Looking at perf diff,
we can see a huge reduction in time overhead:

    10.55%     -9.88%  [kernel.vmlinux]  [k] read_tsc
     1.31%     -1.22%  [kernel.vmlinux]  [k] ktime_get

Note that since this relies on blk_plug for the caching, it's only
applicable to the issue side. But this is where most of the time calls
happen anyway. On the completion side, cached time stamping is done with
struct io_comp patch, as long as the driver supports it.

It's also worth noting that the above testing doesn't enable any of the
higher cost CPU items on the block layer side, like wbt, cgroups,
iocost, etc, which all would add additional time querying and hence
overhead. IOW, results would likely look even better in comparison with
those enabled, as distros would do.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c       |  1 +
 block/blk.h            | 14 +++++++++++++-
 include/linux/blkdev.h |  1 +
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 11342af420d0c4..cc4db4d92c75b4 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1073,6 +1073,7 @@ void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios)
 	if (tsk->plug)
 		return;
 
+	plug->cur_ktime = 0;
 	plug->mq_list = NULL;
 	plug->cached_rq = NULL;
 	plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT);
diff --git a/block/blk.h b/block/blk.h
index 79ae533cdf026f..14bbc4b780f275 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -519,7 +519,19 @@ static inline int req_ref_read(struct request *req)
 
 static inline u64 blk_time_get_ns(void)
 {
-	return ktime_get_ns();
+	struct blk_plug *plug = current->plug;
+
+	if (!plug)
+		return ktime_get_ns();
+
+	/*
+	 * 0 could very well be a valid time, but rather than flag "this is
+	 * a valid timestamp" separately, just accept that we'll do an extra
+	 * ktime_get_ns() if we just happen to get 0 as the current time.
+	 */
+	if (!plug->cur_ktime)
+		plug->cur_ktime = ktime_get_ns();
+	return plug->cur_ktime;
 }
 
 static inline ktime_t blk_time_get(void)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 99e4f5e722132c..996d2ad756ff78 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -942,6 +942,7 @@ struct blk_plug {
 
 	/* if ios_left is > 1, we can batch tag/rq allocations */
 	struct request *cached_rq;
+	u64 cur_ktime;
 	unsigned short nr_ios;
 
 	unsigned short rq_count;

From 1a3d70bf0c0e9f711b1d653d05df201eee68ab5b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 16 Jan 2024 09:18:39 -0700
Subject: [PATCH 243/707] block: update cached timestamp post
 schedule/preemption

Mark the task as having a cached timestamp when set assign it, so we
can efficiently check if it needs updating post being scheduled back in.
This covers both the actual schedule out case, which would've flushed
the plug, and the preemption case which doesn't touch the plugged
requests (for many reasons, one of them being then we'd need to have
preemption disabled around plug state manipulation).

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c       |  2 ++
 block/blk.h            |  4 +++-
 include/linux/blkdev.h | 16 ++++++++++++++++
 include/linux/sched.h  |  2 +-
 kernel/sched/core.c    |  6 ++++--
 5 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index cc4db4d92c75b4..71c6614a97fefb 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1173,6 +1173,8 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule)
 	 */
 	if (unlikely(!rq_list_empty(plug->cached_rq)))
 		blk_mq_free_plug_rqs(plug);
+
+	current->flags &= ~PF_BLOCK_TS;
 }
 
 /**
diff --git a/block/blk.h b/block/blk.h
index 14bbc4b780f275..913c93838a01bf 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -529,8 +529,10 @@ static inline u64 blk_time_get_ns(void)
 	 * a valid timestamp" separately, just accept that we'll do an extra
 	 * ktime_get_ns() if we just happen to get 0 as the current time.
 	 */
-	if (!plug->cur_ktime)
+	if (!plug->cur_ktime) {
 		plug->cur_ktime = ktime_get_ns();
+		current->flags |= PF_BLOCK_TS;
+	}
 	return plug->cur_ktime;
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 996d2ad756ff78..d7cac3de65b31b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -973,6 +973,18 @@ static inline void blk_flush_plug(struct blk_plug *plug, bool async)
 		__blk_flush_plug(plug, async);
 }
 
+/*
+ * tsk == current here
+ */
+static inline void blk_plug_invalidate_ts(struct task_struct *tsk)
+{
+	struct blk_plug *plug = tsk->plug;
+
+	if (plug)
+		plug->cur_ktime = 0;
+	current->flags &= ~PF_BLOCK_TS;
+}
+
 int blkdev_issue_flush(struct block_device *bdev);
 long nr_blockdev_pages(void);
 #else /* CONFIG_BLOCK */
@@ -996,6 +1008,10 @@ static inline void blk_flush_plug(struct blk_plug *plug, bool async)
 {
 }
 
+static inline void blk_plug_invalidate_ts(struct task_struct *tsk)
+{
+}
+
 static inline int blkdev_issue_flush(struct block_device *bdev)
 {
 	return 0;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index cdb8ea53c365ba..801233cef2fc9d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1642,7 +1642,7 @@ extern struct pid *cad_pid;
 #define PF_NO_SETAFFINITY	0x04000000	/* Userland is not allowed to meddle with cpus_mask */
 #define PF_MCE_EARLY		0x08000000      /* Early kill for mce process policy */
 #define PF_MEMALLOC_PIN		0x10000000	/* Allocation context constrained to zones which allow long term pinning. */
-#define PF__HOLE__20000000	0x20000000
+#define PF_BLOCK_TS		0x20000000	/* plug has ts that needs updating */
 #define PF__HOLE__40000000	0x40000000
 #define PF_SUSPEND_TASK		0x80000000      /* This thread called freeze_processes() and should not be frozen */
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9116bcc903467f..083f2258182d89 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6787,10 +6787,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
 
 static void sched_update_worker(struct task_struct *tsk)
 {
-	if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
+	if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_BLOCK_TS)) {
+		if (tsk->flags & PF_BLOCK_TS)
+			blk_plug_invalidate_ts(tsk);
 		if (tsk->flags & PF_WQ_WORKER)
 			wq_worker_running(tsk);
-		else
+		else if (tsk->flags & PF_IO_WORKER)
 			io_wq_worker_running(tsk);
 	}
 }

From b588e2d813c85a2ecbaf3d64269dfde4bb735917 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 22 Jan 2024 16:34:10 +0100
Subject: [PATCH 244/707] nvmem: include bit index in cell sysfs file name
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Creating sysfs files for all Cells caused a boot failure for linux-6.8-rc1 on
Apple M1, which (in downstream dts files) has multiple nvmem cells that use the
same byte address. This causes the device probe to fail with

[    0.605336] sysfs: cannot create duplicate filename '/devices/platform/soc@200000000/2922bc000.efuse/apple_efuses_nvmem0/cells/efuse@a10'
[    0.605347] CPU: 7 PID: 1 Comm: swapper/0 Tainted: G S                 6.8.0-rc1-arnd-5+ #133
[    0.605355] Hardware name: Apple Mac Studio (M1 Ultra, 2022) (DT)
[    0.605362] Call trace:
[    0.605365]  show_stack+0x18/0x2c
[    0.605374]  dump_stack_lvl+0x60/0x80
[    0.605383]  dump_stack+0x18/0x24
[    0.605388]  sysfs_warn_dup+0x64/0x80
[    0.605395]  sysfs_add_bin_file_mode_ns+0xb0/0xd4
[    0.605402]  internal_create_group+0x268/0x404
[    0.605409]  sysfs_create_groups+0x38/0x94
[    0.605415]  devm_device_add_groups+0x50/0x94
[    0.605572]  nvmem_populate_sysfs_cells+0x180/0x1b0
[    0.605682]  nvmem_register+0x38c/0x470
[    0.605789]  devm_nvmem_register+0x1c/0x6c
[    0.605895]  apple_efuses_probe+0xe4/0x120
[    0.606000]  platform_probe+0xa8/0xd0

As far as I can tell, this is a problem for any device with multiple cells on
different bits of the same address. Avoid the issue by changing the file name
to include the first bit number.

Fixes: 0331c611949f ("nvmem: core: Expose cells through sysfs")
Link: https://github.com/AsahiLinux/linux/blob/bd0a1a7d4/arch/arm64/boot/dts/apple/t600x-dieX.dtsi#L156
Cc: regressions@lists.linux.dev
Cc: Miquel Raynal <miquel.raynal@bootlin.com>
Cc: Rafał Miłecki <rafal@milecki.pl>
Cc: Chen-Yu Tsai <wenst@chromium.org>
Cc: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: asahi@lists.linux.dev
Cc: Sven Peter <sven@svenpeter.dev>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
---
 Documentation/ABI/testing/sysfs-nvmem-cells | 16 ++++++++--------
 drivers/nvmem/core.c                        |  5 +++--
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-nvmem-cells b/Documentation/ABI/testing/sysfs-nvmem-cells
index 7af70adf3690e3..c7c9444f92a880 100644
--- a/Documentation/ABI/testing/sysfs-nvmem-cells
+++ b/Documentation/ABI/testing/sysfs-nvmem-cells
@@ -4,18 +4,18 @@ KernelVersion:	6.5
 Contact:	Miquel Raynal <miquel.raynal@bootlin.com>
 Description:
 		The "cells" folder contains one file per cell exposed by the
-		NVMEM device. The name of the file is: <name>@<where>, with
-		<name> being the cell name and <where> its location in the NVMEM
-		device, in hexadecimal (without the '0x' prefix, to mimic device
-		tree node names). The length of the file is the size of the cell
-		(when known). The content of the file is the binary content of
-		the cell (may sometimes be ASCII, likely without trailing
-		character).
+		NVMEM device. The name of the file is: "<name>@<byte>,<bit>",
+		with <name> being the cell name and <where> its location in
+		the NVMEM device, in hexadecimal bytes and bits (without the
+		'0x' prefix, to mimic device tree node names). The length of
+		the file is the size of the cell (when known). The content of
+		the file is the binary content of the cell (may sometimes be
+		ASCII, likely without trailing character).
 		Note: This file is only present if CONFIG_NVMEM_SYSFS
 		is enabled.
 
 		Example::
 
-		  hexdump -C /sys/bus/nvmem/devices/1-00563/cells/product-name@d
+		  hexdump -C /sys/bus/nvmem/devices/1-00563/cells/product-name@d,0
 		  00000000  54 4e 34 38 4d 2d 50 2d  44 4e         |TN48M-P-DN|
 		  0000000a
diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c
index 980123fb4dde05..eb357ac2e54a2a 100644
--- a/drivers/nvmem/core.c
+++ b/drivers/nvmem/core.c
@@ -460,8 +460,9 @@ static int nvmem_populate_sysfs_cells(struct nvmem_device *nvmem)
 	list_for_each_entry(entry, &nvmem->cells, node) {
 		sysfs_bin_attr_init(&attrs[i]);
 		attrs[i].attr.name = devm_kasprintf(&nvmem->dev, GFP_KERNEL,
-						    "%s@%x", entry->name,
-						    entry->offset);
+						    "%s@%x,%x", entry->name,
+						    entry->offset,
+						    entry->bit_offset);
 		attrs[i].attr.mode = 0444;
 		attrs[i].size = entry->bytes;
 		attrs[i].read = &nvmem_cell_attr_read;

From aa8eff72842021f52600392b245fb82d113afa8a Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 26 Jan 2024 17:39:19 +0100
Subject: [PATCH 245/707] x86/sme: Fix memory encryption setting if enabled by
 default and not overridden

Commit

  cbebd68f59f0 ("x86/mm: Fix use of uninitialized buffer in sme_enable()")

'fixed' an issue in sme_enable() detected by static analysis, and broke
the common case in the process.

cmdline_find_option() will return < 0 on an error, or when the command
line argument does not appear at all. In this particular case, the
latter is not an error condition, and so the early exit is wrong.

Instead, without mem_encrypt= on the command line, the compile time
default should be honoured, which could be to enable memory encryption,
and this is currently broken.

Fix it by setting sme_me_mask to a preliminary value based on the
compile time default, and only omitting the command line argument test
when cmdline_find_option() returns an error.

  [ bp: Drop active_by_default while at it. ]

Fixes: cbebd68f59f0 ("x86/mm: Fix use of uninitialized buffer in sme_enable()")
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com>
Link: https://lore.kernel.org/r/20240126163918.2908990-2-ardb+git@google.com
---
 arch/x86/mm/mem_encrypt_identity.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index d73aeb16417fcf..7f72472a34d6d9 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -507,7 +507,6 @@ void __init sme_enable(struct boot_params *bp)
 	const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off;
 	unsigned int eax, ebx, ecx, edx;
 	unsigned long feature_mask;
-	bool active_by_default;
 	unsigned long me_mask;
 	char buffer[16];
 	bool snp;
@@ -593,22 +592,19 @@ void __init sme_enable(struct boot_params *bp)
 	     : "p" (sme_cmdline_off));
 
 	if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT))
-		active_by_default = true;
-	else
-		active_by_default = false;
+		sme_me_mask = me_mask;
 
 	cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr |
 				     ((u64)bp->ext_cmd_line_ptr << 32));
 
 	if (cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)) < 0)
-		return;
+		goto out;
 
 	if (!strncmp(buffer, cmdline_on, sizeof(buffer)))
 		sme_me_mask = me_mask;
 	else if (!strncmp(buffer, cmdline_off, sizeof(buffer)))
 		sme_me_mask = 0;
-	else
-		sme_me_mask = active_by_default ? me_mask : 0;
+
 out:
 	if (sme_me_mask) {
 		physical_mask &= ~sme_me_mask;

From 290ea1d6f990c92d5ae599554d8acafdd0ddeb2e Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Thu, 25 Jan 2024 07:46:44 -0600
Subject: [PATCH 246/707] nvmem: fixed-cell: Simplify nested if/then schema

There's no reason to have a nested if/then schema as checking for compatible
being present and containing 'mac-base' can all be done in one 'if' schema.

Signed-off-by: Rob Herring <robh@kernel.org>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
---
 .../bindings/nvmem/layouts/fixed-cell.yaml    | 22 +++++++++----------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/Documentation/devicetree/bindings/nvmem/layouts/fixed-cell.yaml b/Documentation/devicetree/bindings/nvmem/layouts/fixed-cell.yaml
index ac2381e6602790..8b3826243dddfc 100644
--- a/Documentation/devicetree/bindings/nvmem/layouts/fixed-cell.yaml
+++ b/Documentation/devicetree/bindings/nvmem/layouts/fixed-cell.yaml
@@ -36,20 +36,18 @@ properties:
 
 allOf:
   - if:
+      properties:
+        compatible:
+          contains:
+            const: mac-base
       required: [ compatible ]
     then:
-      if:
-        properties:
-          compatible:
-            contains:
-              const: mac-base
-      then:
-        properties:
-          "#nvmem-cell-cells":
-            description: The first argument is a MAC address offset.
-            const: 1
-        required:
-          - "#nvmem-cell-cells"
+      properties:
+        "#nvmem-cell-cells":
+          description: The first argument is a MAC address offset.
+          const: 1
+      required:
+        - "#nvmem-cell-cells"
 
 required:
   - reg

From 4e6102d60d88975bc65a0dde05a8ba096c450249 Mon Sep 17 00:00:00 2001
From: William-tw Lin <william-tw.lin@mediatek.com>
Date: Fri, 22 Dec 2023 16:07:39 +0800
Subject: [PATCH 247/707] nvmem: mtk-efuse: Register MediaTek socinfo driver
 from efuse

The socinfo driver reads chip information from eFuses and does not need
any devicetree node. Register it from mtk-efuse.

While at it, also add the name for this driver's nvmem_config.

Signed-off-by: William-tw Lin <william-tw.lin@mediatek.com>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
---
 drivers/nvmem/mtk-efuse.c | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/drivers/nvmem/mtk-efuse.c b/drivers/nvmem/mtk-efuse.c
index 84f05b40a4112e..f5bebcecf9bd31 100644
--- a/drivers/nvmem/mtk-efuse.c
+++ b/drivers/nvmem/mtk-efuse.c
@@ -68,6 +68,7 @@ static int mtk_efuse_probe(struct platform_device *pdev)
 	struct nvmem_config econfig = {};
 	struct mtk_efuse_priv *priv;
 	const struct mtk_efuse_pdata *pdata;
+	struct platform_device *socinfo;
 
 	priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
 	if (!priv)
@@ -85,11 +86,20 @@ static int mtk_efuse_probe(struct platform_device *pdev)
 	econfig.size = resource_size(res);
 	econfig.priv = priv;
 	econfig.dev = dev;
+	econfig.name = "mtk-efuse";
 	if (pdata->uses_post_processing)
 		econfig.fixup_dt_cell_info = &mtk_efuse_fixup_dt_cell_info;
 	nvmem = devm_nvmem_register(dev, &econfig);
+	if (IS_ERR(nvmem))
+		return PTR_ERR(nvmem);
 
-	return PTR_ERR_OR_ZERO(nvmem);
+	socinfo = platform_device_register_data(&pdev->dev, "mtk-socinfo",
+						PLATFORM_DEVID_AUTO, NULL, 0);
+	if (IS_ERR(socinfo))
+		dev_info(dev, "MediaTek SoC Information will be unavailable\n");
+
+	platform_set_drvdata(pdev, socinfo);
+	return 0;
 }
 
 static const struct mtk_efuse_pdata mtk_mt8186_efuse_pdata = {
@@ -108,8 +118,17 @@ static const struct of_device_id mtk_efuse_of_match[] = {
 };
 MODULE_DEVICE_TABLE(of, mtk_efuse_of_match);
 
+static void mtk_efuse_remove(struct platform_device *pdev)
+{
+	struct platform_device *socinfo = platform_get_drvdata(pdev);
+
+	if (!IS_ERR_OR_NULL(socinfo))
+		platform_device_unregister(socinfo);
+}
+
 static struct platform_driver mtk_efuse_driver = {
 	.probe = mtk_efuse_probe,
+	.remove_new = mtk_efuse_remove,
 	.driver = {
 		.name = "mediatek,efuse",
 		.of_match_table = mtk_efuse_of_match,

From a0cfd5e997824d0bd8c7620d40cdb324121a2fc7 Mon Sep 17 00:00:00 2001
From: Praveen Teja Kundanala <praveen.teja.kundanala@amd.com>
Date: Mon, 8 Jan 2024 10:56:16 +0530
Subject: [PATCH 248/707] dt-bindings: nvmem: Convert xlnx,zynqmp-nvmem.txt to
 yaml

Convert the xlnx,zynqmp-nvmem.txt to yaml.

Signed-off-by: Praveen Teja Kundanala <praveen.teja.kundanala@amd.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
---
 .../bindings/nvmem/xlnx,zynqmp-nvmem.txt      | 46 -------------------
 .../bindings/nvmem/xlnx,zynqmp-nvmem.yaml     | 42 +++++++++++++++++
 2 files changed, 42 insertions(+), 46 deletions(-)
 delete mode 100644 Documentation/devicetree/bindings/nvmem/xlnx,zynqmp-nvmem.txt
 create mode 100644 Documentation/devicetree/bindings/nvmem/xlnx,zynqmp-nvmem.yaml

diff --git a/Documentation/devicetree/bindings/nvmem/xlnx,zynqmp-nvmem.txt b/Documentation/devicetree/bindings/nvmem/xlnx,zynqmp-nvmem.txt
deleted file mode 100644
index 4881561b3a02ac..00000000000000
--- a/Documentation/devicetree/bindings/nvmem/xlnx,zynqmp-nvmem.txt
+++ /dev/null
@@ -1,46 +0,0 @@
---------------------------------------------------------------------------
-=  Zynq UltraScale+ MPSoC nvmem firmware driver binding =
---------------------------------------------------------------------------
-The nvmem_firmware node provides access to the hardware related data
-like soc revision, IDCODE... etc, By using the firmware interface.
-
-Required properties:
-- compatible: should be "xlnx,zynqmp-nvmem-fw"
-
-= Data cells =
-Are child nodes of silicon id, bindings of which as described in
-bindings/nvmem/nvmem.txt
-
--------
- Example
--------
-firmware {
-	zynqmp_firmware: zynqmp-firmware {
-		compatible = "xlnx,zynqmp-firmware";
-		method = "smc";
-
-		nvmem_firmware {
-			compatible = "xlnx,zynqmp-nvmem-fw";
-			#address-cells = <1>;
-			#size-cells = <1>;
-
-			/* Data cells */
-			soc_revision: soc_revision {
-				reg = <0x0 0x4>;
-			};
-		};
-	};
-};
-
-= Data consumers =
-Are device nodes which consume nvmem data cells.
-
-For example:
-	pcap {
-		...
-
-		nvmem-cells = <&soc_revision>;
-		nvmem-cell-names = "soc_revision";
-
-		...
-	};
diff --git a/Documentation/devicetree/bindings/nvmem/xlnx,zynqmp-nvmem.yaml b/Documentation/devicetree/bindings/nvmem/xlnx,zynqmp-nvmem.yaml
new file mode 100644
index 00000000000000..917c40d5c382f4
--- /dev/null
+++ b/Documentation/devicetree/bindings/nvmem/xlnx,zynqmp-nvmem.yaml
@@ -0,0 +1,42 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/nvmem/xlnx,zynqmp-nvmem.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Zynq UltraScale+ MPSoC Non Volatile Memory interface
+
+description: |
+    The ZynqMP MPSoC provides access to the hardware related data
+    like SOC revision, IDCODE and specific purpose efuses.
+
+maintainers:
+  - Kalyani Akula <kalyani.akula@amd.com>
+  - Praveen Teja Kundanala <praveen.teja.kundanala@amd.com>
+
+allOf:
+  - $ref: nvmem.yaml#
+
+properties:
+  compatible:
+    const: xlnx,zynqmp-nvmem-fw
+
+required:
+  - compatible
+
+unevaluatedProperties: false
+
+examples:
+  - |
+    nvmem {
+        compatible = "xlnx,zynqmp-nvmem-fw";
+        nvmem-layout {
+            compatible = "fixed-layout";
+            #address-cells = <1>;
+            #size-cells = <1>;
+
+            soc_revision: soc-revision@0 {
+                reg = <0x0 0x4>;
+            };
+        };
+    };

From 2f423b541ace886a109d4a03799ef14c22129ccb Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sat, 27 Jan 2024 16:02:54 +0100
Subject: [PATCH 249/707] hwmon: Remove I2C_CLASS_HWMON from drivers w/o
 detect() and address_list

Class-based I2C probing requires detect() and address_list to be
set in the I2C client driver, see checks in i2c_detect().
It's misleading to declare I2C_CLASS_HWMON support if this
precondition isn't met.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://lore.kernel.org/r/75747c6a-d414-4b07-8f66-5a5cdddc3c36@gmail.com
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 drivers/hwmon/adm1177.c       | 1 -
 drivers/hwmon/ds1621.c        | 1 -
 drivers/hwmon/ds620.c         | 1 -
 drivers/hwmon/ina209.c        | 1 -
 drivers/hwmon/ina238.c        | 1 -
 drivers/hwmon/max127.c        | 1 -
 drivers/hwmon/max31760.c      | 1 -
 drivers/hwmon/max31790.c      | 1 -
 drivers/hwmon/max31827.c      | 1 -
 drivers/hwmon/max6621.c       | 1 -
 drivers/hwmon/max6697.c       | 1 -
 drivers/hwmon/occ/p8_i2c.c    | 1 -
 drivers/hwmon/pmbus/ir36021.c | 1 -
 drivers/hwmon/powr1220.c      | 1 -
 drivers/hwmon/sbrmi.c         | 1 -
 drivers/hwmon/sbtsi_temp.c    | 1 -
 drivers/hwmon/w83773g.c       | 1 -
 17 files changed, 17 deletions(-)

diff --git a/drivers/hwmon/adm1177.c b/drivers/hwmon/adm1177.c
index 60a893f271597b..3390102d2d4acf 100644
--- a/drivers/hwmon/adm1177.c
+++ b/drivers/hwmon/adm1177.c
@@ -250,7 +250,6 @@ static const struct of_device_id adm1177_dt_ids[] = {
 MODULE_DEVICE_TABLE(of, adm1177_dt_ids);
 
 static struct i2c_driver adm1177_driver = {
-	.class = I2C_CLASS_HWMON,
 	.driver = {
 		.name = "adm1177",
 		.of_match_table = adm1177_dt_ids,
diff --git a/drivers/hwmon/ds1621.c b/drivers/hwmon/ds1621.c
index 21b6350465211f..bffbc80401718d 100644
--- a/drivers/hwmon/ds1621.c
+++ b/drivers/hwmon/ds1621.c
@@ -380,7 +380,6 @@ MODULE_DEVICE_TABLE(i2c, ds1621_id);
 
 /* This is the driver that will be inserted */
 static struct i2c_driver ds1621_driver = {
-	.class		= I2C_CLASS_HWMON,
 	.driver = {
 		.name	= "ds1621",
 	},
diff --git a/drivers/hwmon/ds620.c b/drivers/hwmon/ds620.c
index 2b09536630cb6a..4fc4df012fac70 100644
--- a/drivers/hwmon/ds620.c
+++ b/drivers/hwmon/ds620.c
@@ -241,7 +241,6 @@ MODULE_DEVICE_TABLE(i2c, ds620_id);
 
 /* This is the driver that will be inserted */
 static struct i2c_driver ds620_driver = {
-	.class = I2C_CLASS_HWMON,
 	.driver = {
 		   .name = "ds620",
 	},
diff --git a/drivers/hwmon/ina209.c b/drivers/hwmon/ina209.c
index c558143e5285da..d9b57a4b3e41fb 100644
--- a/drivers/hwmon/ina209.c
+++ b/drivers/hwmon/ina209.c
@@ -589,7 +589,6 @@ MODULE_DEVICE_TABLE(of, ina209_of_match);
 
 /* This is the driver that will be inserted */
 static struct i2c_driver ina209_driver = {
-	.class		= I2C_CLASS_HWMON,
 	.driver = {
 		.name	= "ina209",
 		.of_match_table = of_match_ptr(ina209_of_match),
diff --git a/drivers/hwmon/ina238.c b/drivers/hwmon/ina238.c
index ca9f5d2c811bb6..69289293bc38cc 100644
--- a/drivers/hwmon/ina238.c
+++ b/drivers/hwmon/ina238.c
@@ -629,7 +629,6 @@ static const struct of_device_id __maybe_unused ina238_of_match[] = {
 MODULE_DEVICE_TABLE(of, ina238_of_match);
 
 static struct i2c_driver ina238_driver = {
-	.class		= I2C_CLASS_HWMON,
 	.driver = {
 		.name	= "ina238",
 		.of_match_table = of_match_ptr(ina238_of_match),
diff --git a/drivers/hwmon/max127.c b/drivers/hwmon/max127.c
index ee5ead06d612df..da2289e3560aa0 100644
--- a/drivers/hwmon/max127.c
+++ b/drivers/hwmon/max127.c
@@ -335,7 +335,6 @@ static const struct i2c_device_id max127_id[] = {
 MODULE_DEVICE_TABLE(i2c, max127_id);
 
 static struct i2c_driver max127_driver = {
-	.class		= I2C_CLASS_HWMON,
 	.driver = {
 		.name	= "max127",
 	},
diff --git a/drivers/hwmon/max31760.c b/drivers/hwmon/max31760.c
index 79945eb466ae43..1b6f71bc61cb54 100644
--- a/drivers/hwmon/max31760.c
+++ b/drivers/hwmon/max31760.c
@@ -578,7 +578,6 @@ static DEFINE_SIMPLE_DEV_PM_OPS(max31760_pm_ops, max31760_suspend,
 				max31760_resume);
 
 static struct i2c_driver max31760_driver = {
-	.class		= I2C_CLASS_HWMON,
 	.driver = {
 		.name	= "max31760",
 		.of_match_table = max31760_of_match,
diff --git a/drivers/hwmon/max31790.c b/drivers/hwmon/max31790.c
index 0cd44c1e998a2e..3dc95196b229ad 100644
--- a/drivers/hwmon/max31790.c
+++ b/drivers/hwmon/max31790.c
@@ -543,7 +543,6 @@ static const struct i2c_device_id max31790_id[] = {
 MODULE_DEVICE_TABLE(i2c, max31790_id);
 
 static struct i2c_driver max31790_driver = {
-	.class		= I2C_CLASS_HWMON,
 	.probe		= max31790_probe,
 	.driver = {
 		.name	= "max31790",
diff --git a/drivers/hwmon/max31827.c b/drivers/hwmon/max31827.c
index 4a8c3e37c5d323..f8a13b30f100f2 100644
--- a/drivers/hwmon/max31827.c
+++ b/drivers/hwmon/max31827.c
@@ -652,7 +652,6 @@ static const struct of_device_id max31827_of_match[] = {
 MODULE_DEVICE_TABLE(of, max31827_of_match);
 
 static struct i2c_driver max31827_driver = {
-	.class = I2C_CLASS_HWMON,
 	.driver = {
 		.name = "max31827",
 		.of_match_table = max31827_of_match,
diff --git a/drivers/hwmon/max6621.c b/drivers/hwmon/max6621.c
index af7e6268589831..05426cde0e3635 100644
--- a/drivers/hwmon/max6621.c
+++ b/drivers/hwmon/max6621.c
@@ -549,7 +549,6 @@ static const struct of_device_id __maybe_unused max6621_of_match[] = {
 MODULE_DEVICE_TABLE(of, max6621_of_match);
 
 static struct i2c_driver max6621_driver = {
-	.class		= I2C_CLASS_HWMON,
 	.driver = {
 		.name = MAX6621_DRV_NAME,
 		.of_match_table = of_match_ptr(max6621_of_match),
diff --git a/drivers/hwmon/max6697.c b/drivers/hwmon/max6697.c
index 7d10dd434f2e11..d161ba0e7813cd 100644
--- a/drivers/hwmon/max6697.c
+++ b/drivers/hwmon/max6697.c
@@ -780,7 +780,6 @@ static const struct of_device_id __maybe_unused max6697_of_match[] = {
 MODULE_DEVICE_TABLE(of, max6697_of_match);
 
 static struct i2c_driver max6697_driver = {
-	.class = I2C_CLASS_HWMON,
 	.driver = {
 		.name	= "max6697",
 		.of_match_table = of_match_ptr(max6697_of_match),
diff --git a/drivers/hwmon/occ/p8_i2c.c b/drivers/hwmon/occ/p8_i2c.c
index 06095975f5c8d2..31159606cec7b4 100644
--- a/drivers/hwmon/occ/p8_i2c.c
+++ b/drivers/hwmon/occ/p8_i2c.c
@@ -241,7 +241,6 @@ static const struct of_device_id p8_i2c_occ_of_match[] = {
 MODULE_DEVICE_TABLE(of, p8_i2c_occ_of_match);
 
 static struct i2c_driver p8_i2c_occ_driver = {
-	.class = I2C_CLASS_HWMON,
 	.driver = {
 		.name = "occ-hwmon",
 		.of_match_table = p8_i2c_occ_of_match,
diff --git a/drivers/hwmon/pmbus/ir36021.c b/drivers/hwmon/pmbus/ir36021.c
index 382ba6b6031a03..a263afeb8ac1e1 100644
--- a/drivers/hwmon/pmbus/ir36021.c
+++ b/drivers/hwmon/pmbus/ir36021.c
@@ -63,7 +63,6 @@ static const struct of_device_id __maybe_unused ir36021_of_id[] = {
 MODULE_DEVICE_TABLE(of, ir36021_of_id);
 
 static struct i2c_driver ir36021_driver = {
-	.class = I2C_CLASS_HWMON,
 	.driver = {
 		.name = "ir36021",
 		.of_match_table = of_match_ptr(ir36021_of_id),
diff --git a/drivers/hwmon/powr1220.c b/drivers/hwmon/powr1220.c
index 4120cadb00aebc..2388d0565e7ec7 100644
--- a/drivers/hwmon/powr1220.c
+++ b/drivers/hwmon/powr1220.c
@@ -323,7 +323,6 @@ static const struct i2c_device_id powr1220_ids[] = {
 MODULE_DEVICE_TABLE(i2c, powr1220_ids);
 
 static struct i2c_driver powr1220_driver = {
-	.class		= I2C_CLASS_HWMON,
 	.driver = {
 		.name	= "powr1220",
 	},
diff --git a/drivers/hwmon/sbrmi.c b/drivers/hwmon/sbrmi.c
index 484703f0ea5f88..4318f5121145e0 100644
--- a/drivers/hwmon/sbrmi.c
+++ b/drivers/hwmon/sbrmi.c
@@ -342,7 +342,6 @@ static const struct of_device_id __maybe_unused sbrmi_of_match[] = {
 MODULE_DEVICE_TABLE(of, sbrmi_of_match);
 
 static struct i2c_driver sbrmi_driver = {
-	.class = I2C_CLASS_HWMON,
 	.driver = {
 		.name = "sbrmi",
 		.of_match_table = of_match_ptr(sbrmi_of_match),
diff --git a/drivers/hwmon/sbtsi_temp.c b/drivers/hwmon/sbtsi_temp.c
index dd85cf89f008a9..a4181acb1aa6b5 100644
--- a/drivers/hwmon/sbtsi_temp.c
+++ b/drivers/hwmon/sbtsi_temp.c
@@ -232,7 +232,6 @@ static const struct of_device_id __maybe_unused sbtsi_of_match[] = {
 MODULE_DEVICE_TABLE(of, sbtsi_of_match);
 
 static struct i2c_driver sbtsi_driver = {
-	.class = I2C_CLASS_HWMON,
 	.driver = {
 		.name = "sbtsi",
 		.of_match_table = of_match_ptr(sbtsi_of_match),
diff --git a/drivers/hwmon/w83773g.c b/drivers/hwmon/w83773g.c
index 045eea8378c2da..401a28f55f931f 100644
--- a/drivers/hwmon/w83773g.c
+++ b/drivers/hwmon/w83773g.c
@@ -290,7 +290,6 @@ static int w83773_probe(struct i2c_client *client)
 }
 
 static struct i2c_driver w83773_driver = {
-	.class = I2C_CLASS_HWMON,
 	.driver = {
 		.name	= "w83773g",
 		.of_match_table = of_match_ptr(w83773_of_match),

From 10e70ab10802e50ff6432964dc289d2bf93c2693 Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Sat, 27 Jan 2024 07:40:58 -0800
Subject: [PATCH 250/707] MAINTAINERS: Drop entries for hwmon devices with
 unreachable maintainers

Drop maintainer entries for MAX31760 and MAX31827 since the e-mail
addresses of their maintainers is no longer reachable and there is
no known alternative means to contact them.

HWMON drivers have a subsystem maintainer, so individual maintainer
entries are not mandatory.

Reported-by: Heiner Kallweit <hkallweit1@gmail.com>
Cc: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 MAINTAINERS | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 8d1052fa6a6924..5e7239cb40ea6a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1384,15 +1384,6 @@ F:	drivers/iio/amplifiers/hmc425a.c
 F:	drivers/staging/iio/*/ad*
 X:	drivers/iio/*/adjd*
 
-ANALOG DEVICES INC MAX31760 DRIVER
-M:	Ibrahim Tilki <Ibrahim.Tilki@analog.com>
-S:	Maintained
-W:	http://wiki.analog.com/
-W:	https://ez.analog.com/linux-software-drivers
-F:	Documentation/devicetree/bindings/hwmon/adi,max31760.yaml
-F:	Documentation/hwmon/max31760.rst
-F:	drivers/hwmon/max31760.c
-
 ANALOGBITS PLL LIBRARIES
 M:	Paul Walmsley <paul.walmsley@sifive.com>
 S:	Supported
@@ -13130,15 +13121,6 @@ F:	Documentation/userspace-api/media/drivers/max2175.rst
 F:	drivers/media/i2c/max2175*
 F:	include/uapi/linux/max2175.h
 
-MAX31827 TEMPERATURE SWITCH DRIVER
-M:	Daniel Matyas <daniel.matyas@analog.com>
-L:	linux-hwmon@vger.kernel.org
-S:	Supported
-W:	https://ez.analog.com/linux-software-drivers
-F:	Documentation/devicetree/bindings/hwmon/adi,max31827.yaml
-F:	Documentation/hwmon/max31827.rst
-F:	drivers/hwmon/max31827.c
-
 MAX31335 RTC DRIVER
 M:	Antoniu Miclaus <antoniu.miclaus@analog.com>
 L:	linux-rtc@vger.kernel.org

From fb27638836ac2b4334bae9421b04563c1cd74a39 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de>
Date: Fri, 26 Jan 2024 13:04:33 +0100
Subject: [PATCH 251/707] pwm: atmel-hlcdc: Fix clock imbalance related to
 suspend support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The suspend callback disables the periph clock when the PWM is enabled
and resume reenables this clock if the PWM was disabled before. Judging
from the code comment it's suspend that is wrong here. Fix accordingly.

Fixes: f9bb9da7c09d ("pwm: atmel-hlcdc: Implement the suspend/resume hooks")
Link: https://lore.kernel.org/r/b51ea92b0a45eff3dc83b08adefd43d930df996c.1706269232.git.u.kleine-koenig@pengutronix.de
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
---
 drivers/pwm/pwm-atmel-hlcdc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/pwm/pwm-atmel-hlcdc.c b/drivers/pwm/pwm-atmel-hlcdc.c
index 3f2c5031a3ba85..1f6fc9a9fcf3ec 100644
--- a/drivers/pwm/pwm-atmel-hlcdc.c
+++ b/drivers/pwm/pwm-atmel-hlcdc.c
@@ -185,7 +185,7 @@ static int atmel_hlcdc_pwm_suspend(struct device *dev)
 	struct atmel_hlcdc_pwm *atmel = dev_get_drvdata(dev);
 
 	/* Keep the periph clock enabled if the PWM is still running. */
-	if (pwm_is_enabled(&atmel->chip.pwms[0]))
+	if (!pwm_is_enabled(&atmel->chip.pwms[0]))
 		clk_disable_unprepare(atmel->hlcdc->periph_clk);
 
 	return 0;

From 5043456b2de107ae65dcff7f49caf6c8f99169b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de>
Date: Tue, 9 Jan 2024 22:34:32 +0100
Subject: [PATCH 252/707] pwm: Drop useless member .of_pwm_n_cells of struct
 pwm_chip
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Apart from the two of_xlate implementations this member is write-only.
In the of_xlate functions of_pwm_xlate_with_flags() and
of_pwm_single_xlate() it's more sensible to check for args->args_count
because this is what is actually used in the device tree.

Acked-by: Douglas Anderson <dianders@chromium.org>
Link: https://lore.kernel.org/r/53d8c545aa8f79a920358be9e72e382b3981bdc4.1704835845.git.u.kleine-koenig@pengutronix.de
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
---
 drivers/gpu/drm/bridge/ti-sn65dsi86.c |  1 -
 drivers/pwm/core.c                    | 22 +++-------------------
 drivers/pwm/pwm-clps711x.c            |  1 -
 drivers/pwm/pwm-cros-ec.c             |  1 -
 drivers/pwm/pwm-pxa.c                 |  4 +---
 include/linux/pwm.h                   |  2 --
 6 files changed, 4 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/bridge/ti-sn65dsi86.c b/drivers/gpu/drm/bridge/ti-sn65dsi86.c
index 62cc3893dca5d3..1f6e929c2f6a3c 100644
--- a/drivers/gpu/drm/bridge/ti-sn65dsi86.c
+++ b/drivers/gpu/drm/bridge/ti-sn65dsi86.c
@@ -1591,7 +1591,6 @@ static int ti_sn_pwm_probe(struct auxiliary_device *adev,
 	pdata->pchip.ops = &ti_sn_pwm_ops;
 	pdata->pchip.npwm = 1;
 	pdata->pchip.of_xlate = of_pwm_single_xlate;
-	pdata->pchip.of_pwm_n_cells = 1;
 
 	devm_pm_runtime_enable(&adev->dev);
 
diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index f2728ee787d7a5..31f210872a079f 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -107,9 +107,6 @@ of_pwm_xlate_with_flags(struct pwm_chip *chip, const struct of_phandle_args *arg
 {
 	struct pwm_device *pwm;
 
-	if (chip->of_pwm_n_cells < 2)
-		return ERR_PTR(-EINVAL);
-
 	/* flags in the third cell are optional */
 	if (args->args_count < 2)
 		return ERR_PTR(-EINVAL);
@@ -124,10 +121,8 @@ of_pwm_xlate_with_flags(struct pwm_chip *chip, const struct of_phandle_args *arg
 	pwm->args.period = args->args[1];
 	pwm->args.polarity = PWM_POLARITY_NORMAL;
 
-	if (chip->of_pwm_n_cells >= 3) {
-		if (args->args_count > 2 && args->args[2] & PWM_POLARITY_INVERTED)
-			pwm->args.polarity = PWM_POLARITY_INVERSED;
-	}
+	if (args->args_count > 2 && args->args[2] & PWM_POLARITY_INVERTED)
+		pwm->args.polarity = PWM_POLARITY_INVERSED;
 
 	return pwm;
 }
@@ -138,9 +133,6 @@ of_pwm_single_xlate(struct pwm_chip *chip, const struct of_phandle_args *args)
 {
 	struct pwm_device *pwm;
 
-	if (chip->of_pwm_n_cells < 1)
-		return ERR_PTR(-EINVAL);
-
 	/* validate that one cell is specified, optionally with flags */
 	if (args->args_count != 1 && args->args_count != 2)
 		return ERR_PTR(-EINVAL);
@@ -164,16 +156,8 @@ static void of_pwmchip_add(struct pwm_chip *chip)
 	if (!chip->dev || !chip->dev->of_node)
 		return;
 
-	if (!chip->of_xlate) {
-		u32 pwm_cells;
-
-		if (of_property_read_u32(chip->dev->of_node, "#pwm-cells",
-					 &pwm_cells))
-			pwm_cells = 2;
-
+	if (!chip->of_xlate)
 		chip->of_xlate = of_pwm_xlate_with_flags;
-		chip->of_pwm_n_cells = pwm_cells;
-	}
 
 	of_node_get(chip->dev->of_node);
 }
diff --git a/drivers/pwm/pwm-clps711x.c b/drivers/pwm/pwm-clps711x.c
index 42179b3f7ec399..06562d4bb9633e 100644
--- a/drivers/pwm/pwm-clps711x.c
+++ b/drivers/pwm/pwm-clps711x.c
@@ -103,7 +103,6 @@ static int clps711x_pwm_probe(struct platform_device *pdev)
 	priv->chip.dev = &pdev->dev;
 	priv->chip.npwm = 2;
 	priv->chip.of_xlate = clps711x_pwm_xlate;
-	priv->chip.of_pwm_n_cells = 1;
 
 	spin_lock_init(&priv->lock);
 
diff --git a/drivers/pwm/pwm-cros-ec.c b/drivers/pwm/pwm-cros-ec.c
index 5fe303b8656def..339cedf3a7b18b 100644
--- a/drivers/pwm/pwm-cros-ec.c
+++ b/drivers/pwm/pwm-cros-ec.c
@@ -279,7 +279,6 @@ static int cros_ec_pwm_probe(struct platform_device *pdev)
 	chip->dev = dev;
 	chip->ops = &cros_ec_pwm_ops;
 	chip->of_xlate = cros_ec_pwm_xlate;
-	chip->of_pwm_n_cells = 1;
 
 	if (ec_pwm->use_pwm_type) {
 		chip->npwm = CROS_EC_PWM_DT_COUNT;
diff --git a/drivers/pwm/pwm-pxa.c b/drivers/pwm/pwm-pxa.c
index 76685f926c7586..61b74fa1d3481a 100644
--- a/drivers/pwm/pwm-pxa.c
+++ b/drivers/pwm/pwm-pxa.c
@@ -180,10 +180,8 @@ static int pwm_probe(struct platform_device *pdev)
 	pc->chip.ops = &pxa_pwm_ops;
 	pc->chip.npwm = (id->driver_data & HAS_SECONDARY_PWM) ? 2 : 1;
 
-	if (IS_ENABLED(CONFIG_OF)) {
+	if (IS_ENABLED(CONFIG_OF))
 		pc->chip.of_xlate = of_pwm_single_xlate;
-		pc->chip.of_pwm_n_cells = 1;
-	}
 
 	pc->mmio_base = devm_platform_ioremap_resource(pdev, 0);
 	if (IS_ERR(pc->mmio_base))
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index fcc2c4496f7316..8ffe9ae7a23a95 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -271,7 +271,6 @@ struct pwm_ops {
  * @id: unique number of this PWM chip
  * @npwm: number of PWMs controlled by this chip
  * @of_xlate: request a PWM device given a device tree PWM specifier
- * @of_pwm_n_cells: number of cells expected in the device tree PWM specifier
  * @atomic: can the driver's ->apply() be called in atomic context
  * @pwms: array of PWM devices allocated by the framework
  */
@@ -284,7 +283,6 @@ struct pwm_chip {
 
 	struct pwm_device * (*of_xlate)(struct pwm_chip *chip,
 					const struct of_phandle_args *args);
-	unsigned int of_pwm_n_cells;
 	bool atomic;
 
 	/* only used internally by the PWM framework */

From 4f0f7dce06cd1cd1eb3cca11561634f5cd44b0b5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de>
Date: Tue, 9 Jan 2024 22:34:33 +0100
Subject: [PATCH 253/707] pwm: Let the of_xlate callbacks accept references
 without period
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With this extension of_pwm_xlate_with_flags() is suitable to replace the
custom xlate function of the pwm-clps711x driver.

While touching these very similar functions align their implementations.

Link: https://lore.kernel.org/r/127622315d07d9d419ae8e6373c7e5be7fab7a62.1704835845.git.u.kleine-koenig@pengutronix.de
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
---
 drivers/pwm/core.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index 31f210872a079f..606d9ef0c70974 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -107,8 +107,8 @@ of_pwm_xlate_with_flags(struct pwm_chip *chip, const struct of_phandle_args *arg
 {
 	struct pwm_device *pwm;
 
-	/* flags in the third cell are optional */
-	if (args->args_count < 2)
+	/* period in the second cell and flags in the third cell are optional */
+	if (args->args_count < 1)
 		return ERR_PTR(-EINVAL);
 
 	if (args->args[0] >= chip->npwm)
@@ -118,9 +118,10 @@ of_pwm_xlate_with_flags(struct pwm_chip *chip, const struct of_phandle_args *arg
 	if (IS_ERR(pwm))
 		return pwm;
 
-	pwm->args.period = args->args[1];
-	pwm->args.polarity = PWM_POLARITY_NORMAL;
+	if (args->args_count > 1)
+		pwm->args.period = args->args[1];
 
+	pwm->args.polarity = PWM_POLARITY_NORMAL;
 	if (args->args_count > 2 && args->args[2] & PWM_POLARITY_INVERTED)
 		pwm->args.polarity = PWM_POLARITY_INVERSED;
 
@@ -133,18 +134,15 @@ of_pwm_single_xlate(struct pwm_chip *chip, const struct of_phandle_args *args)
 {
 	struct pwm_device *pwm;
 
-	/* validate that one cell is specified, optionally with flags */
-	if (args->args_count != 1 && args->args_count != 2)
-		return ERR_PTR(-EINVAL);
-
 	pwm = pwm_request_from_chip(chip, 0, NULL);
 	if (IS_ERR(pwm))
 		return pwm;
 
-	pwm->args.period = args->args[0];
-	pwm->args.polarity = PWM_POLARITY_NORMAL;
+	if (args->args_count > 1)
+		pwm->args.period = args->args[0];
 
-	if (args->args_count == 2 && args->args[1] & PWM_POLARITY_INVERTED)
+	pwm->args.polarity = PWM_POLARITY_NORMAL;
+	if (args->args_count > 1 && args->args[1] & PWM_POLARITY_INVERTED)
 		pwm->args.polarity = PWM_POLARITY_INVERSED;
 
 	return pwm;

From d1d29cd19653cd94e0f5aab152b46a539b1725e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de>
Date: Tue, 9 Jan 2024 22:34:34 +0100
Subject: [PATCH 254/707] pwm: clps711x: Drop custom .of_xlate() callback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The default of_xlate callback (of_pwm_xlate_with_flags()) does
everything the drivers expects from its .of_xlate() callback. So drop
the custom implementation.

Link: https://lore.kernel.org/r/f58336c298d536107de5cab6a57e19f957ab326c.1704835845.git.u.kleine-koenig@pengutronix.de
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
---
 drivers/pwm/pwm-clps711x.c | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/drivers/pwm/pwm-clps711x.c b/drivers/pwm/pwm-clps711x.c
index 06562d4bb9633e..f3b4af7963bede 100644
--- a/drivers/pwm/pwm-clps711x.c
+++ b/drivers/pwm/pwm-clps711x.c
@@ -74,15 +74,6 @@ static const struct pwm_ops clps711x_pwm_ops = {
 	.apply = clps711x_pwm_apply,
 };
 
-static struct pwm_device *clps711x_pwm_xlate(struct pwm_chip *chip,
-					     const struct of_phandle_args *args)
-{
-	if (args->args[0] >= chip->npwm)
-		return ERR_PTR(-EINVAL);
-
-	return pwm_request_from_chip(chip, args->args[0], NULL);
-}
-
 static int clps711x_pwm_probe(struct platform_device *pdev)
 {
 	struct clps711x_chip *priv;
@@ -102,7 +93,6 @@ static int clps711x_pwm_probe(struct platform_device *pdev)
 	priv->chip.ops = &clps711x_pwm_ops;
 	priv->chip.dev = &pdev->dev;
 	priv->chip.npwm = 2;
-	priv->chip.of_xlate = clps711x_pwm_xlate;
 
 	spin_lock_init(&priv->lock);
 

From 966ed5ea39e654bdefd2b3b7d9b91b5fa87ed84b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de>
Date: Tue, 9 Jan 2024 22:34:35 +0100
Subject: [PATCH 255/707] pwm: Drop duplicate check against chip->npwm in
 of_pwm_xlate_with_flags()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

args->args[0] is passed as parameter "index" to pwm_request_from_chip().
The latter function also checks for index >= npwm, so
of_pwm_xlate_with_flags() doesn't need to do that.

Link: https://lore.kernel.org/r/b06e445a6ed62a339add727eccb969a33d678386.1704835845.git.u.kleine-koenig@pengutronix.de
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
---
 drivers/pwm/core.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index 606d9ef0c70974..b025d90e201c94 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -111,9 +111,6 @@ of_pwm_xlate_with_flags(struct pwm_chip *chip, const struct of_phandle_args *arg
 	if (args->args_count < 1)
 		return ERR_PTR(-EINVAL);
 
-	if (args->args[0] >= chip->npwm)
-		return ERR_PTR(-EINVAL);
-
 	pwm = pwm_request_from_chip(chip, args->args[0], NULL);
 	if (IS_ERR(pwm))
 		return pwm;

From c7005dd95719942b21dce6f02a94b6d412550714 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de>
Date: Thu, 25 Jan 2024 09:56:49 +0100
Subject: [PATCH 256/707] pwm: mediatek: Update kernel doc for struct
 pwm_mediatek_of_data
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The struct never had a member called clk_freq. This fixes the W=1
warning:

	drivers/pwm/pwm-mediatek.c:60: warning: Excess struct member 'clk_freq' description in 'pwm_mediatek_chip'

Fixes: efecdeb82f21 ("pwm: mediatek: Allocate the clks array dynamically")
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Link: https://lore.kernel.org/r/20240125085649.1571268-2-u.kleine-koenig@pengutronix.de
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
---
 drivers/pwm/pwm-mediatek.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/pwm/pwm-mediatek.c b/drivers/pwm/pwm-mediatek.c
index 17d290f847af60..562102a47ac044 100644
--- a/drivers/pwm/pwm-mediatek.c
+++ b/drivers/pwm/pwm-mediatek.c
@@ -47,7 +47,6 @@ struct pwm_mediatek_of_data {
  * @clk_top: the top clock generator
  * @clk_main: the clock used by PWM core
  * @clk_pwms: the clock used by each PWM channel
- * @clk_freq: the fix clock frequency of legacy MIPS SoC
  * @soc: pointer to chip's platform data
  */
 struct pwm_mediatek_chip {

From 979c6fe7e799d2cab0a99c4b8c41cc48f10aca0c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Duje=20Mihanovi=C4=87?= <duje.mihanovic@skole.hr>
Date: Sun, 7 Jan 2024 12:46:59 +0100
Subject: [PATCH 257/707] dt-bindings: pxa-pwm: Convert to YAML
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Convert the PXA PWM binding file from TXT to YAML.

The original binding does not mention any clocks, but the PWM controller
will not probe without a clock.

Reviewed-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Duje Mihanović <duje.mihanovic@skole.hr>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20240107-pxa-pwm-yaml-v3-1-92ac90911c3f@skole.hr
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
---
 .../bindings/pwm/marvell,pxa-pwm.yaml         | 51 +++++++++++++++++++
 .../devicetree/bindings/pwm/pxa-pwm.txt       | 30 -----------
 2 files changed, 51 insertions(+), 30 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/pwm/marvell,pxa-pwm.yaml
 delete mode 100644 Documentation/devicetree/bindings/pwm/pxa-pwm.txt

diff --git a/Documentation/devicetree/bindings/pwm/marvell,pxa-pwm.yaml b/Documentation/devicetree/bindings/pwm/marvell,pxa-pwm.yaml
new file mode 100644
index 00000000000000..ba6325575ea040
--- /dev/null
+++ b/Documentation/devicetree/bindings/pwm/marvell,pxa-pwm.yaml
@@ -0,0 +1,51 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/pwm/marvell,pxa-pwm.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Marvell PXA PWM
+
+maintainers:
+  - Duje Mihanović <duje.mihanovic@skole.hr>
+
+allOf:
+  - $ref: pwm.yaml#
+
+properties:
+  compatible:
+    enum:
+      - marvell,pxa250-pwm
+      - marvell,pxa270-pwm
+      - marvell,pxa168-pwm
+      - marvell,pxa910-pwm
+
+  reg:
+    # Length should be 0x10
+    maxItems: 1
+
+  "#pwm-cells":
+    # Used for specifying the period length in nanoseconds
+    const: 1
+
+  clocks:
+    maxItems: 1
+
+required:
+  - compatible
+  - reg
+  - "#pwm-cells"
+  - clocks
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/clock/pxa-clock.h>
+
+    pwm0: pwm@40b00000 {
+      compatible = "marvell,pxa250-pwm";
+      reg = <0x40b00000 0x10>;
+      #pwm-cells = <1>;
+      clocks = <&clks CLK_PWM0>;
+    };
diff --git a/Documentation/devicetree/bindings/pwm/pxa-pwm.txt b/Documentation/devicetree/bindings/pwm/pxa-pwm.txt
deleted file mode 100644
index 5ae9f1e3c33896..00000000000000
--- a/Documentation/devicetree/bindings/pwm/pxa-pwm.txt
+++ /dev/null
@@ -1,30 +0,0 @@
-Marvell PWM controller
-
-Required properties:
-- compatible: should be one or more of:
-  - "marvell,pxa250-pwm"
-  - "marvell,pxa270-pwm"
-  - "marvell,pxa168-pwm"
-  - "marvell,pxa910-pwm"
-- reg: Physical base address and length of the registers used by the PWM channel
-  Note that one device instance must be created for each PWM that is used, so the
-  length covers only the register window for one PWM output, not that of the
-  entire PWM controller.  Currently length is 0x10 for all supported devices.
-- #pwm-cells: Should be 1.  This cell is used to specify the period in
-  nanoseconds.
-
-Example PWM device node:
-
-pwm0: pwm@40b00000 {
-	compatible = "marvell,pxa250-pwm";
-	reg = <0x40b00000 0x10>;
-	#pwm-cells = <1>;
-};
-
-Example PWM client node:
-
-backlight {
-	compatible = "pwm-backlight";
-	pwms = <&pwm0 5000000>;
-	...
-}

From 909d8d33f8b4664c9b6c7fd585114921af77fc2b Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sat, 27 Jan 2024 18:45:00 +0100
Subject: [PATCH 258/707] hwmon: Drop non-functional I2C_CLASS_HWMON support
 for drivers w/o detect()

Class-based I2C probing requires detect() and address_list both
to be set in the I2C client driver, see checks in i2c_detect().
It's misleading to declare I2C_CLASS_HWMON support if the driver
doesn't implement detect().
Class-based probing is a legacy mechanism, in addition apparently
nobody ever noticed that class-based probing has been non-functional
in both drivers from the very beginning. So drop the fragments of
class-based probing support.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://lore.kernel.org/r/13ce7c11-a958-4892-ada9-faf5bfdcb89d@gmail.com
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 Documentation/hwmon/emc2305.rst | 1 -
 drivers/hwmon/adt7410.c         | 2 --
 drivers/hwmon/emc2305.c         | 5 -----
 3 files changed, 8 deletions(-)

diff --git a/Documentation/hwmon/emc2305.rst b/Documentation/hwmon/emc2305.rst
index 2403dbaf272891..d0bfffe463587b 100644
--- a/Documentation/hwmon/emc2305.rst
+++ b/Documentation/hwmon/emc2305.rst
@@ -6,7 +6,6 @@ Kernel driver emc2305
 Supported chips:
    Microchip EMC2305, EMC2303, EMC2302, EMC2301
 
-   Addresses scanned: I2C 0x27, 0x2c, 0x2d, 0x2e, 0x2f, 0x4c, 0x4d
    Prefixes: 'emc2305'
 
    Datasheet: Publicly available at the Microchip website :
diff --git a/drivers/hwmon/adt7410.c b/drivers/hwmon/adt7410.c
index 95250677933650..fd214d9b3a895a 100644
--- a/drivers/hwmon/adt7410.c
+++ b/drivers/hwmon/adt7410.c
@@ -95,14 +95,12 @@ static const struct i2c_device_id adt7410_ids[] = {
 MODULE_DEVICE_TABLE(i2c, adt7410_ids);
 
 static struct i2c_driver adt7410_driver = {
-	.class		= I2C_CLASS_HWMON,
 	.driver = {
 		.name	= "adt7410",
 		.pm	= pm_sleep_ptr(&adt7x10_dev_pm_ops),
 	},
 	.probe		= adt7410_i2c_probe,
 	.id_table	= adt7410_ids,
-	.address_list	= I2C_ADDRS(0x48, 0x49, 0x4a, 0x4b),
 };
 module_i2c_driver(adt7410_driver);
 
diff --git a/drivers/hwmon/emc2305.c b/drivers/hwmon/emc2305.c
index 29f0e4945f1924..6ef733c0be1675 100644
--- a/drivers/hwmon/emc2305.c
+++ b/drivers/hwmon/emc2305.c
@@ -12,9 +12,6 @@
 #include <linux/platform_data/emc2305.h>
 #include <linux/thermal.h>
 
-static const unsigned short
-emc2305_normal_i2c[] = { 0x27, 0x2c, 0x2d, 0x2e, 0x2f, 0x4c, 0x4d, I2C_CLIENT_END };
-
 #define EMC2305_REG_DRIVE_FAIL_STATUS	0x27
 #define EMC2305_REG_VENDOR		0xfe
 #define EMC2305_FAN_MAX			0xff
@@ -611,14 +608,12 @@ static void emc2305_remove(struct i2c_client *client)
 }
 
 static struct i2c_driver emc2305_driver = {
-	.class  = I2C_CLASS_HWMON,
 	.driver = {
 		.name = "emc2305",
 	},
 	.probe = emc2305_probe,
 	.remove	  = emc2305_remove,
 	.id_table = emc2305_ids,
-	.address_list = emc2305_normal_i2c,
 };
 
 module_i2c_driver(emc2305_driver);

From a16ab6713e2e4993e6a7152e848aee65b038cd1c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Sat, 27 Jan 2024 14:24:07 +0100
Subject: [PATCH 259/707] pidfd: don't do_notify_pidfd() if
 !thread_group_empty()

do_notify_pidfd() makes no sense until the whole thread group exits, change
do_notify_parent() to check thread_group_empty().

This avoids the unnecessary do_notify_pidfd() when tsk is not a leader, or
it exits before other threads, or it has a ptraced EXIT_ZOMBIE sub-thread.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lore.kernel.org/r/20240127132407.GA29136@redhat.com
Reviewed-by: Tycho Andersen <tandersen@netflix.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 kernel/signal.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index c9c57d053ce4f6..9561a3962ca687 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2050,9 +2050,11 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
 
 	WARN_ON_ONCE(!tsk->ptrace &&
 	       (tsk->group_leader != tsk || !thread_group_empty(tsk)));
-
-	/* Wake up all pidfd waiters */
-	do_notify_pidfd(tsk);
+	/*
+	 * tsk is a group leader and has no threads, wake up the pidfd waiters.
+	 */
+	if (thread_group_empty(tsk))
+		do_notify_pidfd(tsk);
 
 	if (sig != SIGCHLD) {
 		/*

From e0bad07869f7bf341152b37df89d30b9d2cfd7b6 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Fri, 26 Jan 2024 21:01:05 -0500
Subject: [PATCH 260/707] fs/pipe: Convert to lockdep_cmp_fn

*_lock_nested() is fundamentally broken; lockdep needs to check lock
ordering, but we cannot device a total ordering on an unbounded number
of elements with only a few subclasses.

the replacement is to define lock ordering with a proper comparison
function.

fs/pipe.c was already doing everything correctly otherwise, nothing
much changes here.

Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Link: https://lore.kernel.org/r/20240127020111.487218-2-kent.overstreet@linux.dev
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/pipe.c | 81 +++++++++++++++++++++++++------------------------------
 1 file changed, 36 insertions(+), 45 deletions(-)

diff --git a/fs/pipe.c b/fs/pipe.c
index f1adbfe743d4a7..50c8a8596b5245 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -76,18 +76,20 @@ static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;
  * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
  */
 
-static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
+#define cmp_int(l, r)		((l > r) - (l < r))
+
+#ifdef CONFIG_PROVE_LOCKING
+static int pipe_lock_cmp_fn(const struct lockdep_map *a,
+			    const struct lockdep_map *b)
 {
-	if (pipe->files)
-		mutex_lock_nested(&pipe->mutex, subclass);
+	return cmp_int((unsigned long) a, (unsigned long) b);
 }
+#endif
 
 void pipe_lock(struct pipe_inode_info *pipe)
 {
-	/*
-	 * pipe_lock() nests non-pipe inode locks (for writing to a file)
-	 */
-	pipe_lock_nested(pipe, I_MUTEX_PARENT);
+	if (pipe->files)
+		mutex_lock(&pipe->mutex);
 }
 EXPORT_SYMBOL(pipe_lock);
 
@@ -98,28 +100,16 @@ void pipe_unlock(struct pipe_inode_info *pipe)
 }
 EXPORT_SYMBOL(pipe_unlock);
 
-static inline void __pipe_lock(struct pipe_inode_info *pipe)
-{
-	mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT);
-}
-
-static inline void __pipe_unlock(struct pipe_inode_info *pipe)
-{
-	mutex_unlock(&pipe->mutex);
-}
-
 void pipe_double_lock(struct pipe_inode_info *pipe1,
 		      struct pipe_inode_info *pipe2)
 {
 	BUG_ON(pipe1 == pipe2);
 
-	if (pipe1 < pipe2) {
-		pipe_lock_nested(pipe1, I_MUTEX_PARENT);
-		pipe_lock_nested(pipe2, I_MUTEX_CHILD);
-	} else {
-		pipe_lock_nested(pipe2, I_MUTEX_PARENT);
-		pipe_lock_nested(pipe1, I_MUTEX_CHILD);
-	}
+	if (pipe1 > pipe2)
+		swap(pipe1, pipe2);
+
+	pipe_lock(pipe1);
+	pipe_lock(pipe2);
 }
 
 static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
@@ -271,7 +261,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
 		return 0;
 
 	ret = 0;
-	__pipe_lock(pipe);
+	mutex_lock(&pipe->mutex);
 
 	/*
 	 * We only wake up writers if the pipe was full when we started
@@ -368,7 +358,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
 			ret = -EAGAIN;
 			break;
 		}
-		__pipe_unlock(pipe);
+		mutex_unlock(&pipe->mutex);
 
 		/*
 		 * We only get here if we didn't actually read anything.
@@ -400,13 +390,13 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
 		if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
 			return -ERESTARTSYS;
 
-		__pipe_lock(pipe);
+		mutex_lock(&pipe->mutex);
 		was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
 		wake_next_reader = true;
 	}
 	if (pipe_empty(pipe->head, pipe->tail))
 		wake_next_reader = false;
-	__pipe_unlock(pipe);
+	mutex_unlock(&pipe->mutex);
 
 	if (was_full)
 		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
@@ -462,7 +452,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
 	if (unlikely(total_len == 0))
 		return 0;
 
-	__pipe_lock(pipe);
+	mutex_lock(&pipe->mutex);
 
 	if (!pipe->readers) {
 		send_sig(SIGPIPE, current, 0);
@@ -582,19 +572,19 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
 		 * after waiting we need to re-check whether the pipe
 		 * become empty while we dropped the lock.
 		 */
-		__pipe_unlock(pipe);
+		mutex_unlock(&pipe->mutex);
 		if (was_empty)
 			wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 		wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
-		__pipe_lock(pipe);
+		mutex_lock(&pipe->mutex);
 		was_empty = pipe_empty(pipe->head, pipe->tail);
 		wake_next_writer = true;
 	}
 out:
 	if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
 		wake_next_writer = false;
-	__pipe_unlock(pipe);
+	mutex_unlock(&pipe->mutex);
 
 	/*
 	 * If we do do a wakeup event, we do a 'sync' wakeup, because we
@@ -629,7 +619,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 
 	switch (cmd) {
 	case FIONREAD:
-		__pipe_lock(pipe);
+		mutex_lock(&pipe->mutex);
 		count = 0;
 		head = pipe->head;
 		tail = pipe->tail;
@@ -639,16 +629,16 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			count += pipe->bufs[tail & mask].len;
 			tail++;
 		}
-		__pipe_unlock(pipe);
+		mutex_unlock(&pipe->mutex);
 
 		return put_user(count, (int __user *)arg);
 
 #ifdef CONFIG_WATCH_QUEUE
 	case IOC_WATCH_QUEUE_SET_SIZE: {
 		int ret;
-		__pipe_lock(pipe);
+		mutex_lock(&pipe->mutex);
 		ret = watch_queue_set_size(pipe, arg);
-		__pipe_unlock(pipe);
+		mutex_unlock(&pipe->mutex);
 		return ret;
 	}
 
@@ -734,7 +724,7 @@ pipe_release(struct inode *inode, struct file *file)
 {
 	struct pipe_inode_info *pipe = file->private_data;
 
-	__pipe_lock(pipe);
+	mutex_lock(&pipe->mutex);
 	if (file->f_mode & FMODE_READ)
 		pipe->readers--;
 	if (file->f_mode & FMODE_WRITE)
@@ -747,7 +737,7 @@ pipe_release(struct inode *inode, struct file *file)
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 	}
-	__pipe_unlock(pipe);
+	mutex_unlock(&pipe->mutex);
 
 	put_pipe_info(inode, pipe);
 	return 0;
@@ -759,7 +749,7 @@ pipe_fasync(int fd, struct file *filp, int on)
 	struct pipe_inode_info *pipe = filp->private_data;
 	int retval = 0;
 
-	__pipe_lock(pipe);
+	mutex_lock(&pipe->mutex);
 	if (filp->f_mode & FMODE_READ)
 		retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
 	if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
@@ -768,7 +758,7 @@ pipe_fasync(int fd, struct file *filp, int on)
 			/* this can happen only if on == T */
 			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
 	}
-	__pipe_unlock(pipe);
+	mutex_unlock(&pipe->mutex);
 	return retval;
 }
 
@@ -834,6 +824,7 @@ struct pipe_inode_info *alloc_pipe_info(void)
 		pipe->nr_accounted = pipe_bufs;
 		pipe->user = user;
 		mutex_init(&pipe->mutex);
+		lock_set_cmp_fn(&pipe->mutex, pipe_lock_cmp_fn, NULL);
 		return pipe;
 	}
 
@@ -1144,7 +1135,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
 	filp->private_data = pipe;
 	/* OK, we have a pipe and it's pinned down */
 
-	__pipe_lock(pipe);
+	mutex_lock(&pipe->mutex);
 
 	/* We can only do regular read/write on fifos */
 	stream_open(inode, filp);
@@ -1214,7 +1205,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
 	}
 
 	/* Ok! */
-	__pipe_unlock(pipe);
+	mutex_unlock(&pipe->mutex);
 	return 0;
 
 err_rd:
@@ -1230,7 +1221,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
 	goto err;
 
 err:
-	__pipe_unlock(pipe);
+	mutex_unlock(&pipe->mutex);
 
 	put_pipe_info(inode, pipe);
 	return ret;
@@ -1411,7 +1402,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
 	if (!pipe)
 		return -EBADF;
 
-	__pipe_lock(pipe);
+	mutex_lock(&pipe->mutex);
 
 	switch (cmd) {
 	case F_SETPIPE_SZ:
@@ -1425,7 +1416,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
 		break;
 	}
 
-	__pipe_unlock(pipe);
+	mutex_unlock(&pipe->mutex);
 	return ret;
 }
 

From e248feeb2519e4a00e44482f5045d1ae4ec7fa13 Mon Sep 17 00:00:00 2001
From: Zhipeng Lu <alexious@zju.edu.cn>
Date: Sun, 24 Dec 2023 16:20:33 +0800
Subject: [PATCH 261/707] SUNRPC: fix a memleak in gss_import_v2_context

The ctx->mech_used.data allocated by kmemdup is not freed in neither
gss_import_v2_context nor it only caller gss_krb5_import_sec_context,
which frees ctx on error.

Thus, this patch reform the last call of gss_import_v2_context to the
gss_krb5_import_ctx_v2, preventing the memleak while keepping the return
formation.

Fixes: 47d848077629 ("gss_krb5: handle new context format from gssd")
Signed-off-by: Zhipeng Lu <alexious@zju.edu.cn>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/auth_gss/gss_krb5_mech.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index 64cff717c3d9b3..3366505bc669a0 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -398,6 +398,7 @@ gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx,
 	u64 seq_send64;
 	int keylen;
 	u32 time32;
+	int ret;
 
 	p = simple_get_bytes(p, end, &ctx->flags, sizeof(ctx->flags));
 	if (IS_ERR(p))
@@ -450,8 +451,16 @@ gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx,
 	}
 	ctx->mech_used.len = gss_kerberos_mech.gm_oid.len;
 
-	return gss_krb5_import_ctx_v2(ctx, gfp_mask);
+	ret = gss_krb5_import_ctx_v2(ctx, gfp_mask);
+	if (ret) {
+		p = ERR_PTR(ret);
+		goto out_free;
+	}
 
+	return 0;
+
+out_free:
+	kfree(ctx->mech_used.data);
 out_err:
 	return PTR_ERR(p);
 }

From b66d75139458ec91c4e1c0f33b2c7f676df24604 Mon Sep 17 00:00:00 2001
From: Zhipeng Lu <alexious@zju.edu.cn>
Date: Tue, 2 Jan 2024 13:38:13 +0800
Subject: [PATCH 262/707] SUNRPC: fix some memleaks in gssx_dec_option_array

The creds and oa->data need to be freed in the error-handling paths after
their allocation. So this patch add these deallocations in the
corresponding paths.

Fixes: 1d658336b05f ("SUNRPC: Add RPC based upcall mechanism for RPCGSS auth")
Signed-off-by: Zhipeng Lu <alexious@zju.edu.cn>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/auth_gss/gss_rpc_xdr.c | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c
index d79f12c2550ac3..cb32ab9a839521 100644
--- a/net/sunrpc/auth_gss/gss_rpc_xdr.c
+++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c
@@ -250,8 +250,8 @@ static int gssx_dec_option_array(struct xdr_stream *xdr,
 
 	creds = kzalloc(sizeof(struct svc_cred), GFP_KERNEL);
 	if (!creds) {
-		kfree(oa->data);
-		return -ENOMEM;
+		err = -ENOMEM;
+		goto free_oa;
 	}
 
 	oa->data[0].option.data = CREDS_VALUE;
@@ -265,29 +265,40 @@ static int gssx_dec_option_array(struct xdr_stream *xdr,
 
 		/* option buffer */
 		p = xdr_inline_decode(xdr, 4);
-		if (unlikely(p == NULL))
-			return -ENOSPC;
+		if (unlikely(p == NULL)) {
+			err = -ENOSPC;
+			goto free_creds;
+		}
 
 		length = be32_to_cpup(p);
 		p = xdr_inline_decode(xdr, length);
-		if (unlikely(p == NULL))
-			return -ENOSPC;
+		if (unlikely(p == NULL)) {
+			err = -ENOSPC;
+			goto free_creds;
+		}
 
 		if (length == sizeof(CREDS_VALUE) &&
 		    memcmp(p, CREDS_VALUE, sizeof(CREDS_VALUE)) == 0) {
 			/* We have creds here. parse them */
 			err = gssx_dec_linux_creds(xdr, creds);
 			if (err)
-				return err;
+				goto free_creds;
 			oa->data[0].value.len = 1; /* presence */
 		} else {
 			/* consume uninteresting buffer */
 			err = gssx_dec_buffer(xdr, &dummy);
 			if (err)
-				return err;
+				goto free_creds;
 		}
 	}
 	return 0;
+
+free_creds:
+	kfree(creds);
+free_oa:
+	kfree(oa->data);
+	oa->data = NULL;
+	return err;
 }
 
 static int gssx_dec_status(struct xdr_stream *xdr,

From c7cb8c19bc73e6a6ee3dbabbc8299b8f469ee4e7 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 1 Jan 2024 11:37:45 -0500
Subject: [PATCH 263/707] SUNRPC: Use a static buffer for the checksum
 initialization vector

Allocating and zeroing a buffer during every call to
krb5_etm_checksum() is inefficient. Instead, set aside a static
buffer that is the maximum crypto block size, and use a portion
(or all) of that.

Reported-by: Markus Elfring <Markus.Elfring@web.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/auth_gss/gss_krb5_crypto.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index d2b02710ab0709..b2c1b683a88ee2 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -921,6 +921,8 @@ gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, u32 len,
  * Caller provides the truncation length of the output token (h) in
  * cksumout.len.
  *
+ * Note that for RPCSEC, the "initial cipher state" is always all zeroes.
+ *
  * Return values:
  *   %GSS_S_COMPLETE: Digest computed, @cksumout filled in
  *   %GSS_S_FAILURE: Call failed
@@ -931,22 +933,19 @@ u32 krb5_etm_checksum(struct crypto_sync_skcipher *cipher,
 		      int body_offset, struct xdr_netobj *cksumout)
 {
 	unsigned int ivsize = crypto_sync_skcipher_ivsize(cipher);
+	static const u8 iv[GSS_KRB5_MAX_BLOCKSIZE];
 	struct ahash_request *req;
 	struct scatterlist sg[1];
-	u8 *iv, *checksumdata;
 	int err = -ENOMEM;
+	u8 *checksumdata;
 
 	checksumdata = kmalloc(crypto_ahash_digestsize(tfm), GFP_KERNEL);
 	if (!checksumdata)
 		return GSS_S_FAILURE;
-	/* For RPCSEC, the "initial cipher state" is always all zeroes. */
-	iv = kzalloc(ivsize, GFP_KERNEL);
-	if (!iv)
-		goto out_free_mem;
 
 	req = ahash_request_alloc(tfm, GFP_KERNEL);
 	if (!req)
-		goto out_free_mem;
+		goto out_free_cksumdata;
 	ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
 	err = crypto_ahash_init(req);
 	if (err)
@@ -970,8 +969,7 @@ u32 krb5_etm_checksum(struct crypto_sync_skcipher *cipher,
 
 out_free_ahash:
 	ahash_request_free(req);
-out_free_mem:
-	kfree(iv);
+out_free_cksumdata:
 	kfree_sensitive(checksumdata);
 	return err ? GSS_S_FAILURE : GSS_S_COMPLETE;
 }

From 2df0218807f5846a235039410e44048bb676d0f4 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 15 Dec 2023 12:18:30 +1100
Subject: [PATCH 264/707] nfsd: Don't leave work of closing files to a work
 queue

The work of closing a file can have non-trivial cost.  Doing it in a
separate work queue thread means that cost isn't imposed on the nfsd
threads and an imbalance can be created.  This can result in files being
queued for the work queue more quickly that the work queue can process
them, resulting in unbounded growth of the queue and memory exhaustion.

To avoid this work imbalance that exhausts memory, this patch moves all
closing of files into the nfsd threads.  This means that when the work
imposes a cost, that cost appears where it would be expected - in the
work of the nfsd thread.  A subsequent patch will ensure the final
__fput() is called in the same (nfsd) thread which calls filp_close().

Files opened for NFSv3 are never explicitly closed by the client and are
kept open by the server in the "filecache", which responds to memory
pressure, is garbage collected even when there is no pressure, and
sometimes closes files when there is particular need such as for rename.
These files currently have filp_close() called in a dedicated work
queue, so their __fput() can have no effect on nfsd threads.

This patch discards the work queue and instead has each nfsd thread call
flip_close() on as many as 8 files from the filecache each time it acts
on a client request (or finds there are no pending client requests).  If
there are more to be closed, more threads are woken.  This spreads the
work of __fput() over multiple threads and imposes any cost on those
threads.

The number 8 is somewhat arbitrary.  It needs to be greater than 1 to
ensure that files are closed more quickly than they can be added to the
cache.  It needs to be small enough to limit the per-request delays that
will be imposed on clients when all threads are busy closing files.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/filecache.c | 67 +++++++++++++++++++++------------------------
 fs/nfsd/filecache.h |  1 +
 fs/nfsd/nfssvc.c    |  2 ++
 3 files changed, 34 insertions(+), 36 deletions(-)

diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 9cb7f0c33df587..f8b100bca6e4da 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -61,13 +61,10 @@ static DEFINE_PER_CPU(unsigned long, nfsd_file_total_age);
 static DEFINE_PER_CPU(unsigned long, nfsd_file_evictions);
 
 struct nfsd_fcache_disposal {
-	struct work_struct work;
 	spinlock_t lock;
 	struct list_head freeme;
 };
 
-static struct workqueue_struct *nfsd_filecache_wq __read_mostly;
-
 static struct kmem_cache		*nfsd_file_slab;
 static struct kmem_cache		*nfsd_file_mark_slab;
 static struct list_lru			nfsd_file_lru;
@@ -421,7 +418,37 @@ nfsd_file_dispose_list_delayed(struct list_head *dispose)
 		spin_lock(&l->lock);
 		list_move_tail(&nf->nf_lru, &l->freeme);
 		spin_unlock(&l->lock);
-		queue_work(nfsd_filecache_wq, &l->work);
+		svc_wake_up(nn->nfsd_serv);
+	}
+}
+
+/**
+ * nfsd_file_net_dispose - deal with nfsd_files waiting to be disposed.
+ * @nn: nfsd_net in which to find files to be disposed.
+ *
+ * When files held open for nfsv3 are removed from the filecache, whether
+ * due to memory pressure or garbage collection, they are queued to
+ * a per-net-ns queue.  This function completes the disposal, either
+ * directly or by waking another nfsd thread to help with the work.
+ */
+void nfsd_file_net_dispose(struct nfsd_net *nn)
+{
+	struct nfsd_fcache_disposal *l = nn->fcache_disposal;
+
+	if (!list_empty(&l->freeme)) {
+		LIST_HEAD(dispose);
+		int i;
+
+		spin_lock(&l->lock);
+		for (i = 0; i < 8 && !list_empty(&l->freeme); i++)
+			list_move(l->freeme.next, &dispose);
+		spin_unlock(&l->lock);
+		if (!list_empty(&l->freeme))
+			/* Wake up another thread to share the work
+			 * *before* doing any actual disposing.
+			 */
+			svc_wake_up(nn->nfsd_serv);
+		nfsd_file_dispose_list(&dispose);
 	}
 }
 
@@ -634,27 +661,6 @@ nfsd_file_close_inode_sync(struct inode *inode)
 	flush_delayed_fput();
 }
 
-/**
- * nfsd_file_delayed_close - close unused nfsd_files
- * @work: dummy
- *
- * Scrape the freeme list for this nfsd_net, and then dispose of them
- * all.
- */
-static void
-nfsd_file_delayed_close(struct work_struct *work)
-{
-	LIST_HEAD(head);
-	struct nfsd_fcache_disposal *l = container_of(work,
-			struct nfsd_fcache_disposal, work);
-
-	spin_lock(&l->lock);
-	list_splice_init(&l->freeme, &head);
-	spin_unlock(&l->lock);
-
-	nfsd_file_dispose_list(&head);
-}
-
 static int
 nfsd_file_lease_notifier_call(struct notifier_block *nb, unsigned long arg,
 			    void *data)
@@ -717,10 +723,6 @@ nfsd_file_cache_init(void)
 		return ret;
 
 	ret = -ENOMEM;
-	nfsd_filecache_wq = alloc_workqueue("nfsd_filecache", WQ_UNBOUND, 0);
-	if (!nfsd_filecache_wq)
-		goto out;
-
 	nfsd_file_slab = kmem_cache_create("nfsd_file",
 				sizeof(struct nfsd_file), 0, 0, NULL);
 	if (!nfsd_file_slab) {
@@ -735,7 +737,6 @@ nfsd_file_cache_init(void)
 		goto out_err;
 	}
 
-
 	ret = list_lru_init(&nfsd_file_lru);
 	if (ret) {
 		pr_err("nfsd: failed to init nfsd_file_lru: %d\n", ret);
@@ -785,8 +786,6 @@ nfsd_file_cache_init(void)
 	nfsd_file_slab = NULL;
 	kmem_cache_destroy(nfsd_file_mark_slab);
 	nfsd_file_mark_slab = NULL;
-	destroy_workqueue(nfsd_filecache_wq);
-	nfsd_filecache_wq = NULL;
 	rhltable_destroy(&nfsd_file_rhltable);
 	goto out;
 }
@@ -832,7 +831,6 @@ nfsd_alloc_fcache_disposal(void)
 	l = kmalloc(sizeof(*l), GFP_KERNEL);
 	if (!l)
 		return NULL;
-	INIT_WORK(&l->work, nfsd_file_delayed_close);
 	spin_lock_init(&l->lock);
 	INIT_LIST_HEAD(&l->freeme);
 	return l;
@@ -841,7 +839,6 @@ nfsd_alloc_fcache_disposal(void)
 static void
 nfsd_free_fcache_disposal(struct nfsd_fcache_disposal *l)
 {
-	cancel_work_sync(&l->work);
 	nfsd_file_dispose_list(&l->freeme);
 	kfree(l);
 }
@@ -910,8 +907,6 @@ nfsd_file_cache_shutdown(void)
 	fsnotify_wait_marks_destroyed();
 	kmem_cache_destroy(nfsd_file_mark_slab);
 	nfsd_file_mark_slab = NULL;
-	destroy_workqueue(nfsd_filecache_wq);
-	nfsd_filecache_wq = NULL;
 	rhltable_destroy(&nfsd_file_rhltable);
 
 	for_each_possible_cpu(i) {
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index e54165a3224f0b..c61884def906d0 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -56,6 +56,7 @@ void nfsd_file_cache_shutdown_net(struct net *net);
 void nfsd_file_put(struct nfsd_file *nf);
 struct nfsd_file *nfsd_file_get(struct nfsd_file *nf);
 void nfsd_file_close_inode_sync(struct inode *inode);
+void nfsd_file_net_dispose(struct nfsd_net *nn);
 bool nfsd_file_is_cached(struct inode *inode);
 __be32 nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		  unsigned int may_flags, struct nfsd_file **nfp);
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index a667802e08e75f..9a894c3511baf3 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -941,6 +941,8 @@ nfsd(void *vrqstp)
 		rqstp->rq_server->sv_maxconn = nn->max_connections;
 
 		svc_recv(rqstp);
+
+		nfsd_file_net_dispose(nn);
 	}
 
 	atomic_dec(&nfsdstats.th_cnt);

From fc6bfd4bace624be26652cbd5efa1deb2d4c3e0c Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 15 Dec 2023 12:18:31 +1100
Subject: [PATCH 265/707] nfsd: use __fput_sync() to avoid delayed closing of
 files.

Calling fput() directly or though filp_close() from a kernel thread like
nfsd causes the final __fput() (if necessary) to be called from a
workqueue.  This means that nfsd is not forced to wait for any work to
complete.  If the ->release or ->destroy_inode function is slow for any
reason, this can result in nfsd closing files more quickly than the
workqueue can complete the close and the queue of pending closes can
grow without bounces (30 million has been seen at one customer site,
though this was in part due to a slowness in xfs which has since been
fixed).

nfsd does not need this.  It is quite appropriate and safe for nfsd to
do its own close work.  There is no reason that close should ever wait
for nfsd, so no deadlock can occur.

It should be safe and sensible to change all fput() calls to
__fput_sync().  However in the interests of caution this patch only
changes two - the two that can be most directly affected by client
behaviour and could occur at high frequency.

- the fput() implicitly in flip_close() is changed to __fput_sync()
  by calling get_file() first to ensure filp_close() doesn't do
  the final fput() itself.  If is where files opened for IO are closed.

- the fput() in nfsd_read() is also changed.  This is where directories
  opened for readdir are closed.

This ensure that minimal fput work is queued to the workqueue.

This removes the need for the flush_delayed_fput() call in
nfsd_file_close_inode_sync()

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/filecache.c |  3 +--
 fs/nfsd/vfs.c       | 42 +++++++++++++++++++++++++++++++++++++-----
 fs/nfsd/vfs.h       |  2 ++
 3 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index f8b100bca6e4da..8d9f7b07e35b39 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -280,7 +280,7 @@ nfsd_file_free(struct nfsd_file *nf)
 		nfsd_file_mark_put(nf->nf_mark);
 	if (nf->nf_file) {
 		nfsd_file_check_write_error(nf);
-		filp_close(nf->nf_file, NULL);
+		nfsd_filp_close(nf->nf_file);
 	}
 
 	/*
@@ -658,7 +658,6 @@ nfsd_file_close_inode_sync(struct inode *inode)
 		list_del_init(&nf->nf_lru);
 		nfsd_file_free(nf);
 	}
-	flush_delayed_fput();
 }
 
 static int
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index b7c7a9273ea01d..f57749cd6f0b1a 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1906,10 +1906,10 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
 	fh_drop_write(ffhp);
 
 	/*
-	 * If the target dentry has cached open files, then we need to try to
-	 * close them prior to doing the rename. Flushing delayed fput
-	 * shouldn't be done with locks held however, so we delay it until this
-	 * point and then reattempt the whole shebang.
+	 * If the target dentry has cached open files, then we need to
+	 * try to close them prior to doing the rename.  Final fput
+	 * shouldn't be done with locks held however, so we delay it
+	 * until this point and then reattempt the whole shebang.
 	 */
 	if (close_cached) {
 		close_cached = false;
@@ -2177,11 +2177,43 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
 	if (err == nfserr_eof || err == nfserr_toosmall)
 		err = nfs_ok; /* can still be found in ->err */
 out_close:
-	fput(file);
+	nfsd_filp_close(file);
 out:
 	return err;
 }
 
+/**
+ * nfsd_filp_close: close a file synchronously
+ * @fp: the file to close
+ *
+ * nfsd_filp_close() is similar in behaviour to filp_close().
+ * The difference is that if this is the final close on the
+ * file, the that finalisation happens immediately, rather then
+ * being handed over to a work_queue, as it the case for
+ * filp_close().
+ * When a user-space process closes a file (even when using
+ * filp_close() the finalisation happens before returning to
+ * userspace, so it is effectively synchronous.  When a kernel thread
+ * uses file_close(), on the other hand, the handling is completely
+ * asynchronous.  This means that any cost imposed by that finalisation
+ * is not imposed on the nfsd thread, and nfsd could potentually
+ * close files more quickly than the work queue finalises the close,
+ * which would lead to unbounded growth in the queue.
+ *
+ * In some contexts is it not safe to synchronously wait for
+ * close finalisation (see comment for __fput_sync()), but nfsd
+ * does not match those contexts.  In partcilarly it does not, at the
+ * time that this function is called, hold and locks and no finalisation
+ * of any file, socket, or device driver would have any cause to wait
+ * for nfsd to make progress.
+ */
+void nfsd_filp_close(struct file *fp)
+{
+	get_file(fp);
+	filp_close(fp, NULL);
+	__fput_sync(fp);
+}
+
 /*
  * Get file system stats
  * N.B. After this call fhp needs an fh_put
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 702fbc4483bf16..1efa4e8dfb0349 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -148,6 +148,8 @@ __be32		nfsd_statfs(struct svc_rqst *, struct svc_fh *,
 __be32		nfsd_permission(struct svc_rqst *, struct svc_export *,
 				struct dentry *, int);
 
+void		nfsd_filp_close(struct file *fp);
+
 static inline int fh_want_write(struct svc_fh *fh)
 {
 	int ret;

From 26efb5e8c0b15f4f5da1046a3e3ea1e51c9f71ee Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 17 Jan 2024 14:48:04 +1100
Subject: [PATCH 266/707] nfsd: drop st_mutex and rp_mutex before calling
 move_to_close_lru()

move_to_close_lru() is currently called with ->st_mutex and .rp_mutex held.
This can lead to a deadlock as move_to_close_lru() waits for sc_count to
drop to 2, and some threads holding a reference might be waiting for either
mutex.  These references will never be dropped so sc_count will never
reach 2.

There can be no harm in dropping ->st_mutex to before
move_to_close_lru() because the only place that takes the mutex is
nfsd4_lock_ol_stateid(), and it quickly aborts if sc_type is
NFS4_CLOSED_STID, which it will be before move_to_close_lru() is called.

Similarly dropping .rp_mutex is safe after the state is closed and so
no longer usable.  Another way to look at this is that nothing
significant happens between when nfsd4_close() now calls
nfsd4_cstate_clear_replay(), and where nfsd4_proc_compound calls
nfsd4_cstate_clear_replay() a little later.

See also
 https://lore.kernel.org/lkml/4dd1fe21e11344e5969bb112e954affb@jd.com/T/
where this problem was raised but not successfully resolved.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4state.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 6dc6340e28529d..d7d561b29fb0d9 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -6991,7 +6991,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
 	return status;
 }
 
-static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
+static bool nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
 {
 	struct nfs4_client *clp = s->st_stid.sc_client;
 	bool unhashed;
@@ -7008,11 +7008,11 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
 		list_for_each_entry(stp, &reaplist, st_locks)
 			nfs4_free_cpntf_statelist(clp->net, &stp->st_stid);
 		free_ol_stateid_reaplist(&reaplist);
+		return false;
 	} else {
 		spin_unlock(&clp->cl_lock);
 		free_ol_stateid_reaplist(&reaplist);
-		if (unhashed)
-			move_to_close_lru(s, clp->net);
+		return unhashed;
 	}
 }
 
@@ -7028,6 +7028,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	struct nfs4_ol_stateid *stp;
 	struct net *net = SVC_NET(rqstp);
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	bool need_move_to_close_list;
 
 	dprintk("NFSD: nfsd4_close on file %pd\n", 
 			cstate->current_fh.fh_dentry);
@@ -7050,8 +7051,17 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	 */
 	nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid);
 
-	nfsd4_close_open_stateid(stp);
+	need_move_to_close_list = nfsd4_close_open_stateid(stp);
 	mutex_unlock(&stp->st_mutex);
+	if (need_move_to_close_list) {
+		/* Drop the replay mutex early as move_to_close_lru()
+		 * can wait for other threads which hold that mutex.
+		 * This call is idempotent, so that fact that it will
+		 * be called twice is harmless.
+		 */
+		nfsd4_cstate_clear_replay(cstate);
+		move_to_close_lru(stp, net);
+	}
 
 	/* v4.1+ suggests that we send a special stateid in here, since the
 	 * clients should just ignore this anyway. Since this is not useful

From 135d5626e96952a2d09abdea5622e689aff2c4a6 Mon Sep 17 00:00:00 2001
From: Jorge Mora <jmora1300@gmail.com>
Date: Thu, 25 Jan 2024 07:46:54 -0700
Subject: [PATCH 267/707] NFSD: fix nfsd4_listxattr_validate_cookie

If LISTXATTRS is sent with a correct cookie but a small maxcount,
this could lead function nfsd4_listxattr_validate_cookie to
return NFS4ERR_BAD_COOKIE. If maxcount = 20, then second check
on function gives RHS = 3 thus any cookie larger than 3 returns
NFS4ERR_BAD_COOKIE.

There is no need to validate the cookie on the return XDR buffer
since attribute referenced by cookie will be the first in the
return buffer.

Fixes: 23e50fe3a5e6 ("nfsd: implement the xattr functions and en/decode logic")
Signed-off-by: Jorge Mora <mora@netapp.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4xdr.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index c719c475a068ef..f0be0d6fe63fd2 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -5386,16 +5386,11 @@ nfsd4_listxattr_validate_cookie(struct nfsd4_listxattrs *listxattrs,
 
 	/*
 	 * If the cookie is larger than the maximum number we can fit
-	 * in either the buffer we just got back from vfs_listxattr, or,
-	 * XDR-encoded, in the return buffer, it's invalid.
+	 * in the buffer we just got back from vfs_listxattr, it's invalid.
 	 */
 	if (cookie > (listxattrs->lsxa_len) / (XATTR_USER_PREFIX_LEN + 2))
 		return nfserr_badcookie;
 
-	if (cookie > (listxattrs->lsxa_maxcount /
-		      (XDR_QUADLEN(XATTR_USER_PREFIX_LEN + 2) + 4)))
-		return nfserr_badcookie;
-
 	*offsetp = (u32)cookie;
 	return 0;
 }

From 42ec6645cb8f43a4062f8c62bbd13a43fa73351b Mon Sep 17 00:00:00 2001
From: Jorge Mora <jmora1300@gmail.com>
Date: Thu, 25 Jan 2024 07:46:12 -0700
Subject: [PATCH 268/707] NFSD: change LISTXATTRS cookie encoding to big-endian

Function nfsd4_listxattr_validate_cookie() expects the cookie
as an offset to the list thus it needs to be encoded in big-endian.

Fixes: 23e50fe3a5e6 ("nfsd: implement the xattr functions and en/decode logic")
Signed-off-by: Jorge Mora <mora@netapp.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4xdr.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index f0be0d6fe63fd2..5649076df4b4f7 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -5407,6 +5407,7 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr,
 	u64 cookie;
 	char *sp;
 	__be32 status, tmp;
+	__be64 wire_cookie;
 	__be32 *p;
 	u32 nuser;
 
@@ -5498,7 +5499,8 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr,
 
 	cookie = offset + count;
 
-	write_bytes_to_xdr_buf(xdr->buf, cookie_offset, &cookie, 8);
+	wire_cookie = cpu_to_be64(cookie);
+	write_bytes_to_xdr_buf(xdr->buf, cookie_offset, &wire_cookie, 8);
 	tmp = cpu_to_be32(count);
 	write_bytes_to_xdr_buf(xdr->buf, count_offset, &tmp, 4);
 out:

From 1d6bbeca66aaff9a1698f7fbee243cb379012364 Mon Sep 17 00:00:00 2001
From: Jorge Mora <jmora1300@gmail.com>
Date: Thu, 25 Jan 2024 07:45:28 -0700
Subject: [PATCH 269/707] NFSD: fix LISTXATTRS returning a short list with
 eof=TRUE

If the XDR buffer is not large enough to fit all attributes
and the remaining bytes left in the XDR buffer (xdrleft) is
equal to the number of bytes for the current attribute, then
the loop will prematurely exit without setting eof to FALSE.
Also in this case, adding the eof flag to the buffer will
make the reply 4 bytes larger than lsxa_maxcount.

Need to check if there are enough bytes to fit not only the
next attribute name but also the eof as well.

Fixes: 23e50fe3a5e6 ("nfsd: implement the xattr functions and en/decode logic")
Signed-off-by: Jorge Mora <mora@netapp.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4xdr.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 5649076df4b4f7..840ecd7eaf0713 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -5447,7 +5447,8 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr,
 
 		slen -= XATTR_USER_PREFIX_LEN;
 		xdrlen = 4 + ((slen + 3) & ~3);
-		if (xdrlen > xdrleft) {
+		/* Check if both entry and eof can fit in the XDR buffer */
+		if (xdrlen + XDR_UNIT > xdrleft) {
 			if (count == 0) {
 				/*
 				 * Can't even fit the first attribute name.

From 3713a52d5a7df24ab62ba6d9a0a72b1dae473a6e Mon Sep 17 00:00:00 2001
From: Jorge Mora <jmora1300@gmail.com>
Date: Thu, 25 Jan 2024 07:42:23 -0700
Subject: [PATCH 270/707] NFSD: fix LISTXATTRS returning more bytes than
 maxcount

The maxcount is the maximum number of bytes for the LISTXATTRS4resok
result. This includes the cookie and the count for the name array,
thus subtract 12 bytes from the maxcount: 8 (cookie) + 4 (array count)
when filling up the name array.

Fixes: 23e50fe3a5e6 ("nfsd: implement the xattr functions and en/decode logic")
Signed-off-by: Jorge Mora <mora@netapp.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4xdr.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 840ecd7eaf0713..e3f761cd5ee78d 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -5423,7 +5423,7 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr,
 	 */
 	cookie_offset = xdr->buf->len;
 	count_offset = cookie_offset + 8;
-	p = xdr_reserve_space(xdr, 12);
+	p = xdr_reserve_space(xdr, XDR_UNIT * 3);
 	if (!p) {
 		status = nfserr_resource;
 		goto out;
@@ -5434,7 +5434,8 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr,
 	sp = listxattrs->lsxa_buf;
 	nuser = 0;
 
-	xdrleft = listxattrs->lsxa_maxcount;
+	/* Bytes left is maxcount - 8 (cookie) - 4 (array count) */
+	xdrleft = listxattrs->lsxa_maxcount - XDR_UNIT * 3;
 
 	while (left > 0 && xdrleft > 0) {
 		slen = strlen(sp);

From c1365ef33ad700b0a1fcdc65110e6c94d274f348 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 26 Jan 2024 10:39:40 -0500
Subject: [PATCH 271/707] sunrpc: don't change ->sv_stats if it doesn't exist

We check for the existence of ->sv_stats elsewhere except in the core
processing code.  It appears that only nfsd actual exports these values
anywhere, everybody else just has a write only copy of sv_stats in their
svc_program.  Add a check for ->sv_stats before every adjustment to
allow us to eliminate the stats struct from all the users who don't
report the stats.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/svc.c | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index f60c93e5a25d69..d2e6f3d5921801 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1375,7 +1375,8 @@ svc_process_common(struct svc_rqst *rqstp)
 		goto err_bad_proc;
 
 	/* Syntactic check complete */
-	serv->sv_stats->rpccnt++;
+	if (serv->sv_stats)
+		serv->sv_stats->rpccnt++;
 	trace_svc_process(rqstp, progp->pg_name);
 
 	aoffset = xdr_stream_pos(xdr);
@@ -1427,7 +1428,8 @@ svc_process_common(struct svc_rqst *rqstp)
 	goto close_xprt;
 
 err_bad_rpc:
-	serv->sv_stats->rpcbadfmt++;
+	if (serv->sv_stats)
+		serv->sv_stats->rpcbadfmt++;
 	xdr_stream_encode_u32(xdr, RPC_MSG_DENIED);
 	xdr_stream_encode_u32(xdr, RPC_MISMATCH);
 	/* Only RPCv2 supported */
@@ -1438,7 +1440,8 @@ svc_process_common(struct svc_rqst *rqstp)
 err_bad_auth:
 	dprintk("svc: authentication failed (%d)\n",
 		be32_to_cpu(rqstp->rq_auth_stat));
-	serv->sv_stats->rpcbadauth++;
+	if (serv->sv_stats)
+		serv->sv_stats->rpcbadauth++;
 	/* Restore write pointer to location of reply status: */
 	xdr_truncate_encode(xdr, XDR_UNIT * 2);
 	xdr_stream_encode_u32(xdr, RPC_MSG_DENIED);
@@ -1448,7 +1451,8 @@ svc_process_common(struct svc_rqst *rqstp)
 
 err_bad_prog:
 	dprintk("svc: unknown program %d\n", rqstp->rq_prog);
-	serv->sv_stats->rpcbadfmt++;
+	if (serv->sv_stats)
+		serv->sv_stats->rpcbadfmt++;
 	*rqstp->rq_accept_statp = rpc_prog_unavail;
 	goto sendit;
 
@@ -1456,7 +1460,8 @@ svc_process_common(struct svc_rqst *rqstp)
 	svc_printk(rqstp, "unknown version (%d for prog %d, %s)\n",
 		       rqstp->rq_vers, rqstp->rq_prog, progp->pg_name);
 
-	serv->sv_stats->rpcbadfmt++;
+	if (serv->sv_stats)
+		serv->sv_stats->rpcbadfmt++;
 	*rqstp->rq_accept_statp = rpc_prog_mismatch;
 
 	/*
@@ -1470,19 +1475,22 @@ svc_process_common(struct svc_rqst *rqstp)
 err_bad_proc:
 	svc_printk(rqstp, "unknown procedure (%d)\n", rqstp->rq_proc);
 
-	serv->sv_stats->rpcbadfmt++;
+	if (serv->sv_stats)
+		serv->sv_stats->rpcbadfmt++;
 	*rqstp->rq_accept_statp = rpc_proc_unavail;
 	goto sendit;
 
 err_garbage_args:
 	svc_printk(rqstp, "failed to decode RPC header\n");
 
-	serv->sv_stats->rpcbadfmt++;
+	if (serv->sv_stats)
+		serv->sv_stats->rpcbadfmt++;
 	*rqstp->rq_accept_statp = rpc_garbage_args;
 	goto sendit;
 
 err_system_err:
-	serv->sv_stats->rpcbadfmt++;
+	if (serv->sv_stats)
+		serv->sv_stats->rpcbadfmt++;
 	*rqstp->rq_accept_statp = rpc_system_err;
 	goto sendit;
 }
@@ -1534,7 +1542,8 @@ void svc_process(struct svc_rqst *rqstp)
 out_baddir:
 	svc_printk(rqstp, "bad direction 0x%08x, dropping request\n",
 		   be32_to_cpu(*p));
-	rqstp->rq_server->sv_stats->rpcbadfmt++;
+	if (rqstp->rq_server->sv_stats)
+		rqstp->rq_server->sv_stats->rpcbadfmt++;
 out_drop:
 	svc_drop(rqstp);
 }

From 93af2fc6e6589e225e27e96e96604fb1352c8366 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 26 Jan 2024 10:39:41 -0500
Subject: [PATCH 272/707] nfsd: stop setting ->pg_stats for unused stats

A lot of places are setting a blank svc_stats in ->pg_stats and never
utilizing these stats.  Remove all of these extra structs as we're not
reporting these stats anywhere.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/lockd/svc.c    | 3 ---
 fs/nfs/callback.c | 3 ---
 fs/nfsd/nfssvc.c  | 5 -----
 3 files changed, 11 deletions(-)

diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index ce5862482097a1..ab8042a5b895bc 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -710,8 +710,6 @@ static const struct svc_version *nlmsvc_version[] = {
 #endif
 };
 
-static struct svc_stat		nlmsvc_stats;
-
 #define NLM_NRVERS	ARRAY_SIZE(nlmsvc_version)
 static struct svc_program	nlmsvc_program = {
 	.pg_prog		= NLM_PROGRAM,		/* program number */
@@ -719,7 +717,6 @@ static struct svc_program	nlmsvc_program = {
 	.pg_vers		= nlmsvc_version,	/* version table */
 	.pg_name		= "lockd",		/* service name */
 	.pg_class		= "nfsd",		/* share authentication with nfsd */
-	.pg_stats		= &nlmsvc_stats,	/* stats table */
 	.pg_authenticate	= &lockd_authenticate,	/* export authentication */
 	.pg_init_request	= svc_generic_init_request,
 	.pg_rpcbind_set		= svc_generic_rpcbind_set,
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 760d27dd7225e9..8adfcd4c8c1a0a 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -356,15 +356,12 @@ static const struct svc_version *nfs4_callback_version[] = {
 	[4] = &nfs4_callback_version4,
 };
 
-static struct svc_stat nfs4_callback_stats;
-
 static struct svc_program nfs4_callback_program = {
 	.pg_prog = NFS4_CALLBACK,			/* RPC service number */
 	.pg_nvers = ARRAY_SIZE(nfs4_callback_version),	/* Number of entries */
 	.pg_vers = nfs4_callback_version,		/* version table */
 	.pg_name = "NFSv4 callback",			/* service name */
 	.pg_class = "nfs",				/* authentication class */
-	.pg_stats = &nfs4_callback_stats,
 	.pg_authenticate = nfs_callback_authenticate,
 	.pg_init_request = svc_generic_init_request,
 	.pg_rpcbind_set	= svc_generic_rpcbind_set,
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 9a894c3511baf3..a0b117107e8605 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -80,7 +80,6 @@ unsigned long	nfsd_drc_max_mem;
 unsigned long	nfsd_drc_mem_used;
 
 #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
-static struct svc_stat	nfsd_acl_svcstats;
 static const struct svc_version *nfsd_acl_version[] = {
 # if defined(CONFIG_NFSD_V2_ACL)
 	[2] = &nfsd_acl_version2,
@@ -99,15 +98,11 @@ static struct svc_program	nfsd_acl_program = {
 	.pg_vers		= nfsd_acl_version,
 	.pg_name		= "nfsacl",
 	.pg_class		= "nfsd",
-	.pg_stats		= &nfsd_acl_svcstats,
 	.pg_authenticate	= &svc_set_client,
 	.pg_init_request	= nfsd_acl_init_request,
 	.pg_rpcbind_set		= nfsd_acl_rpcbind_set,
 };
 
-static struct svc_stat	nfsd_acl_svcstats = {
-	.program	= &nfsd_acl_program,
-};
 #endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */
 
 static const struct svc_version *nfsd_version[] = {

From 6b32f5f1e4c2e77d3e8a811cc40a2b5d8d11dd2b Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 26 Jan 2024 10:39:42 -0500
Subject: [PATCH 273/707] sunrpc: pass in the sv_stats struct through
 svc_create_pooled

Since only one service actually reports the rpc stats there's not much
of a reason to have a pointer to it in the svc_program struct.  Adjust
the svc_create_pooled function to take the sv_stats as an argument and
pass the struct through there as desired instead of getting it from the
svc_program->pg_stats.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfssvc.c           |  3 ++-
 include/linux/sunrpc/svc.h |  4 +++-
 net/sunrpc/svc.c           | 12 +++++++-----
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index a0b117107e8605..d640f893021a71 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -661,7 +661,8 @@ int nfsd_create_serv(struct net *net)
 	if (nfsd_max_blksize == 0)
 		nfsd_max_blksize = nfsd_get_default_max_blksize();
 	nfsd_reset_versions(nn);
-	serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, nfsd);
+	serv = svc_create_pooled(&nfsd_program, &nfsd_svcstats,
+				 nfsd_max_blksize, nfsd);
 	if (serv == NULL)
 		return -ENOMEM;
 
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 67cf1c9efd809b..91a653eb3a5073 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -411,7 +411,9 @@ bool		   svc_rqst_replace_page(struct svc_rqst *rqstp,
 void		   svc_rqst_release_pages(struct svc_rqst *rqstp);
 void		   svc_rqst_free(struct svc_rqst *);
 void		   svc_exit_thread(struct svc_rqst *);
-struct svc_serv *  svc_create_pooled(struct svc_program *, unsigned int,
+struct svc_serv *  svc_create_pooled(struct svc_program *prog,
+				     struct svc_stat *stats,
+				     unsigned int bufsize,
 				     int (*threadfn)(void *data));
 int		   svc_set_num_threads(struct svc_serv *, struct svc_pool *, int);
 int		   svc_pool_stats_open(struct svc_info *si, struct file *file);
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index d2e6f3d5921801..9bd8a868c1a70a 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -451,8 +451,8 @@ __svc_init_bc(struct svc_serv *serv)
  * Create an RPC service
  */
 static struct svc_serv *
-__svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
-	     int (*threadfn)(void *data))
+__svc_create(struct svc_program *prog, struct svc_stat *stats,
+	     unsigned int bufsize, int npools, int (*threadfn)(void *data))
 {
 	struct svc_serv	*serv;
 	unsigned int vers;
@@ -463,7 +463,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
 		return NULL;
 	serv->sv_name      = prog->pg_name;
 	serv->sv_program   = prog;
-	serv->sv_stats     = prog->pg_stats;
+	serv->sv_stats     = stats;
 	if (bufsize > RPCSVC_MAXPAYLOAD)
 		bufsize = RPCSVC_MAXPAYLOAD;
 	serv->sv_max_payload = bufsize? bufsize : 4096;
@@ -529,26 +529,28 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
 struct svc_serv *svc_create(struct svc_program *prog, unsigned int bufsize,
 			    int (*threadfn)(void *data))
 {
-	return __svc_create(prog, bufsize, 1, threadfn);
+	return __svc_create(prog, NULL, bufsize, 1, threadfn);
 }
 EXPORT_SYMBOL_GPL(svc_create);
 
 /**
  * svc_create_pooled - Create an RPC service with pooled threads
  * @prog: the RPC program the new service will handle
+ * @stats: the stats struct if desired
  * @bufsize: maximum message size for @prog
  * @threadfn: a function to service RPC requests for @prog
  *
  * Returns an instantiated struct svc_serv object or NULL.
  */
 struct svc_serv *svc_create_pooled(struct svc_program *prog,
+				   struct svc_stat *stats,
 				   unsigned int bufsize,
 				   int (*threadfn)(void *data))
 {
 	struct svc_serv *serv;
 	unsigned int npools = svc_pool_map_get();
 
-	serv = __svc_create(prog, bufsize, npools, threadfn);
+	serv = __svc_create(prog, stats, bufsize, npools, threadfn);
 	if (!serv)
 		goto out_err;
 	return serv;

From 2896caafbc9efc3188ee0e38440efd29130fdd94 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 26 Jan 2024 10:39:43 -0500
Subject: [PATCH 274/707] sunrpc: remove ->pg_stats from svc_program

Now that this isn't used anywhere, remove it.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfssvc.c           | 1 -
 include/linux/sunrpc/svc.h | 1 -
 2 files changed, 2 deletions(-)

diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index d640f893021a71..d98a6abad99010 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -127,7 +127,6 @@ struct svc_program		nfsd_program = {
 	.pg_vers		= nfsd_version,		/* version table */
 	.pg_name		= "nfsd",		/* program name */
 	.pg_class		= "nfsd",		/* authentication class */
-	.pg_stats		= &nfsd_svcstats,	/* version table */
 	.pg_authenticate	= &svc_set_client,	/* export authentication */
 	.pg_init_request	= nfsd_init_request,
 	.pg_rpcbind_set		= nfsd_rpcbind_set,
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 91a653eb3a5073..23617da0e565e7 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -339,7 +339,6 @@ struct svc_program {
 	const struct svc_version **pg_vers;	/* version array */
 	char *			pg_name;	/* service name */
 	char *			pg_class;	/* class name: services sharing authentication */
-	struct svc_stat *	pg_stats;	/* rpc statistics */
 	enum svc_auth_status	(*pg_authenticate)(struct svc_rqst *rqstp);
 	__be32			(*pg_init_request)(struct svc_rqst *,
 						   const struct svc_program *,

From b3b72309149a69c7e16911fbbf77dde058f75491 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 26 Jan 2024 10:39:44 -0500
Subject: [PATCH 275/707] sunrpc: use the struct net as the svc proc private

nfsd is the only thing using this helper, and it doesn't use the private
currently.  When we switch to per-network namespace stats we will need
the struct net * in order to get to the nfsd_net.  Use the net as the
proc private so we can utilize this when we make the switch over.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/stats.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index 65fc1297c6dfa4..383860cb1d5b0f 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -314,7 +314,7 @@ EXPORT_SYMBOL_GPL(rpc_proc_unregister);
 struct proc_dir_entry *
 svc_proc_register(struct net *net, struct svc_stat *statp, const struct proc_ops *proc_ops)
 {
-	return do_register(net, statp->program->pg_name, statp, proc_ops);
+	return do_register(net, statp->program->pg_name, net, proc_ops);
 }
 EXPORT_SYMBOL_GPL(svc_proc_register);
 

From a96efc8859539ce5da7e2a7cd0dd177d41b1810f Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 26 Jan 2024 10:39:45 -0500
Subject: [PATCH 276/707] nfsd: rename NFSD_NET_* to NFSD_STATS_*

We're going to merge the stats all into per network namespace in
subsequent patches, rename these nn counters to be consistent with the
rest of the stats.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/netns.h    | 4 ++--
 fs/nfsd/nfscache.c | 4 ++--
 fs/nfsd/stats.h    | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 74b4360779a112..e3605cb5f044d8 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -26,9 +26,9 @@ struct nfsd4_client_tracking_ops;
 
 enum {
 	/* cache misses due only to checksum comparison failures */
-	NFSD_NET_PAYLOAD_MISSES,
+	NFSD_STATS_PAYLOAD_MISSES,
 	/* amount of memory (in bytes) currently consumed by the DRC */
-	NFSD_NET_DRC_MEM_USAGE,
+	NFSD_STATS_DRC_MEM_USAGE,
 	NFSD_NET_COUNTERS_NUM
 };
 
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 5c1a4a0aa60568..3d4a9d181c43e2 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -687,7 +687,7 @@ int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
 		   atomic_read(&nn->num_drc_entries));
 	seq_printf(m, "hash buckets:          %u\n", 1 << nn->maskbits);
 	seq_printf(m, "mem usage:             %lld\n",
-		   percpu_counter_sum_positive(&nn->counter[NFSD_NET_DRC_MEM_USAGE]));
+		   percpu_counter_sum_positive(&nn->counter[NFSD_STATS_DRC_MEM_USAGE]));
 	seq_printf(m, "cache hits:            %lld\n",
 		   percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_HITS]));
 	seq_printf(m, "cache misses:          %lld\n",
@@ -695,7 +695,7 @@ int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
 	seq_printf(m, "not cached:            %lld\n",
 		   percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE]));
 	seq_printf(m, "payload misses:        %lld\n",
-		   percpu_counter_sum_positive(&nn->counter[NFSD_NET_PAYLOAD_MISSES]));
+		   percpu_counter_sum_positive(&nn->counter[NFSD_STATS_PAYLOAD_MISSES]));
 	seq_printf(m, "longest chain len:     %u\n", nn->longest_chain);
 	seq_printf(m, "cachesize at longest:  %u\n", nn->longest_chain_cachesize);
 	return 0;
diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h
index 14f50c660b619e..7ed4325ac69123 100644
--- a/fs/nfsd/stats.h
+++ b/fs/nfsd/stats.h
@@ -81,17 +81,17 @@ static inline void nfsd_stats_io_write_add(struct svc_export *exp, s64 amount)
 
 static inline void nfsd_stats_payload_misses_inc(struct nfsd_net *nn)
 {
-	percpu_counter_inc(&nn->counter[NFSD_NET_PAYLOAD_MISSES]);
+	percpu_counter_inc(&nn->counter[NFSD_STATS_PAYLOAD_MISSES]);
 }
 
 static inline void nfsd_stats_drc_mem_usage_add(struct nfsd_net *nn, s64 amount)
 {
-	percpu_counter_add(&nn->counter[NFSD_NET_DRC_MEM_USAGE], amount);
+	percpu_counter_add(&nn->counter[NFSD_STATS_DRC_MEM_USAGE], amount);
 }
 
 static inline void nfsd_stats_drc_mem_usage_sub(struct nfsd_net *nn, s64 amount)
 {
-	percpu_counter_sub(&nn->counter[NFSD_NET_DRC_MEM_USAGE], amount);
+	percpu_counter_sub(&nn->counter[NFSD_STATS_DRC_MEM_USAGE], amount);
 }
 
 #ifdef CONFIG_NFSD_V4

From bdf92cbe7097b5cccf4e3e9ae6619c0f315346f7 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 26 Jan 2024 10:39:46 -0500
Subject: [PATCH 277/707] nfsd: expose /proc/net/sunrpc/nfsd in net namespaces

We are running nfsd servers inside of containers with their own network
namespace, and we want to monitor these services using the stats found
in /proc.  However these are not exposed in the proc inside of the
container, so we have to bind mount the host /proc into our containers
to get at this information.

Separate out the stat counters init and the proc registration, and move
the proc registration into the pernet operations entry and exit points
so that these stats can be exposed inside of network namespaces.

This is an intermediate step, this just exposes the global counters in
the network namespace.  Subsequent patches will move these counters into
the per-network namespace container.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfsctl.c |  8 +++++---
 fs/nfsd/stats.c  | 21 ++++++---------------
 fs/nfsd/stats.h  |  6 ++++--
 3 files changed, 15 insertions(+), 20 deletions(-)

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index f206ca32e7f53c..b57480b50e350c 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1679,6 +1679,7 @@ static __net_init int nfsd_net_init(struct net *net)
 	nfsd4_init_leases_net(nn);
 	get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key));
 	seqlock_init(&nn->writeverf_lock);
+	nfsd_proc_stat_init(net);
 
 	return 0;
 
@@ -1699,6 +1700,7 @@ static __net_exit void nfsd_net_exit(struct net *net)
 {
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
+	nfsd_proc_stat_shutdown(net);
 	nfsd_net_reply_cache_destroy(nn);
 	nfsd_idmap_shutdown(net);
 	nfsd_export_shutdown(net);
@@ -1722,7 +1724,7 @@ static int __init init_nfsd(void)
 	retval = nfsd4_init_pnfs();
 	if (retval)
 		goto out_free_slabs;
-	retval = nfsd_stat_init();	/* Statistics */
+	retval = nfsd_stat_counters_init();	/* Statistics */
 	if (retval)
 		goto out_free_pnfs;
 	retval = nfsd_drc_slab_create();
@@ -1762,7 +1764,7 @@ static int __init init_nfsd(void)
 	nfsd_lockd_shutdown();
 	nfsd_drc_slab_free();
 out_free_stat:
-	nfsd_stat_shutdown();
+	nfsd_stat_counters_destroy();
 out_free_pnfs:
 	nfsd4_exit_pnfs();
 out_free_slabs:
@@ -1780,7 +1782,7 @@ static void __exit exit_nfsd(void)
 	nfsd_drc_slab_free();
 	remove_proc_entry("fs/nfs/exports", NULL);
 	remove_proc_entry("fs/nfs", NULL);
-	nfsd_stat_shutdown();
+	nfsd_stat_counters_destroy();
 	nfsd_lockd_shutdown();
 	nfsd4_free_slabs();
 	nfsd4_exit_pnfs();
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index 12d79f5d4eb1ac..394a65a33942d7 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -108,31 +108,22 @@ void nfsd_percpu_counters_destroy(struct percpu_counter counters[], int num)
 		percpu_counter_destroy(&counters[i]);
 }
 
-static int nfsd_stat_counters_init(void)
+int nfsd_stat_counters_init(void)
 {
 	return nfsd_percpu_counters_init(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM);
 }
 
-static void nfsd_stat_counters_destroy(void)
+void nfsd_stat_counters_destroy(void)
 {
 	nfsd_percpu_counters_destroy(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM);
 }
 
-int nfsd_stat_init(void)
+void nfsd_proc_stat_init(struct net *net)
 {
-	int err;
-
-	err = nfsd_stat_counters_init();
-	if (err)
-		return err;
-
-	svc_proc_register(&init_net, &nfsd_svcstats, &nfsd_proc_ops);
-
-	return 0;
+	svc_proc_register(net, &nfsd_svcstats, &nfsd_proc_ops);
 }
 
-void nfsd_stat_shutdown(void)
+void nfsd_proc_stat_shutdown(struct net *net)
 {
-	nfsd_stat_counters_destroy();
-	svc_proc_unregister(&init_net, "nfsd");
+	svc_proc_unregister(net, "nfsd");
 }
diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h
index 7ed4325ac69123..38811aa7d13e1e 100644
--- a/fs/nfsd/stats.h
+++ b/fs/nfsd/stats.h
@@ -40,8 +40,10 @@ extern struct svc_stat		nfsd_svcstats;
 int nfsd_percpu_counters_init(struct percpu_counter *counters, int num);
 void nfsd_percpu_counters_reset(struct percpu_counter *counters, int num);
 void nfsd_percpu_counters_destroy(struct percpu_counter *counters, int num);
-int nfsd_stat_init(void);
-void nfsd_stat_shutdown(void);
+int nfsd_stat_counters_init(void);
+void nfsd_stat_counters_destroy(void);
+void nfsd_proc_stat_init(struct net *net);
+void nfsd_proc_stat_shutdown(struct net *net);
 
 static inline void nfsd_stats_rc_hits_inc(void)
 {

From fce33795b2c3284637713515f4aaf6970942aa8c Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 26 Jan 2024 10:39:47 -0500
Subject: [PATCH 278/707] nfsd: make all of the nfsd stats per-network
 namespace

We have a global set of counters that we modify for all of the nfsd
operations, but now that we're exposing these stats across all network
namespaces we need to make the stats also be per-network namespace.  We
already have some caching stats that are per-network namespace, so move
these definitions into the same counter and then adjust all the helpers
and users of these stats to provide the appropriate nfsd_net struct so
that the stats are maintained for the per-network namespace objects.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/cache.h     |  2 --
 fs/nfsd/netns.h     | 17 ++++++++++++--
 fs/nfsd/nfs4proc.c  |  6 ++---
 fs/nfsd/nfs4state.c |  3 ++-
 fs/nfsd/nfscache.c  | 36 ++++++------------------------
 fs/nfsd/nfsctl.c    | 12 +++-------
 fs/nfsd/nfsfh.c     |  3 ++-
 fs/nfsd/stats.c     | 26 ++++++++++++----------
 fs/nfsd/stats.h     | 54 ++++++++++++++++-----------------------------
 fs/nfsd/vfs.c       |  6 +++--
 10 files changed, 69 insertions(+), 96 deletions(-)

diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index 4cbe0434cbb8ce..66a05fefae98ea 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -80,8 +80,6 @@ enum {
 
 int	nfsd_drc_slab_create(void);
 void	nfsd_drc_slab_free(void);
-int	nfsd_net_reply_cache_init(struct nfsd_net *nn);
-void	nfsd_net_reply_cache_destroy(struct nfsd_net *nn);
 int	nfsd_reply_cache_init(struct nfsd_net *);
 void	nfsd_reply_cache_shutdown(struct nfsd_net *);
 int	nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start,
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index e3605cb5f044d8..0cef4bb407a9c6 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -11,6 +11,7 @@
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 #include <linux/filelock.h>
+#include <linux/nfs4.h>
 #include <linux/percpu_counter.h>
 #include <linux/siphash.h>
 
@@ -29,7 +30,19 @@ enum {
 	NFSD_STATS_PAYLOAD_MISSES,
 	/* amount of memory (in bytes) currently consumed by the DRC */
 	NFSD_STATS_DRC_MEM_USAGE,
-	NFSD_NET_COUNTERS_NUM
+	NFSD_STATS_RC_HITS,		/* repcache hits */
+	NFSD_STATS_RC_MISSES,		/* repcache misses */
+	NFSD_STATS_RC_NOCACHE,		/* uncached reqs */
+	NFSD_STATS_FH_STALE,		/* FH stale error */
+	NFSD_STATS_IO_READ,		/* bytes returned to read requests */
+	NFSD_STATS_IO_WRITE,		/* bytes passed in write requests */
+#ifdef CONFIG_NFSD_V4
+	NFSD_STATS_FIRST_NFS4_OP,	/* count of individual nfsv4 operations */
+	NFSD_STATS_LAST_NFS4_OP = NFSD_STATS_FIRST_NFS4_OP + LAST_NFS4_OP,
+#define NFSD_STATS_NFS4_OP(op)	(NFSD_STATS_FIRST_NFS4_OP + (op))
+	NFSD_STATS_WDELEG_GETATTR,	/* count of getattr conflict with wdeleg */
+#endif
+	NFSD_STATS_COUNTERS_NUM
 };
 
 /*
@@ -164,7 +177,7 @@ struct nfsd_net {
 	atomic_t                 num_drc_entries;
 
 	/* Per-netns stats counters */
-	struct percpu_counter    counter[NFSD_NET_COUNTERS_NUM];
+	struct percpu_counter    counter[NFSD_STATS_COUNTERS_NUM];
 
 	/* longest hash chain seen */
 	unsigned int             longest_chain;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 14712fa08f769e..648ff427005e6c 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -2490,10 +2490,10 @@ nfsd4_proc_null(struct svc_rqst *rqstp)
 	return rpc_success;
 }
 
-static inline void nfsd4_increment_op_stats(u32 opnum)
+static inline void nfsd4_increment_op_stats(struct nfsd_net *nn, u32 opnum)
 {
 	if (opnum >= FIRST_NFS4_OP && opnum <= LAST_NFS4_OP)
-		percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_NFS4_OP(opnum)]);
+		percpu_counter_inc(&nn->counter[NFSD_STATS_NFS4_OP(opnum)]);
 }
 
 static const struct nfsd4_operation nfsd4_ops[];
@@ -2768,7 +2768,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
 					   status, nfsd4_op_name(op->opnum));
 
 		nfsd4_cstate_clear_replay(cstate);
-		nfsd4_increment_op_stats(op->opnum);
+		nfsd4_increment_op_stats(nn, op->opnum);
 	}
 
 	fh_put(current_fh);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index d7d561b29fb0d9..82f456bc42a4cf 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -8461,6 +8461,7 @@ __be32
 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode)
 {
 	__be32 status;
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
 	struct file_lock_context *ctx;
 	struct file_lock *fl;
 	struct nfs4_delegation *dp;
@@ -8490,7 +8491,7 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode)
 			}
 break_lease:
 			spin_unlock(&ctx->flc_lock);
-			nfsd_stats_wdeleg_getattr_inc();
+			nfsd_stats_wdeleg_getattr_inc(nn);
 			status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ));
 			if (status != nfserr_jukebox ||
 					!nfsd_wait_for_delegreturn(rqstp, inode))
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 3d4a9d181c43e2..cfcc6ac8f255a8 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -176,27 +176,6 @@ void nfsd_drc_slab_free(void)
 	kmem_cache_destroy(drc_slab);
 }
 
-/**
- * nfsd_net_reply_cache_init - per net namespace reply cache set-up
- * @nn: nfsd_net being initialized
- *
- * Returns zero on succes; otherwise a negative errno is returned.
- */
-int nfsd_net_reply_cache_init(struct nfsd_net *nn)
-{
-	return nfsd_percpu_counters_init(nn->counter, NFSD_NET_COUNTERS_NUM);
-}
-
-/**
- * nfsd_net_reply_cache_destroy - per net namespace reply cache tear-down
- * @nn: nfsd_net being freed
- *
- */
-void nfsd_net_reply_cache_destroy(struct nfsd_net *nn)
-{
-	nfsd_percpu_counters_destroy(nn->counter, NFSD_NET_COUNTERS_NUM);
-}
-
 int nfsd_reply_cache_init(struct nfsd_net *nn)
 {
 	unsigned int hashsize;
@@ -501,7 +480,7 @@ nfsd_cache_insert(struct nfsd_drc_bucket *b, struct nfsd_cacherep *key,
 int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start,
 		      unsigned int len, struct nfsd_cacherep **cacherep)
 {
-	struct nfsd_net		*nn;
+	struct nfsd_net		*nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
 	struct nfsd_cacherep	*rp, *found;
 	__wsum			csum;
 	struct nfsd_drc_bucket	*b;
@@ -510,7 +489,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start,
 	int rtn = RC_DOIT;
 
 	if (type == RC_NOCACHE) {
-		nfsd_stats_rc_nocache_inc();
+		nfsd_stats_rc_nocache_inc(nn);
 		goto out;
 	}
 
@@ -520,7 +499,6 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start,
 	 * Since the common case is a cache miss followed by an insert,
 	 * preallocate an entry.
 	 */
-	nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
 	rp = nfsd_cacherep_alloc(rqstp, csum, nn);
 	if (!rp)
 		goto out;
@@ -537,7 +515,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start,
 
 	nfsd_cacherep_dispose(&dispose);
 
-	nfsd_stats_rc_misses_inc();
+	nfsd_stats_rc_misses_inc(nn);
 	atomic_inc(&nn->num_drc_entries);
 	nfsd_stats_drc_mem_usage_add(nn, sizeof(*rp));
 	goto out;
@@ -545,7 +523,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start,
 found_entry:
 	/* We found a matching entry which is either in progress or done. */
 	nfsd_reply_cache_free_locked(NULL, rp, nn);
-	nfsd_stats_rc_hits_inc();
+	nfsd_stats_rc_hits_inc(nn);
 	rtn = RC_DROPIT;
 	rp = found;
 
@@ -689,11 +667,11 @@ int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
 	seq_printf(m, "mem usage:             %lld\n",
 		   percpu_counter_sum_positive(&nn->counter[NFSD_STATS_DRC_MEM_USAGE]));
 	seq_printf(m, "cache hits:            %lld\n",
-		   percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_HITS]));
+		   percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_HITS]));
 	seq_printf(m, "cache misses:          %lld\n",
-		   percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_MISSES]));
+		   percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_MISSES]));
 	seq_printf(m, "not cached:            %lld\n",
-		   percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE]));
+		   percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_NOCACHE]));
 	seq_printf(m, "payload misses:        %lld\n",
 		   percpu_counter_sum_positive(&nn->counter[NFSD_STATS_PAYLOAD_MISSES]));
 	seq_printf(m, "longest chain len:     %u\n", nn->longest_chain);
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index b57480b50e350c..ea3c8114245c28 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1671,7 +1671,7 @@ static __net_init int nfsd_net_init(struct net *net)
 	retval = nfsd_idmap_init(net);
 	if (retval)
 		goto out_idmap_error;
-	retval = nfsd_net_reply_cache_init(nn);
+	retval = nfsd_stat_counters_init(nn);
 	if (retval)
 		goto out_repcache_error;
 	nn->nfsd_versions = NULL;
@@ -1701,7 +1701,7 @@ static __net_exit void nfsd_net_exit(struct net *net)
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
 	nfsd_proc_stat_shutdown(net);
-	nfsd_net_reply_cache_destroy(nn);
+	nfsd_stat_counters_destroy(nn);
 	nfsd_idmap_shutdown(net);
 	nfsd_export_shutdown(net);
 	nfsd_netns_free_versions(nn);
@@ -1724,12 +1724,9 @@ static int __init init_nfsd(void)
 	retval = nfsd4_init_pnfs();
 	if (retval)
 		goto out_free_slabs;
-	retval = nfsd_stat_counters_init();	/* Statistics */
-	if (retval)
-		goto out_free_pnfs;
 	retval = nfsd_drc_slab_create();
 	if (retval)
-		goto out_free_stat;
+		goto out_free_pnfs;
 	nfsd_lockd_init();	/* lockd->nfsd callbacks */
 	retval = create_proc_exports_entry();
 	if (retval)
@@ -1763,8 +1760,6 @@ static int __init init_nfsd(void)
 out_free_lockd:
 	nfsd_lockd_shutdown();
 	nfsd_drc_slab_free();
-out_free_stat:
-	nfsd_stat_counters_destroy();
 out_free_pnfs:
 	nfsd4_exit_pnfs();
 out_free_slabs:
@@ -1782,7 +1777,6 @@ static void __exit exit_nfsd(void)
 	nfsd_drc_slab_free();
 	remove_proc_entry("fs/nfs/exports", NULL);
 	remove_proc_entry("fs/nfs", NULL);
-	nfsd_stat_counters_destroy();
 	nfsd_lockd_shutdown();
 	nfsd4_free_slabs();
 	nfsd4_exit_pnfs();
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index dbfa0ac13564ac..40fecf7b224f2f 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -327,6 +327,7 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
 __be32
 fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access)
 {
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
 	struct svc_export *exp = NULL;
 	struct dentry	*dentry;
 	__be32		error;
@@ -395,7 +396,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access)
 out:
 	trace_nfsd_fh_verify_err(rqstp, fhp, type, access, error);
 	if (error == nfserr_stale)
-		nfsd_stats_fh_stale_inc(exp);
+		nfsd_stats_fh_stale_inc(nn, exp);
 	return error;
 }
 
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index 394a65a33942d7..44e275324b06e5 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -34,15 +34,17 @@ struct svc_stat		nfsd_svcstats = {
 
 static int nfsd_show(struct seq_file *seq, void *v)
 {
+	struct net *net = pde_data(file_inode(seq->file));
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 	int i;
 
 	seq_printf(seq, "rc %lld %lld %lld\nfh %lld 0 0 0 0\nio %lld %lld\n",
-		   percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_HITS]),
-		   percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_MISSES]),
-		   percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE]),
-		   percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_FH_STALE]),
-		   percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_IO_READ]),
-		   percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_IO_WRITE]));
+		   percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_HITS]),
+		   percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_MISSES]),
+		   percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_NOCACHE]),
+		   percpu_counter_sum_positive(&nn->counter[NFSD_STATS_FH_STALE]),
+		   percpu_counter_sum_positive(&nn->counter[NFSD_STATS_IO_READ]),
+		   percpu_counter_sum_positive(&nn->counter[NFSD_STATS_IO_WRITE]));
 
 	/* thread usage: */
 	seq_printf(seq, "th %u 0", atomic_read(&nfsdstats.th_cnt));
@@ -63,10 +65,10 @@ static int nfsd_show(struct seq_file *seq, void *v)
 	seq_printf(seq, "proc4ops %u", LAST_NFS4_OP + 1);
 	for (i = 0; i <= LAST_NFS4_OP; i++) {
 		seq_printf(seq, " %lld",
-			   percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_NFS4_OP(i)]));
+			   percpu_counter_sum_positive(&nn->counter[NFSD_STATS_NFS4_OP(i)]));
 	}
 	seq_printf(seq, "\nwdeleg_getattr %lld",
-		percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_WDELEG_GETATTR]));
+		percpu_counter_sum_positive(&nn->counter[NFSD_STATS_WDELEG_GETATTR]));
 
 	seq_putc(seq, '\n');
 #endif
@@ -108,14 +110,14 @@ void nfsd_percpu_counters_destroy(struct percpu_counter counters[], int num)
 		percpu_counter_destroy(&counters[i]);
 }
 
-int nfsd_stat_counters_init(void)
+int nfsd_stat_counters_init(struct nfsd_net *nn)
 {
-	return nfsd_percpu_counters_init(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM);
+	return nfsd_percpu_counters_init(nn->counter, NFSD_STATS_COUNTERS_NUM);
 }
 
-void nfsd_stat_counters_destroy(void)
+void nfsd_stat_counters_destroy(struct nfsd_net *nn)
 {
-	nfsd_percpu_counters_destroy(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM);
+	nfsd_percpu_counters_destroy(nn->counter, NFSD_STATS_COUNTERS_NUM);
 }
 
 void nfsd_proc_stat_init(struct net *net)
diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h
index 38811aa7d13e1e..c24be4ddbe7d70 100644
--- a/fs/nfsd/stats.h
+++ b/fs/nfsd/stats.h
@@ -10,26 +10,7 @@
 #include <uapi/linux/nfsd/stats.h>
 #include <linux/percpu_counter.h>
 
-
-enum {
-	NFSD_STATS_RC_HITS,		/* repcache hits */
-	NFSD_STATS_RC_MISSES,		/* repcache misses */
-	NFSD_STATS_RC_NOCACHE,		/* uncached reqs */
-	NFSD_STATS_FH_STALE,		/* FH stale error */
-	NFSD_STATS_IO_READ,		/* bytes returned to read requests */
-	NFSD_STATS_IO_WRITE,		/* bytes passed in write requests */
-#ifdef CONFIG_NFSD_V4
-	NFSD_STATS_FIRST_NFS4_OP,	/* count of individual nfsv4 operations */
-	NFSD_STATS_LAST_NFS4_OP = NFSD_STATS_FIRST_NFS4_OP + LAST_NFS4_OP,
-#define NFSD_STATS_NFS4_OP(op)	(NFSD_STATS_FIRST_NFS4_OP + (op))
-	NFSD_STATS_WDELEG_GETATTR,	/* count of getattr conflict with wdeleg */
-#endif
-	NFSD_STATS_COUNTERS_NUM
-};
-
 struct nfsd_stats {
-	struct percpu_counter	counter[NFSD_STATS_COUNTERS_NUM];
-
 	atomic_t	th_cnt;		/* number of available threads */
 };
 
@@ -40,43 +21,46 @@ extern struct svc_stat		nfsd_svcstats;
 int nfsd_percpu_counters_init(struct percpu_counter *counters, int num);
 void nfsd_percpu_counters_reset(struct percpu_counter *counters, int num);
 void nfsd_percpu_counters_destroy(struct percpu_counter *counters, int num);
-int nfsd_stat_counters_init(void);
-void nfsd_stat_counters_destroy(void);
+int nfsd_stat_counters_init(struct nfsd_net *nn);
+void nfsd_stat_counters_destroy(struct nfsd_net *nn);
 void nfsd_proc_stat_init(struct net *net);
 void nfsd_proc_stat_shutdown(struct net *net);
 
-static inline void nfsd_stats_rc_hits_inc(void)
+static inline void nfsd_stats_rc_hits_inc(struct nfsd_net *nn)
 {
-	percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_HITS]);
+	percpu_counter_inc(&nn->counter[NFSD_STATS_RC_HITS]);
 }
 
-static inline void nfsd_stats_rc_misses_inc(void)
+static inline void nfsd_stats_rc_misses_inc(struct nfsd_net *nn)
 {
-	percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_MISSES]);
+	percpu_counter_inc(&nn->counter[NFSD_STATS_RC_MISSES]);
 }
 
-static inline void nfsd_stats_rc_nocache_inc(void)
+static inline void nfsd_stats_rc_nocache_inc(struct nfsd_net *nn)
 {
-	percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE]);
+	percpu_counter_inc(&nn->counter[NFSD_STATS_RC_NOCACHE]);
 }
 
-static inline void nfsd_stats_fh_stale_inc(struct svc_export *exp)
+static inline void nfsd_stats_fh_stale_inc(struct nfsd_net *nn,
+					   struct svc_export *exp)
 {
-	percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_FH_STALE]);
+	percpu_counter_inc(&nn->counter[NFSD_STATS_FH_STALE]);
 	if (exp && exp->ex_stats)
 		percpu_counter_inc(&exp->ex_stats->counter[EXP_STATS_FH_STALE]);
 }
 
-static inline void nfsd_stats_io_read_add(struct svc_export *exp, s64 amount)
+static inline void nfsd_stats_io_read_add(struct nfsd_net *nn,
+					  struct svc_export *exp, s64 amount)
 {
-	percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_READ], amount);
+	percpu_counter_add(&nn->counter[NFSD_STATS_IO_READ], amount);
 	if (exp && exp->ex_stats)
 		percpu_counter_add(&exp->ex_stats->counter[EXP_STATS_IO_READ], amount);
 }
 
-static inline void nfsd_stats_io_write_add(struct svc_export *exp, s64 amount)
+static inline void nfsd_stats_io_write_add(struct nfsd_net *nn,
+					   struct svc_export *exp, s64 amount)
 {
-	percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_WRITE], amount);
+	percpu_counter_add(&nn->counter[NFSD_STATS_IO_WRITE], amount);
 	if (exp && exp->ex_stats)
 		percpu_counter_add(&exp->ex_stats->counter[EXP_STATS_IO_WRITE], amount);
 }
@@ -97,9 +81,9 @@ static inline void nfsd_stats_drc_mem_usage_sub(struct nfsd_net *nn, s64 amount)
 }
 
 #ifdef CONFIG_NFSD_V4
-static inline void nfsd_stats_wdeleg_getattr_inc(void)
+static inline void nfsd_stats_wdeleg_getattr_inc(struct nfsd_net *nn)
 {
-	percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_WDELEG_GETATTR]);
+	percpu_counter_inc(&nn->counter[NFSD_STATS_WDELEG_GETATTR]);
 }
 #endif
 #endif /* _NFSD_STATS_H */
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index f57749cd6f0b1a..38952105ed7fd4 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1002,7 +1002,9 @@ static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
 			       unsigned long *count, u32 *eof, ssize_t host_err)
 {
 	if (host_err >= 0) {
-		nfsd_stats_io_read_add(fhp->fh_export, host_err);
+		struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+		nfsd_stats_io_read_add(nn, fhp->fh_export, host_err);
 		*eof = nfsd_eof_on_read(file, offset, host_err, *count);
 		*count = host_err;
 		fsnotify_access(file);
@@ -1185,7 +1187,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
 		goto out_nfserr;
 	}
 	*cnt = host_err;
-	nfsd_stats_io_write_add(exp, *cnt);
+	nfsd_stats_io_write_add(nn, exp, *cnt);
 	fsnotify_modify(file);
 	host_err = filemap_check_wb_err(file->f_mapping, since);
 	if (host_err < 0)

From 39ed42f221ad514767163d3c5fb5701fb5be47ef Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 26 Jan 2024 10:39:48 -0500
Subject: [PATCH 279/707] nfsd: remove nfsd_stats, make th_cnt a global counter

This is the last global stat, take it out of the nfsd_stats struct and
make it a global part of nfsd, report it the same as always.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfsd.h   | 1 +
 fs/nfsd/nfssvc.c | 5 +++--
 fs/nfsd/stats.c  | 3 +--
 fs/nfsd/stats.h  | 6 ------
 4 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 304e9728b929a0..be2ea3d6d2a289 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -86,6 +86,7 @@ extern struct mutex		nfsd_mutex;
 extern spinlock_t		nfsd_drc_lock;
 extern unsigned long		nfsd_drc_max_mem;
 extern unsigned long		nfsd_drc_mem_used;
+extern atomic_t			nfsd_th_cnt;		/* number of available threads */
 
 extern const struct seq_operations nfs_exports_op;
 
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index d98a6abad99010..fdb59189643044 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -34,6 +34,7 @@
 
 #define NFSDDBG_FACILITY	NFSDDBG_SVC
 
+atomic_t			nfsd_th_cnt = ATOMIC_INIT(0);
 extern struct svc_program	nfsd_program;
 static int			nfsd(void *vrqstp);
 #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
@@ -924,7 +925,7 @@ nfsd(void *vrqstp)
 
 	current->fs->umask = 0;
 
-	atomic_inc(&nfsdstats.th_cnt);
+	atomic_inc(&nfsd_th_cnt);
 
 	set_freezable();
 
@@ -940,7 +941,7 @@ nfsd(void *vrqstp)
 		nfsd_file_net_dispose(nn);
 	}
 
-	atomic_dec(&nfsdstats.th_cnt);
+	atomic_dec(&nfsd_th_cnt);
 
 out:
 	/* Release the thread */
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index 44e275324b06e5..3a7f791c30528d 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -27,7 +27,6 @@
 
 #include "nfsd.h"
 
-struct nfsd_stats	nfsdstats;
 struct svc_stat		nfsd_svcstats = {
 	.program	= &nfsd_program,
 };
@@ -47,7 +46,7 @@ static int nfsd_show(struct seq_file *seq, void *v)
 		   percpu_counter_sum_positive(&nn->counter[NFSD_STATS_IO_WRITE]));
 
 	/* thread usage: */
-	seq_printf(seq, "th %u 0", atomic_read(&nfsdstats.th_cnt));
+	seq_printf(seq, "th %u 0", atomic_read(&nfsd_th_cnt));
 
 	/* deprecated thread usage histogram stats */
 	for (i = 0; i < 10; i++)
diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h
index c24be4ddbe7d70..5675d283a53730 100644
--- a/fs/nfsd/stats.h
+++ b/fs/nfsd/stats.h
@@ -10,12 +10,6 @@
 #include <uapi/linux/nfsd/stats.h>
 #include <linux/percpu_counter.h>
 
-struct nfsd_stats {
-	atomic_t	th_cnt;		/* number of available threads */
-};
-
-extern struct nfsd_stats	nfsdstats;
-
 extern struct svc_stat		nfsd_svcstats;
 
 int nfsd_percpu_counters_init(struct percpu_counter *counters, int num);

From 52ebfd3e83cda597a4ea431a682e4424d4cadea1 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 26 Jan 2024 10:39:49 -0500
Subject: [PATCH 280/707] nfsd: make svc_stat per-network namespace instead of
 global

The final bit of stats that is global is the rpc svc_stat.  Move this
into the nfsd_net struct and use that everywhere instead of the global
struct.  Remove the unused global struct.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/netns.h  |  4 ++++
 fs/nfsd/nfsctl.c |  2 ++
 fs/nfsd/nfssvc.c |  2 +-
 fs/nfsd/stats.c  | 10 ++++------
 fs/nfsd/stats.h  |  2 --
 5 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 0cef4bb407a9c6..afc16ee4da7428 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -14,6 +14,7 @@
 #include <linux/nfs4.h>
 #include <linux/percpu_counter.h>
 #include <linux/siphash.h>
+#include <linux/sunrpc/stats.h>
 
 /* Hash tables for nfs4_clientid state */
 #define CLIENT_HASH_BITS                 4
@@ -179,6 +180,9 @@ struct nfsd_net {
 	/* Per-netns stats counters */
 	struct percpu_counter    counter[NFSD_STATS_COUNTERS_NUM];
 
+	/* sunrpc svc stats */
+	struct svc_stat          nfsd_svcstats;
+
 	/* longest hash chain seen */
 	unsigned int             longest_chain;
 
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index ea3c8114245c28..5a5547bd6ecf7e 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1674,6 +1674,8 @@ static __net_init int nfsd_net_init(struct net *net)
 	retval = nfsd_stat_counters_init(nn);
 	if (retval)
 		goto out_repcache_error;
+	memset(&nn->nfsd_svcstats, 0, sizeof(nn->nfsd_svcstats));
+	nn->nfsd_svcstats.program = &nfsd_program;
 	nn->nfsd_versions = NULL;
 	nn->nfsd4_minorversions = NULL;
 	nfsd4_init_leases_net(nn);
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index fdb59189643044..c0d17b92b249f7 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -661,7 +661,7 @@ int nfsd_create_serv(struct net *net)
 	if (nfsd_max_blksize == 0)
 		nfsd_max_blksize = nfsd_get_default_max_blksize();
 	nfsd_reset_versions(nn);
-	serv = svc_create_pooled(&nfsd_program, &nfsd_svcstats,
+	serv = svc_create_pooled(&nfsd_program, &nn->nfsd_svcstats,
 				 nfsd_max_blksize, nfsd);
 	if (serv == NULL)
 		return -ENOMEM;
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index 3a7f791c30528d..be52fb1e928ed6 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -27,10 +27,6 @@
 
 #include "nfsd.h"
 
-struct svc_stat		nfsd_svcstats = {
-	.program	= &nfsd_program,
-};
-
 static int nfsd_show(struct seq_file *seq, void *v)
 {
 	struct net *net = pde_data(file_inode(seq->file));
@@ -56,7 +52,7 @@ static int nfsd_show(struct seq_file *seq, void *v)
 	seq_puts(seq, "\nra 0 0 0 0 0 0 0 0 0 0 0 0\n");
 
 	/* show my rpc info */
-	svc_seq_show(seq, &nfsd_svcstats);
+	svc_seq_show(seq, &nn->nfsd_svcstats);
 
 #ifdef CONFIG_NFSD_V4
 	/* Show count for individual nfsv4 operations */
@@ -121,7 +117,9 @@ void nfsd_stat_counters_destroy(struct nfsd_net *nn)
 
 void nfsd_proc_stat_init(struct net *net)
 {
-	svc_proc_register(net, &nfsd_svcstats, &nfsd_proc_ops);
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	svc_proc_register(net, &nn->nfsd_svcstats, &nfsd_proc_ops);
 }
 
 void nfsd_proc_stat_shutdown(struct net *net)
diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h
index 5675d283a53730..d2753e975dfd34 100644
--- a/fs/nfsd/stats.h
+++ b/fs/nfsd/stats.h
@@ -10,8 +10,6 @@
 #include <uapi/linux/nfsd/stats.h>
 #include <linux/percpu_counter.h>
 
-extern struct svc_stat		nfsd_svcstats;
-
 int nfsd_percpu_counters_init(struct percpu_counter *counters, int num);
 void nfsd_percpu_counters_reset(struct percpu_counter *counters, int num);
 void nfsd_percpu_counters_destroy(struct percpu_counter *counters, int num);

From 3b818452f12d6d2e25c8e4f40cc88d6761e838ea Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 26 Jan 2024 12:45:17 -0500
Subject: [PATCH 281/707] NFSD: Reset cb_seq_status after NFS4ERR_DELAY

I noticed that once an NFSv4.1 callback operation gets a
NFS4ERR_DELAY status on CB_SEQUENCE and then the connection is lost,
the callback client loops, resending it indefinitely.

The switch arm in nfsd4_cb_sequence_done() that handles
NFS4ERR_DELAY uses rpc_restart_call() to rearm the RPC state machine
for the retransmit, but that path does not call the rpc_prepare_call
callback again. Thus cb_seq_status is set to -10008 by the first
NFS4ERR_DELAY result, but is never set back to 1 for the retransmits.

nfsd4_cb_sequence_done() thinks it's getting nothing but a
long series of CB_SEQUENCE NFS4ERR_DELAY replies.

Fixes: 7ba6cad6c88f ("nfsd: New helper nfsd4_cb_sequence_done() for processing more cb errors")
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4callback.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 926c29879c6ab8..43b0a34a5d5b8a 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -1178,6 +1178,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
 		ret = false;
 		break;
 	case -NFS4ERR_DELAY:
+		cb->cb_seq_status = 1;
 		if (!rpc_restart_call(task))
 			goto out;
 

From 1d3befc1d9d5c705955315037055e4f9ea2003b7 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 26 Jan 2024 12:45:23 -0500
Subject: [PATCH 282/707] NFSD: Convert the callback workqueue to use
 delayed_work

Normally, NFSv4 callback operations are supposed to be sent to the
client as soon as they are queued up.

In a moment, I will introduce a recovery path where the server has
to wait for the client to reconnect. We don't want a hard busy wait
here -- the callback should be requeued to try again in several
milliseconds.

For now, convert nfsd4_callback from struct work_struct to struct
delayed_work, and queue with a zero delay argument. This should
avoid behavior changes for current operation.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4callback.c | 6 +++---
 fs/nfsd/state.h        | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 43b0a34a5d5b8a..1ed2512b364846 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -887,7 +887,7 @@ static struct workqueue_struct *callback_wq;
 
 static bool nfsd4_queue_cb(struct nfsd4_callback *cb)
 {
-	return queue_work(callback_wq, &cb->cb_work);
+	return queue_delayed_work(callback_wq, &cb->cb_work, 0);
 }
 
 static void nfsd41_cb_inflight_begin(struct nfs4_client *clp)
@@ -1370,7 +1370,7 @@ static void
 nfsd4_run_cb_work(struct work_struct *work)
 {
 	struct nfsd4_callback *cb =
-		container_of(work, struct nfsd4_callback, cb_work);
+		container_of(work, struct nfsd4_callback, cb_work.work);
 	struct nfs4_client *clp = cb->cb_clp;
 	struct rpc_clnt *clnt;
 	int flags;
@@ -1415,7 +1415,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
 	cb->cb_msg.rpc_argp = cb;
 	cb->cb_msg.rpc_resp = cb;
 	cb->cb_ops = ops;
-	INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
+	INIT_DELAYED_WORK(&cb->cb_work, nfsd4_run_cb_work);
 	cb->cb_seq_status = 1;
 	cb->cb_status = 0;
 	cb->cb_need_restart = false;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 41bdc913fa715b..87c4372ba36a8d 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -68,7 +68,7 @@ struct nfsd4_callback {
 	struct nfs4_client *cb_clp;
 	struct rpc_message cb_msg;
 	const struct nfsd4_callback_ops *cb_ops;
-	struct work_struct cb_work;
+	struct delayed_work cb_work;
 	int cb_seq_status;
 	int cb_status;
 	bool cb_need_restart;

From 4bfc8cc603d73d757dafed5eb270aca6d189543e Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 26 Jan 2024 12:45:29 -0500
Subject: [PATCH 283/707] NFSD: Reschedule CB operations when backchannel
 rpc_clnt is shut down

As part of managing a client disconnect, NFSD closes down and
replaces the backchannel rpc_clnt.

If a callback operation is pending when the backchannel rpc_clnt is
shut down, currently nfsd4_run_cb_work() just discards that
callback. But there are multiple cases to deal with here:

 o The client's lease is getting destroyed. Throw the CB away.

 o The client disconnected. It might be forcing a retransmit of
   CB operations, or it could have disconnected for other reasons.
   Reschedule the CB so it is retransmitted when the client
   reconnects.

Since callback operations can now be rescheduled, ensure that
cb_ops->prepare can be called only once by moving the
cb_ops->prepare paragraph down to just before the rpc_call_async()
call.

Fixes: 2bbfed98a4d8 ("nfsd: Fix races between nfsd4_cb_release() and nfsd4_shutdown_callback()")
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4callback.c | 32 +++++++++++++++++++++++---------
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 1ed2512b364846..389d05985c5230 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -890,6 +890,13 @@ static bool nfsd4_queue_cb(struct nfsd4_callback *cb)
 	return queue_delayed_work(callback_wq, &cb->cb_work, 0);
 }
 
+static void nfsd4_queue_cb_delayed(struct nfsd4_callback *cb,
+				   unsigned long msecs)
+{
+	queue_delayed_work(callback_wq, &cb->cb_work,
+			   msecs_to_jiffies(msecs));
+}
+
 static void nfsd41_cb_inflight_begin(struct nfs4_client *clp)
 {
 	atomic_inc(&clp->cl_cb_inflight);
@@ -1375,20 +1382,21 @@ nfsd4_run_cb_work(struct work_struct *work)
 	struct rpc_clnt *clnt;
 	int flags;
 
-	if (cb->cb_need_restart) {
-		cb->cb_need_restart = false;
-	} else {
-		if (cb->cb_ops && cb->cb_ops->prepare)
-			cb->cb_ops->prepare(cb);
-	}
-
 	if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK)
 		nfsd4_process_cb_update(cb);
 
 	clnt = clp->cl_cb_client;
 	if (!clnt) {
-		/* Callback channel broken, or client killed; give up: */
-		nfsd41_destroy_cb(cb);
+		if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags))
+			nfsd41_destroy_cb(cb);
+		else {
+			/*
+			 * XXX: Ideally, we could wait for the client to
+			 *	reconnect, but I haven't figured out how
+			 *	to do that yet.
+			 */
+			nfsd4_queue_cb_delayed(cb, 25);
+		}
 		return;
 	}
 
@@ -1401,6 +1409,12 @@ nfsd4_run_cb_work(struct work_struct *work)
 		return;
 	}
 
+	if (cb->cb_need_restart) {
+		cb->cb_need_restart = false;
+	} else {
+		if (cb->cb_ops && cb->cb_ops->prepare)
+			cb->cb_ops->prepare(cb);
+	}
 	cb->cb_msg.rpc_cred = clp->cl_cb_cred;
 	flags = clp->cl_minorversion ? RPC_TASK_NOCONNECT : RPC_TASK_SOFTCONN;
 	rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | flags,

From fa8fbe5970231df591aca7d9075c7da4ec85b473 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 26 Jan 2024 12:45:36 -0500
Subject: [PATCH 284/707] NFSD: Retransmit callbacks after client reconnects

NFSv4.1 clients assume that if they disconnect, that will force the
server to resend pending callback operations once a fresh connection
has been established.

Turns out NFSD has not been resending after reconnect.

Fixes: 7ba6cad6c88f ("nfsd: New helper nfsd4_cb_sequence_done() for processing more cb errors")
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4callback.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 389d05985c5230..3bff14241b3cc5 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -1178,12 +1178,21 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
 		break;
 	case -ESERVERFAULT:
 		++session->se_cb_seq_nr;
-		fallthrough;
+		nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status);
+		ret = false;
+		break;
 	case 1:
+		/*
+		 * cb_seq_status remains 1 if an RPC Reply was never
+		 * received. NFSD can't know if the client processed
+		 * the CB_SEQUENCE operation. Ask the client to send a
+		 * DESTROY_SESSION to recover.
+		 */
+		fallthrough;
 	case -NFS4ERR_BADSESSION:
 		nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status);
 		ret = false;
-		break;
+		goto need_restart;
 	case -NFS4ERR_DELAY:
 		cb->cb_seq_status = 1;
 		if (!rpc_restart_call(task))

From 5c2be2ed0f80683c8773b379b0af2a97f194a1a2 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 26 Jan 2024 12:45:42 -0500
Subject: [PATCH 285/707] NFSD: Add nfsd_seq4_status trace event

Add a trace point that records SEQ4_STATUS flags returned in an
NFSv4.1 SEQUENCE response. SEQ4_STATUS flags report backchannel
issues and changes to lease state to clients. Knowing what the
server is reporting to clients is useful for debugging both
configuration and operational issues in real time.

For example, upcoming patches will enable server administrators to
revoke parts of a client's lease; that revocation is indicated to
the client when a subsequent SEQUENCE operation has one or more
SEQ4_STATUS flags that are set.

Sample trace records:

nfsd-927   [006]   615.581821: nfsd_seq4_status:     xid=0x095ded07 sessionid=65a032c3:b7845faf:00000001:00000000 status_flags=BACKCHANNEL_FAULT
nfsd-927   [006]   615.588043: nfsd_seq4_status:     xid=0x0a5ded07 sessionid=65a032c3:b7845faf:00000001:00000000 status_flags=BACKCHANNEL_FAULT
nfsd-928   [003]   615.588448: nfsd_seq4_status:     xid=0x0b5ded07 sessionid=65a032c3:b7845faf:00000001:00000000 status_flags=BACKCHANNEL_FAULT

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4state.c |  1 +
 fs/nfsd/trace.h     | 35 +++++++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 82f456bc42a4cf..ae9b5a3a585f96 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4058,6 +4058,7 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	}
 	if (!list_empty(&clp->cl_revoked))
 		seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED;
+	trace_nfsd_seq4_status(rqstp, seq);
 out_no_session:
 	if (conn)
 		free_conn(conn);
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index d1e8cf079b0f4b..38d11b43779c77 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -696,6 +696,41 @@ DEFINE_EVENT(nfsd_stid_class, nfsd_stid_##name,			\
 
 DEFINE_STID_EVENT(revoke);
 
+TRACE_EVENT_CONDITION(nfsd_seq4_status,
+	TP_PROTO(
+		const struct svc_rqst *rqstp,
+		const struct nfsd4_sequence *sequence
+	),
+	TP_ARGS(rqstp, sequence),
+	TP_CONDITION(sequence->status_flags),
+	TP_STRUCT__entry(
+		__field(unsigned int, netns_ino)
+		__field(u32, xid)
+		__field(u32, cl_boot)
+		__field(u32, cl_id)
+		__field(u32, seqno)
+		__field(u32, reserved)
+		__field(unsigned long, status_flags)
+	),
+	TP_fast_assign(
+		const struct nfsd4_sessionid *sid =
+			(struct nfsd4_sessionid *)&sequence->sessionid;
+
+		__entry->netns_ino = SVC_NET(rqstp)->ns.inum;
+		__entry->xid = be32_to_cpu(rqstp->rq_xid);
+		__entry->cl_boot = sid->clientid.cl_boot;
+		__entry->cl_id = sid->clientid.cl_id;
+		__entry->seqno = sid->sequence;
+		__entry->reserved = sid->reserved;
+		__entry->status_flags = sequence->status_flags;
+	),
+	TP_printk("xid=0x%08x sessionid=%08x:%08x:%08x:%08x status_flags=%s",
+		__entry->xid, __entry->cl_boot, __entry->cl_id,
+		__entry->seqno, __entry->reserved,
+		show_nfs4_seq4_status(__entry->status_flags)
+	)
+);
+
 DECLARE_EVENT_CLASS(nfsd_clientid_class,
 	TP_PROTO(const clientid_t *clid),
 	TP_ARGS(clid),

From 2a2547f8a32216f0e1079b61b85790e8567a4565 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 26 Jan 2024 12:45:48 -0500
Subject: [PATCH 286/707] NFSD: Replace dprintks in nfsd4_cb_sequence_done()

Improve observability of backchannel session operation.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4callback.c |  9 +++--
 fs/nfsd/trace.h        | 82 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 86 insertions(+), 5 deletions(-)

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 3bff14241b3cc5..78d9939cf4b093 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -1165,6 +1165,8 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
 	if (!cb->cb_holds_slot)
 		goto need_restart;
 
+	/* This is the operation status code for CB_SEQUENCE */
+	trace_nfsd_cb_seq_status(task, cb);
 	switch (cb->cb_seq_status) {
 	case 0:
 		/*
@@ -1210,13 +1212,10 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
 		break;
 	default:
 		nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status);
-		dprintk("%s: unprocessed error %d\n", __func__,
-			cb->cb_seq_status);
 	}
-
 	nfsd41_cb_release_slot(cb);
-	dprintk("%s: freed slot, new seqid=%d\n", __func__,
-		clp->cl_cb_session->se_cb_seq_nr);
+
+	trace_nfsd_cb_free_slot(task, cb);
 
 	if (RPC_SIGNALLED(task))
 		goto need_restart;
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 38d11b43779c77..c134c755ae5d1e 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -9,8 +9,10 @@
 #define _NFSD_TRACE_H
 
 #include <linux/tracepoint.h>
+#include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/xprt.h>
 #include <trace/misc/nfs.h>
+#include <trace/misc/sunrpc.h>
 
 #include "export.h"
 #include "nfsfh.h"
@@ -1440,6 +1442,86 @@ TRACE_EVENT(nfsd_cb_setup_err,
 		__entry->error)
 );
 
+TRACE_EVENT(nfsd_cb_seq_status,
+	TP_PROTO(
+		const struct rpc_task *task,
+		const struct nfsd4_callback *cb
+	),
+	TP_ARGS(task, cb),
+	TP_STRUCT__entry(
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
+		__field(u32, cl_boot)
+		__field(u32, cl_id)
+		__field(u32, seqno)
+		__field(u32, reserved)
+		__field(int, tk_status)
+		__field(int, seq_status)
+	),
+	TP_fast_assign(
+		const struct nfs4_client *clp = cb->cb_clp;
+		const struct nfsd4_session *session = clp->cl_cb_session;
+		const struct nfsd4_sessionid *sid =
+			(struct nfsd4_sessionid *)&session->se_sessionid;
+
+		__entry->task_id = task->tk_pid;
+		__entry->client_id = task->tk_client ?
+				     task->tk_client->cl_clid : -1;
+		__entry->cl_boot = sid->clientid.cl_boot;
+		__entry->cl_id = sid->clientid.cl_id;
+		__entry->seqno = sid->sequence;
+		__entry->reserved = sid->reserved;
+		__entry->tk_status = task->tk_status;
+		__entry->seq_status = cb->cb_seq_status;
+	),
+	TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
+		" sessionid=%08x:%08x:%08x:%08x tk_status=%d seq_status=%d\n",
+		__entry->task_id, __entry->client_id,
+		__entry->cl_boot, __entry->cl_id,
+		__entry->seqno, __entry->reserved,
+		__entry->tk_status, __entry->seq_status
+	)
+);
+
+TRACE_EVENT(nfsd_cb_free_slot,
+	TP_PROTO(
+		const struct rpc_task *task,
+		const struct nfsd4_callback *cb
+	),
+	TP_ARGS(task, cb),
+	TP_STRUCT__entry(
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
+		__field(u32, cl_boot)
+		__field(u32, cl_id)
+		__field(u32, seqno)
+		__field(u32, reserved)
+		__field(u32, slot_seqno)
+	),
+	TP_fast_assign(
+		const struct nfs4_client *clp = cb->cb_clp;
+		const struct nfsd4_session *session = clp->cl_cb_session;
+		const struct nfsd4_sessionid *sid =
+			(struct nfsd4_sessionid *)&session->se_sessionid;
+
+		__entry->task_id = task->tk_pid;
+		__entry->client_id = task->tk_client ?
+				     task->tk_client->cl_clid : -1;
+		__entry->cl_boot = sid->clientid.cl_boot;
+		__entry->cl_id = sid->clientid.cl_id;
+		__entry->seqno = sid->sequence;
+		__entry->reserved = sid->reserved;
+		__entry->slot_seqno = session->se_cb_seq_nr;
+	),
+	TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
+		" sessionid=%08x:%08x:%08x:%08x new slot seqno=%u\n",
+		__entry->task_id, __entry->client_id,
+		__entry->cl_boot, __entry->cl_id,
+		__entry->seqno, __entry->reserved,
+		__entry->slot_seqno
+	)
+);
+
 TRACE_EVENT_CONDITION(nfsd_cb_recall,
 	TP_PROTO(
 		const struct nfs4_stid *stid

From d5927e2267c8f247f77e027be01b82407fbac04f Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 26 Jan 2024 12:45:54 -0500
Subject: [PATCH 287/707] NFSD: Rename nfsd_cb_state trace point

Make it clear where backchannel state is updated.

Example trace point output:

kworker/u16:0-10    [006]  2800.080404: nfsd_cb_new_state:    addr=192.168.122.6:0 client 65b3c5b8:f541f749 state=UP
         nfsd-940   [003]  2800.478368: nfsd_cb_new_state:    addr=192.168.122.6:0 client 65b3c5b8:f541f749 state=UNKNOWN
kworker/u16:0-10    [003]  2800.478828: nfsd_cb_new_state:    addr=192.168.122.6:0 client 65b3c5b8:f541f749 state=DOWN

kworker/u16:0-10    [005]  2802.039724: nfsd_cb_start:        addr=192.168.122.6:0 client 65b3c5b8:f541f749 state=UP
kworker/u16:0-10    [005]  2810.611452: nfsd_cb_start:        addr=192.168.122.6:0 client 65b3c5b8:f541f749 state=FAULT
kworker/u16:0-10    [005]  2810.616832: nfsd_cb_start:        addr=192.168.122.6:0 client 65b3c5b8:f541f749 state=UNKNOWN
kworker/u16:0-10    [005]  2810.616931: nfsd_cb_start:        addr=192.168.122.6:0 client 65b3c5b8:f541f749 state=DOWN

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4callback.c | 4 +++-
 fs/nfsd/trace.h        | 3 ++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 78d9939cf4b093..a63171ccfc2b88 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -1006,7 +1006,7 @@ static void nfsd4_mark_cb_state(struct nfs4_client *clp, int newstate)
 {
 	if (clp->cl_cb_state != newstate) {
 		clp->cl_cb_state = newstate;
-		trace_nfsd_cb_state(clp);
+		trace_nfsd_cb_new_state(clp);
 	}
 }
 
@@ -1390,6 +1390,8 @@ nfsd4_run_cb_work(struct work_struct *work)
 	struct rpc_clnt *clnt;
 	int flags;
 
+	trace_nfsd_cb_start(clp);
+
 	if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK)
 		nfsd4_process_cb_update(cb);
 
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index c134c755ae5d1e..6003af2bee33cb 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -1371,7 +1371,8 @@ DEFINE_EVENT(nfsd_cb_class, nfsd_cb_##name,		\
 	TP_PROTO(const struct nfs4_client *clp),	\
 	TP_ARGS(clp))
 
-DEFINE_NFSD_CB_EVENT(state);
+DEFINE_NFSD_CB_EVENT(start);
+DEFINE_NFSD_CB_EVENT(new_state);
 DEFINE_NFSD_CB_EVENT(probe);
 DEFINE_NFSD_CB_EVENT(lost);
 DEFINE_NFSD_CB_EVENT(shutdown);

From 035544bdf2ac5f0ab809e3b87cebeae5ae47035d Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 26 Jan 2024 12:46:01 -0500
Subject: [PATCH 288/707] NFSD: Add callback operation lifetime trace points

Help observe the flow of callback operations.

bc_shutdown() records exactly when the backchannel RPC client is
destroyed and cl_cb_client is replaced with NULL.

Examples include:

         nfsd-955   [004]   650.013997: nfsd_cb_queue:        addr=192.168.122.6:0 client 65b3c5b8:f541f749 cb=0xffff8881134b02f8 (first try)
kworker/u21:4-497   [004]   650.014050: nfsd_cb_seq_status:   task:00000001@00000001 sessionid=65b3c5b8:f541f749:00000001:00000000 tk_status=-107 seq_status=1
kworker/u21:4-497   [004]   650.014051: nfsd_cb_restart:      addr=192.168.122.6:0 client 65b3c5b8:f541f749 cb=0xffff88810e39f400 (first try)
kworker/u21:4-497   [004]   650.014066: nfsd_cb_queue:        addr=192.168.122.6:0 client 65b3c5b8:f541f749 cb=0xffff88810e39f400 (need restart)


kworker/u16:0-10    [006]   650.065750: nfsd_cb_start:        addr=192.168.122.6:0 client 65b3c5b8:f541f749 state=UNKNOWN
kworker/u16:0-10    [006]   650.065752: nfsd_cb_bc_update:    addr=192.168.122.6:0 client 65b3c5b8:f541f749 cb=0xffff8881134b02f8 (first try)
kworker/u16:0-10    [006]   650.065754: nfsd_cb_bc_shutdown:  addr=192.168.122.6:0 client 65b3c5b8:f541f749 cb=0xffff8881134b02f8 (first try)
kworker/u16:0-10    [006]   650.065810: nfsd_cb_new_state:    addr=192.168.122.6:0 client 65b3c5b8:f541f749 state=DOWN

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4callback.c   |  8 ++++++++
 fs/nfsd/trace.h          | 42 ++++++++++++++++++++++++++++++++++++++++
 include/trace/misc/nfs.h | 34 ++++++++++++++++++++++++++++++++
 3 files changed, 84 insertions(+)

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index a63171ccfc2b88..b50ce54aa1bfab 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -887,12 +887,14 @@ static struct workqueue_struct *callback_wq;
 
 static bool nfsd4_queue_cb(struct nfsd4_callback *cb)
 {
+	trace_nfsd_cb_queue(cb->cb_clp, cb);
 	return queue_delayed_work(callback_wq, &cb->cb_work, 0);
 }
 
 static void nfsd4_queue_cb_delayed(struct nfsd4_callback *cb,
 				   unsigned long msecs)
 {
+	trace_nfsd_cb_queue(cb->cb_clp, cb);
 	queue_delayed_work(callback_wq, &cb->cb_work,
 			   msecs_to_jiffies(msecs));
 }
@@ -1113,6 +1115,7 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
 {
 	struct nfs4_client *clp = cb->cb_clp;
 
+	trace_nfsd_cb_destroy(clp, cb);
 	nfsd41_cb_release_slot(cb);
 	if (cb->cb_ops && cb->cb_ops->release)
 		cb->cb_ops->release(cb);
@@ -1227,6 +1230,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
 	goto out;
 need_restart:
 	if (!test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) {
+		trace_nfsd_cb_restart(clp, cb);
 		task->tk_status = 0;
 		cb->cb_need_restart = true;
 	}
@@ -1340,11 +1344,14 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
 	struct nfsd4_conn *c;
 	int err;
 
+	trace_nfsd_cb_bc_update(clp, cb);
+
 	/*
 	 * This is either an update, or the client dying; in either case,
 	 * kill the old client:
 	 */
 	if (clp->cl_cb_client) {
+		trace_nfsd_cb_bc_shutdown(clp, cb);
 		rpc_shutdown_client(clp->cl_cb_client);
 		clp->cl_cb_client = NULL;
 		put_cred(clp->cl_cb_cred);
@@ -1356,6 +1363,7 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
 	}
 	if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags))
 		return;
+
 	spin_lock(&clp->cl_lock);
 	/*
 	 * Only serialized callback code is allowed to clear these
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 6003af2bee33cb..9f9e58debc2611 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -1443,6 +1443,48 @@ TRACE_EVENT(nfsd_cb_setup_err,
 		__entry->error)
 );
 
+DECLARE_EVENT_CLASS(nfsd_cb_lifetime_class,
+	TP_PROTO(
+		const struct nfs4_client *clp,
+		const struct nfsd4_callback *cb
+	),
+	TP_ARGS(clp, cb),
+	TP_STRUCT__entry(
+		__field(u32, cl_boot)
+		__field(u32, cl_id)
+		__field(const void *, cb)
+		__field(bool, need_restart)
+		__sockaddr(addr, clp->cl_cb_conn.cb_addrlen)
+	),
+	TP_fast_assign(
+		__entry->cl_boot = clp->cl_clientid.cl_boot;
+		__entry->cl_id = clp->cl_clientid.cl_id;
+		__entry->cb = cb;
+		__entry->need_restart = cb->cb_need_restart;
+		__assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
+				  clp->cl_cb_conn.cb_addrlen)
+	),
+	TP_printk("addr=%pISpc client %08x:%08x cb=%p%s",
+		__get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
+		__entry->cb, __entry->need_restart ?
+			" (need restart)" : " (first try)"
+	)
+);
+
+#define DEFINE_NFSD_CB_LIFETIME_EVENT(name)		\
+DEFINE_EVENT(nfsd_cb_lifetime_class, nfsd_cb_##name,	\
+	TP_PROTO(					\
+		const struct nfs4_client *clp,		\
+		const struct nfsd4_callback *cb		\
+	),						\
+	TP_ARGS(clp, cb))
+
+DEFINE_NFSD_CB_LIFETIME_EVENT(queue);
+DEFINE_NFSD_CB_LIFETIME_EVENT(destroy);
+DEFINE_NFSD_CB_LIFETIME_EVENT(restart);
+DEFINE_NFSD_CB_LIFETIME_EVENT(bc_update);
+DEFINE_NFSD_CB_LIFETIME_EVENT(bc_shutdown);
+
 TRACE_EVENT(nfsd_cb_seq_status,
 	TP_PROTO(
 		const struct rpc_task *task,
diff --git a/include/trace/misc/nfs.h b/include/trace/misc/nfs.h
index 0d9d48dca38a89..64ab5dac59ce0c 100644
--- a/include/trace/misc/nfs.h
+++ b/include/trace/misc/nfs.h
@@ -385,3 +385,37 @@ TRACE_DEFINE_ENUM(IOMODE_ANY);
 		{ SEQ4_STATUS_RESTART_RECLAIM_NEEDED,	"RESTART_RECLAIM_NEEDED" }, \
 		{ SEQ4_STATUS_CB_PATH_DOWN_SESSION,	"CB_PATH_DOWN_SESSION" }, \
 		{ SEQ4_STATUS_BACKCHANNEL_FAULT,	"BACKCHANNEL_FAULT" })
+
+TRACE_DEFINE_ENUM(OP_CB_GETATTR);
+TRACE_DEFINE_ENUM(OP_CB_RECALL);
+TRACE_DEFINE_ENUM(OP_CB_LAYOUTRECALL);
+TRACE_DEFINE_ENUM(OP_CB_NOTIFY);
+TRACE_DEFINE_ENUM(OP_CB_PUSH_DELEG);
+TRACE_DEFINE_ENUM(OP_CB_RECALL_ANY);
+TRACE_DEFINE_ENUM(OP_CB_RECALLABLE_OBJ_AVAIL);
+TRACE_DEFINE_ENUM(OP_CB_RECALL_SLOT);
+TRACE_DEFINE_ENUM(OP_CB_SEQUENCE);
+TRACE_DEFINE_ENUM(OP_CB_WANTS_CANCELLED);
+TRACE_DEFINE_ENUM(OP_CB_NOTIFY_LOCK);
+TRACE_DEFINE_ENUM(OP_CB_NOTIFY_DEVICEID);
+TRACE_DEFINE_ENUM(OP_CB_OFFLOAD);
+TRACE_DEFINE_ENUM(OP_CB_ILLEGAL);
+
+#define show_nfs4_cb_op(x) \
+	__print_symbolic(x, \
+		{ 0,				"CB_NULL" }, \
+		{ 1,				"CB_COMPOUND" }, \
+		{ OP_CB_GETATTR,		"CB_GETATTR" }, \
+		{ OP_CB_RECALL,			"CB_RECALL" }, \
+		{ OP_CB_LAYOUTRECALL,		"CB_LAYOUTRECALL" }, \
+		{ OP_CB_NOTIFY,			"CB_NOTIFY" }, \
+		{ OP_CB_PUSH_DELEG,		"CB_PUSH_DELEG" }, \
+		{ OP_CB_RECALL_ANY,		"CB_RECALL_ANY" }, \
+		{ OP_CB_RECALLABLE_OBJ_AVAIL,	"CB_RECALLABLE_OBJ_AVAIL" }, \
+		{ OP_CB_RECALL_SLOT,		"CB_RECALL_SLOT" }, \
+		{ OP_CB_SEQUENCE,		"CB_SEQUENCE" }, \
+		{ OP_CB_WANTS_CANCELLED,	"CB_WANTS_CANCELLED" }, \
+		{ OP_CB_NOTIFY_LOCK,		"CB_NOTIFY_LOCK" }, \
+		{ OP_CB_NOTIFY_DEVICEID,	"CB_NOTIFY_DEVICEID" }, \
+		{ OP_CB_OFFLOAD,		"CB_OFFLOAD" }, \
+		{ OP_CB_ILLEGAL,		"CB_ILLEGAL" })

From 33ad6a2f38e313cea2bf7bc56ea60215cddfacf8 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 26 Jan 2024 12:46:07 -0500
Subject: [PATCH 289/707] SUNRPC: Remove EXPORT_SYMBOL_GPL for svc_process_bc()

svc_process_bc(), previously known as bc_svc_process(), was
added in commit 4d6bbb6233c9 ("nfs41: Backchannel bc_svc_process()")
but there has never been a call site outside of the sunrpc.ko
module.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/svc.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 9bd8a868c1a70a..121d0739031b5a 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1623,7 +1623,6 @@ void svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp)
 	WARN_ON_ONCE(atomic_read(&task->tk_count) != 1);
 	rpc_put_task(task);
 }
-EXPORT_SYMBOL_GPL(svc_process_bc);
 #endif /* CONFIG_SUNRPC_BACKCHANNEL */
 
 /**

From 2ae8a993e913b7e071ec503c91f9304a38af16d9 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 26 Jan 2024 12:46:13 -0500
Subject: [PATCH 290/707] NFSD: Remove unused @reason argument

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4callback.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index b50ce54aa1bfab..45a31f05159598 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -45,7 +45,7 @@
 
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
 
-static void nfsd4_mark_cb_fault(struct nfs4_client *, int reason);
+static void nfsd4_mark_cb_fault(struct nfs4_client *clp);
 
 #define NFSPROC4_CB_NULL 0
 #define NFSPROC4_CB_COMPOUND 1
@@ -1012,14 +1012,14 @@ static void nfsd4_mark_cb_state(struct nfs4_client *clp, int newstate)
 	}
 }
 
-static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason)
+static void nfsd4_mark_cb_down(struct nfs4_client *clp)
 {
 	if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags))
 		return;
 	nfsd4_mark_cb_state(clp, NFSD4_CB_DOWN);
 }
 
-static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason)
+static void nfsd4_mark_cb_fault(struct nfs4_client *clp)
 {
 	if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags))
 		return;
@@ -1031,7 +1031,7 @@ static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
 	struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null);
 
 	if (task->tk_status)
-		nfsd4_mark_cb_down(clp, task->tk_status);
+		nfsd4_mark_cb_down(clp);
 	else
 		nfsd4_mark_cb_state(clp, NFSD4_CB_UP);
 }
@@ -1183,7 +1183,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
 		break;
 	case -ESERVERFAULT:
 		++session->se_cb_seq_nr;
-		nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status);
+		nfsd4_mark_cb_fault(cb->cb_clp);
 		ret = false;
 		break;
 	case 1:
@@ -1195,7 +1195,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
 		 */
 		fallthrough;
 	case -NFS4ERR_BADSESSION:
-		nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status);
+		nfsd4_mark_cb_fault(cb->cb_clp);
 		ret = false;
 		goto need_restart;
 	case -NFS4ERR_DELAY:
@@ -1214,7 +1214,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
 		}
 		break;
 	default:
-		nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status);
+		nfsd4_mark_cb_fault(cb->cb_clp);
 	}
 	nfsd41_cb_release_slot(cb);
 
@@ -1260,7 +1260,7 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
 		case -EIO:
 		case -ETIMEDOUT:
 		case -EACCES:
-			nfsd4_mark_cb_down(clp, task->tk_status);
+			nfsd4_mark_cb_down(clp);
 		}
 		break;
 	default:
@@ -1382,7 +1382,7 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
 
 	err = setup_callback_client(clp, &conn, ses);
 	if (err) {
-		nfsd4_mark_cb_down(clp, err);
+		nfsd4_mark_cb_down(clp);
 		if (c)
 			svc_xprt_put(c->cn_xprt);
 		return;

From 0275e5a9e5c217115c96dc026e90c5913483f24c Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 26 Jan 2024 12:46:20 -0500
Subject: [PATCH 291/707] NFSD: Replace comment with lockdep assertion

Convert a code comment into a real assertion.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4callback.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 45a31f05159598..d73c66fa131df7 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -1315,12 +1315,13 @@ void nfsd4_shutdown_callback(struct nfs4_client *clp)
 	nfsd41_cb_inflight_wait_complete(clp);
 }
 
-/* requires cl_lock: */
 static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp)
 {
 	struct nfsd4_session *s;
 	struct nfsd4_conn *c;
 
+	lockdep_assert_held(&clp->cl_lock);
+
 	list_for_each_entry(s, &clp->cl_sessions, se_perclnt) {
 		list_for_each_entry(c, &s->se_conns, cn_persession) {
 			if (c->cn_flags & NFS4_CDFC4_BACK)

From d7b0002517bbb67b5e0835c25fa3142d6224c621 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 26 Jan 2024 12:46:26 -0500
Subject: [PATCH 292/707] NFSD: Remove BUG_ON in nfsd4_process_cb_update()

Don't kill the kworker thread, and don't panic while cl_lock is
held. There's no need for scorching the earth here.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4callback.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index d73c66fa131df7..fd6a27e20f65ba 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -1370,8 +1370,9 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
 	 * Only serialized callback code is allowed to clear these
 	 * flags; main nfsd code can only set them:
 	 */
-	BUG_ON(!(clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK));
+	WARN_ON(!(clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK));
 	clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags);
+
 	memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn));
 	c = __nfsd4_find_backchannel(clp);
 	if (c) {

From e36b4d5f581085f88d2ddf6e143b211972e44bbd Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 26 Jan 2024 12:46:32 -0500
Subject: [PATCH 293/707] SUNRPC: Remove stale comments

bc_close() and bc_destroy now do something, so the comments are
no longer correct. Commit 6221f1d9b63f ("SUNRPC: Fix backchannel
RPC soft lockups") should have removed these.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xprtsock.c | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 58f3dc8d0d71c3..d92c13e78a56cf 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2987,20 +2987,11 @@ static int bc_send_request(struct rpc_rqst *req)
 	return len;
 }
 
-/*
- * The close routine. Since this is client initiated, we do nothing
- */
-
 static void bc_close(struct rpc_xprt *xprt)
 {
 	xprt_disconnect_done(xprt);
 }
 
-/*
- * The xprt destroy routine. Again, because this connection is client
- * initiated, we do nothing
- */
-
 static void bc_destroy(struct rpc_xprt *xprt)
 {
 	dprintk("RPC:       bc_destroy xprt %p\n", xprt);

From a670b682f0500a921682917422c947e2b4275df5 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 26 Jan 2024 12:46:38 -0500
Subject: [PATCH 294/707] NFSD: Remove redundant cb_seq_status initialization

As far as I can see, setting cb_seq_status in nfsd4_init_cb() is
superfluous because it is set again in nfsd4_cb_prepare().

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4callback.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index fd6a27e20f65ba..32dd2fbb1f301b 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -1450,7 +1450,6 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
 	cb->cb_msg.rpc_resp = cb;
 	cb->cb_ops = ops;
 	INIT_DELAYED_WORK(&cb->cb_work, nfsd4_run_cb_work);
-	cb->cb_seq_status = 1;
 	cb->cb_status = 0;
 	cb->cb_need_restart = false;
 	cb->cb_holds_slot = false;

From c345694e92d3cf5314d331b68edbcb9d743fdbd6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= <mic@digikod.net>
Date: Thu, 25 Jan 2024 16:32:29 +0100
Subject: [PATCH 295/707] selftests/landlock: Fix capability for net_test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CAP_NET_ADMIN allows to configure network interfaces, not CAP_SYS_ADMIN
which only allows to call unshare(2).  Without this change, running
network tests as a non-root user but with all capabilities would fail at
the setup_loopback() step with "RTNETLINK answers: Operation not
permitted".

The issue is only visible when running tests with non-root users (i.e.
only relying on ambient capabilities).  Indeed, when configuring the
network interface, the "ip" command is called, which may lead to the
special handling of capabilities for the root user by execve(2).  If
root is the caller, then the inherited, permitted and effective
capabilities are all reset, which then includes CAP_NET_ADMIN.  However,
if a non-root user is the caller, then ambient capabilities are masked
by the inherited ones, which were explicitly dropped.

To make execution deterministic whatever users are running the tests,
set the noroot secure bit for each test, and set the inheritable and
ambient capabilities to CAP_NET_ADMIN, the only capability that may be
required after an execve(2).

Factor out _effective_cap() into _change_cap(), and use it to manage
ambient capabilities with the new set_ambient_cap() and
clear_ambient_cap() helpers.

This makes it possible to run all Landlock tests (including net_test)
with uml-run.sh from https://github.com/landlock-lsm/landlock-test-tools

Cc: Konstantin Meskhidze <konstantin.meskhidze@huawei.com>
Fixes: a549d055a22e ("selftests/landlock: Add network tests")
Link: https://lore.kernel.org/r/20240125153230.3817165-2-mic@digikod.net
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---
 tools/testing/selftests/landlock/common.h   | 51 +++++++++++++++++----
 tools/testing/selftests/landlock/net_test.c |  5 +-
 2 files changed, 47 insertions(+), 9 deletions(-)

diff --git a/tools/testing/selftests/landlock/common.h b/tools/testing/selftests/landlock/common.h
index 5b79758cae6275..13597ebd3a64e3 100644
--- a/tools/testing/selftests/landlock/common.h
+++ b/tools/testing/selftests/landlock/common.h
@@ -9,6 +9,7 @@
 
 #include <errno.h>
 #include <linux/landlock.h>
+#include <linux/securebits.h>
 #include <sys/capability.h>
 #include <sys/socket.h>
 #include <sys/syscall.h>
@@ -115,12 +116,20 @@ static void _init_caps(struct __test_metadata *const _metadata, bool drop_all)
 		/* clang-format off */
 		CAP_DAC_OVERRIDE,
 		CAP_MKNOD,
+		CAP_NET_ADMIN,
+		CAP_NET_BIND_SERVICE,
 		CAP_SYS_ADMIN,
 		CAP_SYS_CHROOT,
-		CAP_NET_BIND_SERVICE,
 		/* clang-format on */
 	};
 
+	/*
+	 * As a safety guard, this should be called each time, but it will fail
+	 * the second time when called by the same task (e.g.
+	 * fs_test.c:layout0.unpriv), which is OK.
+	 */
+	cap_set_secbits(SECBIT_NOROOT);
+
 	cap_p = cap_get_proc();
 	EXPECT_NE(NULL, cap_p)
 	{
@@ -137,6 +146,8 @@ static void _init_caps(struct __test_metadata *const _metadata, bool drop_all)
 			TH_LOG("Failed to cap_set_flag: %s", strerror(errno));
 		}
 	}
+
+	/* Automatically resets ambient capabilities. */
 	EXPECT_NE(-1, cap_set_proc(cap_p))
 	{
 		TH_LOG("Failed to cap_set_proc: %s", strerror(errno));
@@ -145,6 +156,9 @@ static void _init_caps(struct __test_metadata *const _metadata, bool drop_all)
 	{
 		TH_LOG("Failed to cap_free: %s", strerror(errno));
 	}
+
+	/* Quickly checks that ambient capabilities are cleared. */
+	EXPECT_NE(-1, cap_get_ambient(caps[0]));
 }
 
 /* We cannot put such helpers in a library because of kselftest_harness.h . */
@@ -158,8 +172,9 @@ static void __maybe_unused drop_caps(struct __test_metadata *const _metadata)
 	_init_caps(_metadata, true);
 }
 
-static void _effective_cap(struct __test_metadata *const _metadata,
-			   const cap_value_t caps, const cap_flag_value_t value)
+static void _change_cap(struct __test_metadata *const _metadata,
+			const cap_flag_t flag, const cap_value_t cap,
+			const cap_flag_value_t value)
 {
 	cap_t cap_p;
 
@@ -168,7 +183,7 @@ static void _effective_cap(struct __test_metadata *const _metadata,
 	{
 		TH_LOG("Failed to cap_get_proc: %s", strerror(errno));
 	}
-	EXPECT_NE(-1, cap_set_flag(cap_p, CAP_EFFECTIVE, 1, &caps, value))
+	EXPECT_NE(-1, cap_set_flag(cap_p, flag, 1, &cap, value))
 	{
 		TH_LOG("Failed to cap_set_flag: %s", strerror(errno));
 	}
@@ -183,15 +198,35 @@ static void _effective_cap(struct __test_metadata *const _metadata,
 }
 
 static void __maybe_unused set_cap(struct __test_metadata *const _metadata,
-				   const cap_value_t caps)
+				   const cap_value_t cap)
 {
-	_effective_cap(_metadata, caps, CAP_SET);
+	_change_cap(_metadata, CAP_EFFECTIVE, cap, CAP_SET);
 }
 
 static void __maybe_unused clear_cap(struct __test_metadata *const _metadata,
-				     const cap_value_t caps)
+				     const cap_value_t cap)
+{
+	_change_cap(_metadata, CAP_EFFECTIVE, cap, CAP_CLEAR);
+}
+
+static void __maybe_unused
+set_ambient_cap(struct __test_metadata *const _metadata, const cap_value_t cap)
+{
+	_change_cap(_metadata, CAP_INHERITABLE, cap, CAP_SET);
+
+	EXPECT_NE(-1, cap_set_ambient(cap, CAP_SET))
+	{
+		TH_LOG("Failed to set ambient capability %d: %s", cap,
+		       strerror(errno));
+	}
+}
+
+static void __maybe_unused clear_ambient_cap(
+	struct __test_metadata *const _metadata, const cap_value_t cap)
 {
-	_effective_cap(_metadata, caps, CAP_CLEAR);
+	EXPECT_EQ(1, cap_get_ambient(cap));
+	_change_cap(_metadata, CAP_INHERITABLE, cap, CAP_CLEAR);
+	EXPECT_EQ(0, cap_get_ambient(cap));
 }
 
 /* Receives an FD from a UNIX socket. Returns the received FD, or -errno. */
diff --git a/tools/testing/selftests/landlock/net_test.c b/tools/testing/selftests/landlock/net_test.c
index efcde123af1f24..936cfc879f1d2c 100644
--- a/tools/testing/selftests/landlock/net_test.c
+++ b/tools/testing/selftests/landlock/net_test.c
@@ -107,8 +107,11 @@ static void setup_loopback(struct __test_metadata *const _metadata)
 {
 	set_cap(_metadata, CAP_SYS_ADMIN);
 	ASSERT_EQ(0, unshare(CLONE_NEWNET));
-	ASSERT_EQ(0, system("ip link set dev lo up"));
 	clear_cap(_metadata, CAP_SYS_ADMIN);
+
+	set_ambient_cap(_metadata, CAP_NET_ADMIN);
+	ASSERT_EQ(0, system("ip link set dev lo up"));
+	clear_ambient_cap(_metadata, CAP_NET_ADMIN);
 }
 
 static bool is_restricted(const struct protocol_variant *const prot,

From 8d3a9e03af2f66277abdc38ab28969500585d620 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= <mic@digikod.net>
Date: Thu, 25 Jan 2024 16:32:30 +0100
Subject: [PATCH 296/707] selftests/landlock: Clean up error logs related to
 capabilities
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It doesn't help to call TH_LOG() for every cap_*() error. Let's only
log errors returned by the kernel, not by libcap specificities.

Link: https://lore.kernel.org/r/20240125153230.3817165-3-mic@digikod.net
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---
 tools/testing/selftests/landlock/common.h | 39 ++++++-----------------
 1 file changed, 9 insertions(+), 30 deletions(-)

diff --git a/tools/testing/selftests/landlock/common.h b/tools/testing/selftests/landlock/common.h
index 13597ebd3a64e3..36fca11958b25a 100644
--- a/tools/testing/selftests/landlock/common.h
+++ b/tools/testing/selftests/landlock/common.h
@@ -131,31 +131,19 @@ static void _init_caps(struct __test_metadata *const _metadata, bool drop_all)
 	cap_set_secbits(SECBIT_NOROOT);
 
 	cap_p = cap_get_proc();
-	EXPECT_NE(NULL, cap_p)
-	{
-		TH_LOG("Failed to cap_get_proc: %s", strerror(errno));
-	}
-	EXPECT_NE(-1, cap_clear(cap_p))
-	{
-		TH_LOG("Failed to cap_clear: %s", strerror(errno));
-	}
+	EXPECT_NE(NULL, cap_p);
+	EXPECT_NE(-1, cap_clear(cap_p));
 	if (!drop_all) {
 		EXPECT_NE(-1, cap_set_flag(cap_p, CAP_PERMITTED,
-					   ARRAY_SIZE(caps), caps, CAP_SET))
-		{
-			TH_LOG("Failed to cap_set_flag: %s", strerror(errno));
-		}
+					   ARRAY_SIZE(caps), caps, CAP_SET));
 	}
 
 	/* Automatically resets ambient capabilities. */
 	EXPECT_NE(-1, cap_set_proc(cap_p))
 	{
-		TH_LOG("Failed to cap_set_proc: %s", strerror(errno));
-	}
-	EXPECT_NE(-1, cap_free(cap_p))
-	{
-		TH_LOG("Failed to cap_free: %s", strerror(errno));
+		TH_LOG("Failed to set capabilities: %s", strerror(errno));
 	}
+	EXPECT_NE(-1, cap_free(cap_p));
 
 	/* Quickly checks that ambient capabilities are cleared. */
 	EXPECT_NE(-1, cap_get_ambient(caps[0]));
@@ -179,22 +167,13 @@ static void _change_cap(struct __test_metadata *const _metadata,
 	cap_t cap_p;
 
 	cap_p = cap_get_proc();
-	EXPECT_NE(NULL, cap_p)
-	{
-		TH_LOG("Failed to cap_get_proc: %s", strerror(errno));
-	}
-	EXPECT_NE(-1, cap_set_flag(cap_p, flag, 1, &cap, value))
-	{
-		TH_LOG("Failed to cap_set_flag: %s", strerror(errno));
-	}
+	EXPECT_NE(NULL, cap_p);
+	EXPECT_NE(-1, cap_set_flag(cap_p, flag, 1, &cap, value));
 	EXPECT_NE(-1, cap_set_proc(cap_p))
 	{
-		TH_LOG("Failed to cap_set_proc: %s", strerror(errno));
-	}
-	EXPECT_NE(-1, cap_free(cap_p))
-	{
-		TH_LOG("Failed to cap_free: %s", strerror(errno));
+		TH_LOG("Failed to set capability %d: %s", cap, strerror(errno));
 	}
+	EXPECT_NE(-1, cap_free(cap_p));
 }
 
 static void __maybe_unused set_cap(struct __test_metadata *const _metadata,

From dfb1a0fcab9f7d294434bd67d98c0f9fb72f2ac6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= <mic@digikod.net>
Date: Thu, 18 Jan 2024 12:36:32 +0100
Subject: [PATCH 297/707] landlock: Add support for KUnit tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add the SECURITY_LANDLOCK_KUNIT_TEST option to enable KUnit tests for
Landlock.  The minimal required configuration is listed in the
security/landlock/.kunitconfig file.

Add an initial landlock_fs KUnit test suite with 7 test cases for
filesystem helpers.  These are related to the LANDLOCK_ACCESS_FS_REFER
right.

There is one KUnit test case per:
* mutated state (e.g. test_scope_to_request_*) or,
* shared state between tests (e.g. test_is_eaccess_*).

Add macros to improve readability of tests (i.e. one per line).  Test
cases are collocated with the tested functions to help maintenance and
improve documentation.  This is why SECURITY_LANDLOCK_KUNIT_TEST cannot
be set as module.

This is a nice complement to Landlock's user space kselftests.  We
expect new Landlock features to come with KUnit tests as well.

Thanks to UML support, we can run all KUnit tests for Landlock with:
./tools/testing/kunit/kunit.py run --kunitconfig security/landlock

[00:00:00] ======================= landlock_fs  =======================
[00:00:00] [PASSED] test_no_more_access
[00:00:00] [PASSED] test_scope_to_request_with_exec_none
[00:00:00] [PASSED] test_scope_to_request_with_exec_some
[00:00:00] [PASSED] test_scope_to_request_without_access
[00:00:00] [PASSED] test_is_eacces_with_none
[00:00:00] [PASSED] test_is_eacces_with_refer
[00:00:00] [PASSED] test_is_eacces_with_write
[00:00:00] =================== [PASSED] landlock_fs ===================
[00:00:00] ============================================================
[00:00:00] Testing complete. Ran 7 tests: passed: 7

Cc: Konstantin Meskhidze <konstantin.meskhidze@huawei.com>
Reviewed-by: Günther Noack <gnoack@google.com>
Link: https://lore.kernel.org/r/20240118113632.1948478-1-mic@digikod.net
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---
 security/landlock/.kunitconfig               |   4 +
 security/landlock/Kconfig                    |  15 ++
 security/landlock/common.h                   |   2 +
 security/landlock/fs.c                       | 234 +++++++++++++++++++
 tools/testing/kunit/configs/all_tests.config |   1 +
 5 files changed, 256 insertions(+)
 create mode 100644 security/landlock/.kunitconfig

diff --git a/security/landlock/.kunitconfig b/security/landlock/.kunitconfig
new file mode 100644
index 00000000000000..03e11946660429
--- /dev/null
+++ b/security/landlock/.kunitconfig
@@ -0,0 +1,4 @@
+CONFIG_KUNIT=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_LANDLOCK=y
+CONFIG_SECURITY_LANDLOCK_KUNIT_TEST=y
diff --git a/security/landlock/Kconfig b/security/landlock/Kconfig
index c4bf0d5eff39f3..3f1493402052ec 100644
--- a/security/landlock/Kconfig
+++ b/security/landlock/Kconfig
@@ -20,3 +20,18 @@ config SECURITY_LANDLOCK
 	  If you are unsure how to answer this question, answer N.  Otherwise,
 	  you should also prepend "landlock," to the content of CONFIG_LSM to
 	  enable Landlock at boot time.
+
+config SECURITY_LANDLOCK_KUNIT_TEST
+	bool "KUnit tests for Landlock" if !KUNIT_ALL_TESTS
+	depends on KUNIT=y
+	depends on SECURITY_LANDLOCK
+	default KUNIT_ALL_TESTS
+	help
+	  Build KUnit tests for Landlock.
+
+	  See the KUnit documentation in Documentation/dev-tools/kunit
+
+	  Run all KUnit tests for Landlock with:
+	  ./tools/testing/kunit/kunit.py run --kunitconfig security/landlock
+
+	  If you are unsure how to answer this question, answer N.
diff --git a/security/landlock/common.h b/security/landlock/common.h
index 5dc0fe15707d6b..0eb1d34c2eaefe 100644
--- a/security/landlock/common.h
+++ b/security/landlock/common.h
@@ -17,4 +17,6 @@
 
 #define pr_fmt(fmt) LANDLOCK_NAME ": " fmt
 
+#define BIT_INDEX(bit) HWEIGHT(bit - 1)
+
 #endif /* _SECURITY_LANDLOCK_COMMON_H */
diff --git a/security/landlock/fs.c b/security/landlock/fs.c
index fc520a06f9af10..73997e63734f93 100644
--- a/security/landlock/fs.c
+++ b/security/landlock/fs.c
@@ -7,6 +7,7 @@
  * Copyright © 2021-2022 Microsoft Corporation
  */
 
+#include <kunit/test.h>
 #include <linux/atomic.h>
 #include <linux/bitops.h>
 #include <linux/bits.h>
@@ -311,6 +312,119 @@ static bool no_more_access(
 	return true;
 }
 
+#define NMA_TRUE(...) KUNIT_EXPECT_TRUE(test, no_more_access(__VA_ARGS__))
+#define NMA_FALSE(...) KUNIT_EXPECT_FALSE(test, no_more_access(__VA_ARGS__))
+
+#ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST
+
+static void test_no_more_access(struct kunit *const test)
+{
+	const layer_mask_t rx0[LANDLOCK_NUM_ACCESS_FS] = {
+		[BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0),
+		[BIT_INDEX(LANDLOCK_ACCESS_FS_READ_FILE)] = BIT_ULL(0),
+	};
+	const layer_mask_t mx0[LANDLOCK_NUM_ACCESS_FS] = {
+		[BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0),
+		[BIT_INDEX(LANDLOCK_ACCESS_FS_MAKE_REG)] = BIT_ULL(0),
+	};
+	const layer_mask_t x0[LANDLOCK_NUM_ACCESS_FS] = {
+		[BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0),
+	};
+	const layer_mask_t x1[LANDLOCK_NUM_ACCESS_FS] = {
+		[BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(1),
+	};
+	const layer_mask_t x01[LANDLOCK_NUM_ACCESS_FS] = {
+		[BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0) |
+							  BIT_ULL(1),
+	};
+	const layer_mask_t allows_all[LANDLOCK_NUM_ACCESS_FS] = {};
+
+	/* Checks without restriction. */
+	NMA_TRUE(&x0, &allows_all, false, &allows_all, NULL, false);
+	NMA_TRUE(&allows_all, &x0, false, &allows_all, NULL, false);
+	NMA_FALSE(&x0, &x0, false, &allows_all, NULL, false);
+
+	/*
+	 * Checks that we can only refer a file if no more access could be
+	 * inherited.
+	 */
+	NMA_TRUE(&x0, &x0, false, &rx0, NULL, false);
+	NMA_TRUE(&rx0, &rx0, false, &rx0, NULL, false);
+	NMA_FALSE(&rx0, &rx0, false, &x0, NULL, false);
+	NMA_FALSE(&rx0, &rx0, false, &x1, NULL, false);
+
+	/* Checks allowed referring with different nested domains. */
+	NMA_TRUE(&x0, &x1, false, &x0, NULL, false);
+	NMA_TRUE(&x1, &x0, false, &x0, NULL, false);
+	NMA_TRUE(&x0, &x01, false, &x0, NULL, false);
+	NMA_TRUE(&x0, &x01, false, &rx0, NULL, false);
+	NMA_TRUE(&x01, &x0, false, &x0, NULL, false);
+	NMA_TRUE(&x01, &x0, false, &rx0, NULL, false);
+	NMA_FALSE(&x01, &x01, false, &x0, NULL, false);
+
+	/* Checks that file access rights are also enforced for a directory. */
+	NMA_FALSE(&rx0, &rx0, true, &x0, NULL, false);
+
+	/* Checks that directory access rights don't impact file referring... */
+	NMA_TRUE(&mx0, &mx0, false, &x0, NULL, false);
+	/* ...but only directory referring. */
+	NMA_FALSE(&mx0, &mx0, true, &x0, NULL, false);
+
+	/* Checks directory exchange. */
+	NMA_TRUE(&mx0, &mx0, true, &mx0, &mx0, true);
+	NMA_TRUE(&mx0, &mx0, true, &mx0, &x0, true);
+	NMA_FALSE(&mx0, &mx0, true, &x0, &mx0, true);
+	NMA_FALSE(&mx0, &mx0, true, &x0, &x0, true);
+	NMA_FALSE(&mx0, &mx0, true, &x1, &x1, true);
+
+	/* Checks file exchange with directory access rights... */
+	NMA_TRUE(&mx0, &mx0, false, &mx0, &mx0, false);
+	NMA_TRUE(&mx0, &mx0, false, &mx0, &x0, false);
+	NMA_TRUE(&mx0, &mx0, false, &x0, &mx0, false);
+	NMA_TRUE(&mx0, &mx0, false, &x0, &x0, false);
+	/* ...and with file access rights. */
+	NMA_TRUE(&rx0, &rx0, false, &rx0, &rx0, false);
+	NMA_TRUE(&rx0, &rx0, false, &rx0, &x0, false);
+	NMA_FALSE(&rx0, &rx0, false, &x0, &rx0, false);
+	NMA_FALSE(&rx0, &rx0, false, &x0, &x0, false);
+	NMA_FALSE(&rx0, &rx0, false, &x1, &x1, false);
+
+	/*
+	 * Allowing the following requests should not be a security risk
+	 * because domain 0 denies execute access, and domain 1 is always
+	 * nested with domain 0.  However, adding an exception for this case
+	 * would mean to check all nested domains to make sure none can get
+	 * more privileges (e.g. processes only sandboxed by domain 0).
+	 * Moreover, this behavior (i.e. composition of N domains) could then
+	 * be inconsistent compared to domain 1's ruleset alone (e.g. it might
+	 * be denied to link/rename with domain 1's ruleset, whereas it would
+	 * be allowed if nested on top of domain 0).  Another drawback would be
+	 * to create a cover channel that could enable sandboxed processes to
+	 * infer most of the filesystem restrictions from their domain.  To
+	 * make it simple, efficient, safe, and more consistent, this case is
+	 * always denied.
+	 */
+	NMA_FALSE(&x1, &x1, false, &x0, NULL, false);
+	NMA_FALSE(&x1, &x1, false, &rx0, NULL, false);
+	NMA_FALSE(&x1, &x1, true, &x0, NULL, false);
+	NMA_FALSE(&x1, &x1, true, &rx0, NULL, false);
+
+	/* Checks the same case of exclusive domains with a file... */
+	NMA_TRUE(&x1, &x1, false, &x01, NULL, false);
+	NMA_FALSE(&x1, &x1, false, &x01, &x0, false);
+	NMA_FALSE(&x1, &x1, false, &x01, &x01, false);
+	NMA_FALSE(&x1, &x1, false, &x0, &x0, false);
+	/* ...and with a directory. */
+	NMA_FALSE(&x1, &x1, false, &x0, &x0, true);
+	NMA_FALSE(&x1, &x1, true, &x0, &x0, false);
+	NMA_FALSE(&x1, &x1, true, &x0, &x0, true);
+}
+
+#endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */
+
+#undef NMA_TRUE
+#undef NMA_FALSE
+
 /*
  * Removes @layer_masks accesses that are not requested.
  *
@@ -331,6 +445,57 @@ scope_to_request(const access_mask_t access_request,
 	return !memchr_inv(layer_masks, 0, sizeof(*layer_masks));
 }
 
+#ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST
+
+static void test_scope_to_request_with_exec_none(struct kunit *const test)
+{
+	/* Allows everything. */
+	layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {};
+
+	/* Checks and scopes with execute. */
+	KUNIT_EXPECT_TRUE(test, scope_to_request(LANDLOCK_ACCESS_FS_EXECUTE,
+						 &layer_masks));
+	KUNIT_EXPECT_EQ(test, 0,
+			layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)]);
+	KUNIT_EXPECT_EQ(test, 0,
+			layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)]);
+}
+
+static void test_scope_to_request_with_exec_some(struct kunit *const test)
+{
+	/* Denies execute and write. */
+	layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {
+		[BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0),
+		[BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)] = BIT_ULL(1),
+	};
+
+	/* Checks and scopes with execute. */
+	KUNIT_EXPECT_FALSE(test, scope_to_request(LANDLOCK_ACCESS_FS_EXECUTE,
+						  &layer_masks));
+	KUNIT_EXPECT_EQ(test, BIT_ULL(0),
+			layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)]);
+	KUNIT_EXPECT_EQ(test, 0,
+			layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)]);
+}
+
+static void test_scope_to_request_without_access(struct kunit *const test)
+{
+	/* Denies execute and write. */
+	layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {
+		[BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0),
+		[BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)] = BIT_ULL(1),
+	};
+
+	/* Checks and scopes without access request. */
+	KUNIT_EXPECT_TRUE(test, scope_to_request(0, &layer_masks));
+	KUNIT_EXPECT_EQ(test, 0,
+			layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)]);
+	KUNIT_EXPECT_EQ(test, 0,
+			layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)]);
+}
+
+#endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */
+
 /*
  * Returns true if there is at least one access right different than
  * LANDLOCK_ACCESS_FS_REFER.
@@ -354,6 +519,51 @@ is_eacces(const layer_mask_t (*const layer_masks)[LANDLOCK_NUM_ACCESS_FS],
 	return false;
 }
 
+#define IE_TRUE(...) KUNIT_EXPECT_TRUE(test, is_eacces(__VA_ARGS__))
+#define IE_FALSE(...) KUNIT_EXPECT_FALSE(test, is_eacces(__VA_ARGS__))
+
+#ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST
+
+static void test_is_eacces_with_none(struct kunit *const test)
+{
+	const layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {};
+
+	IE_FALSE(&layer_masks, 0);
+	IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_REFER);
+	IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_EXECUTE);
+	IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_WRITE_FILE);
+}
+
+static void test_is_eacces_with_refer(struct kunit *const test)
+{
+	const layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {
+		[BIT_INDEX(LANDLOCK_ACCESS_FS_REFER)] = BIT_ULL(0),
+	};
+
+	IE_FALSE(&layer_masks, 0);
+	IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_REFER);
+	IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_EXECUTE);
+	IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_WRITE_FILE);
+}
+
+static void test_is_eacces_with_write(struct kunit *const test)
+{
+	const layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {
+		[BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)] = BIT_ULL(0),
+	};
+
+	IE_FALSE(&layer_masks, 0);
+	IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_REFER);
+	IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_EXECUTE);
+
+	IE_TRUE(&layer_masks, LANDLOCK_ACCESS_FS_WRITE_FILE);
+}
+
+#endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */
+
+#undef IE_TRUE
+#undef IE_FALSE
+
 /**
  * is_access_to_paths_allowed - Check accesses for requests with a common path
  *
@@ -1225,3 +1435,27 @@ __init void landlock_add_fs_hooks(void)
 	security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks),
 			   &landlock_lsmid);
 }
+
+#ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST
+
+/* clang-format off */
+static struct kunit_case test_cases[] = {
+	KUNIT_CASE(test_no_more_access),
+	KUNIT_CASE(test_scope_to_request_with_exec_none),
+	KUNIT_CASE(test_scope_to_request_with_exec_some),
+	KUNIT_CASE(test_scope_to_request_without_access),
+	KUNIT_CASE(test_is_eacces_with_none),
+	KUNIT_CASE(test_is_eacces_with_refer),
+	KUNIT_CASE(test_is_eacces_with_write),
+	{}
+};
+/* clang-format on */
+
+static struct kunit_suite test_suite = {
+	.name = "landlock_fs",
+	.test_cases = test_cases,
+};
+
+kunit_test_suite(test_suite);
+
+#endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */
diff --git a/tools/testing/kunit/configs/all_tests.config b/tools/testing/kunit/configs/all_tests.config
index 3bf506d4a63ccd..1b8f1abfedf041 100644
--- a/tools/testing/kunit/configs/all_tests.config
+++ b/tools/testing/kunit/configs/all_tests.config
@@ -37,6 +37,7 @@ CONFIG_REGMAP_BUILD=y
 
 CONFIG_SECURITY=y
 CONFIG_SECURITY_APPARMOR=y
+CONFIG_SECURITY_LANDLOCK=y
 
 CONFIG_SOUND=y
 CONFIG_SND=y

From 66ba1133d970a4e4a86e7fbba72c4a71aa233707 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=BCnther=20Noack?= <gnoack@google.com>
Date: Fri, 8 Dec 2023 16:51:16 +0100
Subject: [PATCH 298/707] landlock: Add IOCTL access right
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduces the LANDLOCK_ACCESS_FS_IOCTL access right
and increments the Landlock ABI version to 5.

Like the truncate right, these rights are associated with a file
descriptor at the time of open(2), and get respected even when the
file descriptor is used outside of the thread which it was originally
opened in.

A newly enabled Landlock policy therefore does not apply to file
descriptors which are already open.

If the LANDLOCK_ACCESS_FS_IOCTL right is handled, only a small number
of safe IOCTL commands will be permitted on newly opened files.  The
permitted IOCTLs can be configured through the ruleset in limited ways
now.  (See documentation for details.)

Specifically, when LANDLOCK_ACCESS_FS_IOCTL is handled, granting this
right on a file or directory will *not* permit to do all IOCTL
commands, but only influence the IOCTL commands which are not already
handled through other access rights.  The intent is to keep the groups
of IOCTL commands more fine-grained.

Noteworthy scenarios which require special attention:

TTY devices support IOCTLs like TIOCSTI and TIOCLINUX, which can be
used to control shell processes on the same terminal which run at
different privilege levels, which may make it possible to escape a
sandbox.  Because stdin, stdout and stderr are normally inherited
rather than newly opened, IOCTLs are usually permitted on them even
after the Landlock policy is enforced.

Some legitimate file system features, like setting up fscrypt, are
exposed as IOCTL commands on regular files and directories -- users of
Landlock are advised to double check that the sandboxed process does
not need to invoke these IOCTLs.

Known limitations:

The LANDLOCK_ACCESS_FS_IOCTL access right is a coarse-grained control
over IOCTL commands.  Future work will enable a more fine-grained
access control for IOCTLs.

In the meantime, Landlock users may use path-based restrictions in
combination with their knowledge about the file system layout to
control what IOCTLs can be done.  Mounting file systems with the nodev
option can help to distinguish regular files and devices, and give
guarantees about the affected files, which Landlock alone can not give
yet.

Signed-off-by: Günther Noack <gnoack@google.com>
Link: https://lore.kernel.org/r/20231208155121.1943775-5-gnoack@google.com
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---
 include/uapi/linux/landlock.h                |  58 +++++-
 security/landlock/fs.c                       | 176 ++++++++++++++++++-
 security/landlock/fs.h                       |   2 +
 security/landlock/limits.h                   |  11 +-
 security/landlock/ruleset.h                  |   2 +-
 security/landlock/syscalls.c                 |  19 +-
 tools/testing/selftests/landlock/base_test.c |   2 +-
 tools/testing/selftests/landlock/fs_test.c   |   5 +-
 8 files changed, 253 insertions(+), 22 deletions(-)

diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h
index 25c8d76775393a..578f268b084b70 100644
--- a/include/uapi/linux/landlock.h
+++ b/include/uapi/linux/landlock.h
@@ -128,7 +128,7 @@ struct landlock_net_port_attr {
  * files and directories.  Files or directories opened before the sandboxing
  * are not subject to these restrictions.
  *
- * A file can only receive these access rights:
+ * The following access rights apply only to files:
  *
  * - %LANDLOCK_ACCESS_FS_EXECUTE: Execute a file.
  * - %LANDLOCK_ACCESS_FS_WRITE_FILE: Open a file with write access. Note that
@@ -138,12 +138,13 @@ struct landlock_net_port_attr {
  * - %LANDLOCK_ACCESS_FS_READ_FILE: Open a file with read access.
  * - %LANDLOCK_ACCESS_FS_TRUNCATE: Truncate a file with :manpage:`truncate(2)`,
  *   :manpage:`ftruncate(2)`, :manpage:`creat(2)`, or :manpage:`open(2)` with
- *   ``O_TRUNC``. Whether an opened file can be truncated with
- *   :manpage:`ftruncate(2)` is determined during :manpage:`open(2)`, in the
- *   same way as read and write permissions are checked during
- *   :manpage:`open(2)` using %LANDLOCK_ACCESS_FS_READ_FILE and
- *   %LANDLOCK_ACCESS_FS_WRITE_FILE. This access right is available since the
- *   third version of the Landlock ABI.
+ *   ``O_TRUNC``.  This access right is available since the third version of the
+ *   Landlock ABI.
+ *
+ * Whether an opened file can be truncated with :manpage:`ftruncate(2)` or used
+ * with `ioctl(2)` is determined during :manpage:`open(2)`, in the same way as
+ * read and write permissions are checked during :manpage:`open(2)` using
+ * %LANDLOCK_ACCESS_FS_READ_FILE and %LANDLOCK_ACCESS_FS_WRITE_FILE.
  *
  * A directory can receive access rights related to files or directories.  The
  * following access right is applied to the directory itself, and the
@@ -198,13 +199,53 @@ struct landlock_net_port_attr {
  *   If multiple requirements are not met, the ``EACCES`` error code takes
  *   precedence over ``EXDEV``.
  *
+ * The following access right applies both to files and directories:
+ *
+ * - %LANDLOCK_ACCESS_FS_IOCTL: Invoke :manpage:`ioctl(2)` commands on an opened
+ *   file or directory.
+ *
+ *   This access right applies to all :manpage:`ioctl(2)` commands, except of
+ *   ``FIOCLEX``, ``FIONCLEX``, ``FIONBIO`` and ``FIOASYNC``.  These commands
+ *   continue to be invokable independent of the %LANDLOCK_ACCESS_FS_IOCTL
+ *   access right.
+ *
+ *   When certain other access rights are handled in the ruleset, in addition to
+ *   %LANDLOCK_ACCESS_FS_IOCTL, granting these access rights will unlock access
+ *   to additional groups of IOCTL commands, on the affected files:
+ *
+ *   * %LANDLOCK_ACCESS_FS_READ_FILE unlocks access to ``FIOQSIZE``,
+ *     ``FS_IOC_FIEMAP``, ``FIBMAP``, ``FIGETBSZ``, ``FIONREAD``,
+ *     ``FIDEDUPRANGE``.
+ *
+ *   * %LANDLOCK_ACCESS_FS_WRITE_FILE unlocks access to ``FIOQSIZE``,
+ *     ``FS_IOC_FIEMAP``, ``FIBMAP``, ``FIGETBSZ``, ``FICLONE``,
+ *     ``FICLONERANGE``, ``FS_IOC_RESVSP``, ``FS_IOC_RESVSP64``,
+ *     ``FS_IOC_UNRESVSP``, ``FS_IOC_UNRESVSP64``, ``FS_IOC_ZERO_RANGE``.
+ *
+ *   * %LANDLOCK_ACCESS_FS_READ_DIR unlocks access to ``FIOQSIZE``,
+ *     ``FS_IOC_FIEMAP``, ``FIBMAP``, ``FIGETBSZ``.
+ *
+ *   When these access rights are handled in the ruleset, the availability of
+ *   the affected IOCTL commands is not governed by %LANDLOCK_ACCESS_FS_IOCTL
+ *   any more, but by the respective access right.
+ *
+ *   All other IOCTL commands are not handled specially, and are governed by
+ *   %LANDLOCK_ACCESS_FS_IOCTL.  This includes %FS_IOC_GETFLAGS and
+ *   %FS_IOC_SETFLAGS for manipulating inode flags (:manpage:`ioctl_iflags(2)`),
+ *   %FS_IOC_FSFETXATTR and %FS_IOC_FSSETXATTR for manipulating extended
+ *   attributes, as well as %FIFREEZE and %FITHAW for freezing and thawing file
+ *   systems.
+ *
+ *   This access right is available since the fifth version of the Landlock
+ *   ABI.
+ *
  * .. warning::
  *
  *   It is currently not possible to restrict some file-related actions
  *   accessible through these syscall families: :manpage:`chdir(2)`,
  *   :manpage:`stat(2)`, :manpage:`flock(2)`, :manpage:`chmod(2)`,
  *   :manpage:`chown(2)`, :manpage:`setxattr(2)`, :manpage:`utime(2)`,
- *   :manpage:`ioctl(2)`, :manpage:`fcntl(2)`, :manpage:`access(2)`.
+ *   :manpage:`fcntl(2)`, :manpage:`access(2)`.
  *   Future Landlock evolutions will enable to restrict them.
  */
 /* clang-format off */
@@ -223,6 +264,7 @@ struct landlock_net_port_attr {
 #define LANDLOCK_ACCESS_FS_MAKE_SYM			(1ULL << 12)
 #define LANDLOCK_ACCESS_FS_REFER			(1ULL << 13)
 #define LANDLOCK_ACCESS_FS_TRUNCATE			(1ULL << 14)
+#define LANDLOCK_ACCESS_FS_IOCTL			(1ULL << 15)
 /* clang-format on */
 
 /**
diff --git a/security/landlock/fs.c b/security/landlock/fs.c
index 73997e63734f93..90f7f6db1e87dc 100644
--- a/security/landlock/fs.c
+++ b/security/landlock/fs.c
@@ -7,6 +7,7 @@
  * Copyright © 2021-2022 Microsoft Corporation
  */
 
+#include <asm/ioctls.h>
 #include <kunit/test.h>
 #include <linux/atomic.h>
 #include <linux/bitops.h>
@@ -14,6 +15,7 @@
 #include <linux/compiler_types.h>
 #include <linux/dcache.h>
 #include <linux/err.h>
+#include <linux/falloc.h>
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -29,6 +31,7 @@
 #include <linux/types.h>
 #include <linux/wait_bit.h>
 #include <linux/workqueue.h>
+#include <uapi/linux/fiemap.h>
 #include <uapi/linux/landlock.h>
 
 #include "common.h"
@@ -84,6 +87,145 @@ static const struct landlock_object_underops landlock_fs_underops = {
 	.release = release_inode
 };
 
+/* IOCTL helpers */
+
+/*
+ * These are synthetic access rights, which are only used within the kernel, but
+ * not exposed to callers in userspace.  The mapping between these access rights
+ * and IOCTL commands is defined in the required_ioctl_access() helper function.
+ */
+#define LANDLOCK_ACCESS_FS_IOCTL_GROUP1 (LANDLOCK_LAST_PUBLIC_ACCESS_FS << 1)
+#define LANDLOCK_ACCESS_FS_IOCTL_GROUP2 (LANDLOCK_LAST_PUBLIC_ACCESS_FS << 2)
+#define LANDLOCK_ACCESS_FS_IOCTL_GROUP3 (LANDLOCK_LAST_PUBLIC_ACCESS_FS << 3)
+#define LANDLOCK_ACCESS_FS_IOCTL_GROUP4 (LANDLOCK_LAST_PUBLIC_ACCESS_FS << 4)
+
+/* ioctl_groups - all synthetic access rights for IOCTL command groups */
+/* clang-format off */
+#define IOCTL_GROUPS (			  \
+	LANDLOCK_ACCESS_FS_IOCTL_GROUP1 | \
+	LANDLOCK_ACCESS_FS_IOCTL_GROUP2 | \
+	LANDLOCK_ACCESS_FS_IOCTL_GROUP3 | \
+	LANDLOCK_ACCESS_FS_IOCTL_GROUP4)
+/* clang-format on */
+
+static_assert((IOCTL_GROUPS & LANDLOCK_MASK_ACCESS_FS) == IOCTL_GROUPS);
+
+/**
+ * required_ioctl_access(): Determine required IOCTL access rights.
+ *
+ * @cmd: The IOCTL command that is supposed to be run.
+ *
+ * Returns: The access rights that must be granted on an opened file in order to
+ * use the given @cmd.
+ */
+static access_mask_t required_ioctl_access(unsigned int cmd)
+{
+	switch (cmd) {
+	case FIOCLEX:
+	case FIONCLEX:
+	case FIONBIO:
+	case FIOASYNC:
+		/*
+		 * FIOCLEX, FIONCLEX, FIONBIO and FIOASYNC manipulate the FD's
+		 * close-on-exec and the file's buffered-IO and async flags.
+		 * These operations are also available through fcntl(2),
+		 * and are unconditionally permitted in Landlock.
+		 */
+		return 0;
+	case FIOQSIZE:
+		return LANDLOCK_ACCESS_FS_IOCTL_GROUP1;
+	case FS_IOC_FIEMAP:
+	case FIBMAP:
+	case FIGETBSZ:
+		return LANDLOCK_ACCESS_FS_IOCTL_GROUP2;
+	case FIONREAD:
+	case FIDEDUPERANGE:
+		return LANDLOCK_ACCESS_FS_IOCTL_GROUP3;
+	case FICLONE:
+	case FICLONERANGE:
+	case FS_IOC_RESVSP:
+	case FS_IOC_RESVSP64:
+	case FS_IOC_UNRESVSP:
+	case FS_IOC_UNRESVSP64:
+	case FS_IOC_ZERO_RANGE:
+		return LANDLOCK_ACCESS_FS_IOCTL_GROUP4;
+	default:
+		/*
+		 * Other commands are guarded by the catch-all access right.
+		 */
+		return LANDLOCK_ACCESS_FS_IOCTL;
+	}
+}
+
+/**
+ * expand_ioctl() - Return the dst flags from either the src flag or the
+ * %LANDLOCK_ACCESS_FS_IOCTL flag, depending on whether the
+ * %LANDLOCK_ACCESS_FS_IOCTL and src access rights are handled or not.
+ *
+ * @handled: Handled access rights.
+ * @access: The access mask to copy values from.
+ * @src: A single access right to copy from in @access.
+ * @dst: One or more access rights to copy to.
+ *
+ * Returns: @dst, or 0.
+ */
+static access_mask_t expand_ioctl(const access_mask_t handled,
+				  const access_mask_t access,
+				  const access_mask_t src,
+				  const access_mask_t dst)
+{
+	access_mask_t copy_from;
+
+	if (!(handled & LANDLOCK_ACCESS_FS_IOCTL))
+		return 0;
+
+	copy_from = (handled & src) ? src : LANDLOCK_ACCESS_FS_IOCTL;
+	if (access & copy_from)
+		return dst;
+
+	return 0;
+}
+
+/**
+ * landlock_expand_access_fs() - Returns @access with the synthetic IOCTL group
+ * flags enabled if necessary.
+ *
+ * @handled: Handled FS access rights.
+ * @access: FS access rights to expand.
+ *
+ * Returns: @access expanded by the necessary flags for the synthetic IOCTL
+ * access rights.
+ */
+static access_mask_t landlock_expand_access_fs(const access_mask_t handled,
+					       const access_mask_t access)
+{
+	return access |
+	       expand_ioctl(handled, access, LANDLOCK_ACCESS_FS_WRITE_FILE,
+			    LANDLOCK_ACCESS_FS_IOCTL_GROUP1 |
+				    LANDLOCK_ACCESS_FS_IOCTL_GROUP2 |
+				    LANDLOCK_ACCESS_FS_IOCTL_GROUP4) |
+	       expand_ioctl(handled, access, LANDLOCK_ACCESS_FS_READ_FILE,
+			    LANDLOCK_ACCESS_FS_IOCTL_GROUP1 |
+				    LANDLOCK_ACCESS_FS_IOCTL_GROUP2 |
+				    LANDLOCK_ACCESS_FS_IOCTL_GROUP3) |
+	       expand_ioctl(handled, access, LANDLOCK_ACCESS_FS_READ_DIR,
+			    LANDLOCK_ACCESS_FS_IOCTL_GROUP1);
+}
+
+/**
+ * landlock_expand_handled_access_fs() - add synthetic IOCTL access rights to an
+ * access mask of handled accesses.
+ *
+ * @handled: The handled accesses of a ruleset that is being created.
+ *
+ * Returns: @handled, with the bits for the synthetic IOCTL access rights set,
+ * if %LANDLOCK_ACCESS_FS_IOCTL is handled.
+ */
+access_mask_t landlock_expand_handled_access_fs(const access_mask_t handled)
+{
+	return landlock_expand_access_fs(handled, handled);
+}
+
 /* Ruleset management */
 
 static struct landlock_object *get_inode_object(struct inode *const inode)
@@ -148,7 +290,8 @@ static struct landlock_object *get_inode_object(struct inode *const inode)
 	LANDLOCK_ACCESS_FS_EXECUTE | \
 	LANDLOCK_ACCESS_FS_WRITE_FILE | \
 	LANDLOCK_ACCESS_FS_READ_FILE | \
-	LANDLOCK_ACCESS_FS_TRUNCATE)
+	LANDLOCK_ACCESS_FS_TRUNCATE | \
+	LANDLOCK_ACCESS_FS_IOCTL)
 /* clang-format on */
 
 /*
@@ -158,6 +301,7 @@ int landlock_append_fs_rule(struct landlock_ruleset *const ruleset,
 			    const struct path *const path,
 			    access_mask_t access_rights)
 {
+	access_mask_t handled;
 	int err;
 	struct landlock_id id = {
 		.type = LANDLOCK_KEY_INODE,
@@ -170,9 +314,11 @@ int landlock_append_fs_rule(struct landlock_ruleset *const ruleset,
 	if (WARN_ON_ONCE(ruleset->num_layers != 1))
 		return -EINVAL;
 
+	handled = landlock_get_fs_access_mask(ruleset, 0);
+	/* Expands the synthetic IOCTL groups. */
+	access_rights |= landlock_expand_access_fs(handled, access_rights);
 	/* Transforms relative access rights to absolute ones. */
-	access_rights |= LANDLOCK_MASK_ACCESS_FS &
-			 ~landlock_get_fs_access_mask(ruleset, 0);
+	access_rights |= LANDLOCK_MASK_ACCESS_FS & ~handled;
 	id.key.object = get_inode_object(d_backing_inode(path->dentry));
 	if (IS_ERR(id.key.object))
 		return PTR_ERR(id.key.object);
@@ -1333,7 +1479,9 @@ static int hook_file_open(struct file *const file)
 {
 	layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {};
 	access_mask_t open_access_request, full_access_request, allowed_access;
-	const access_mask_t optional_access = LANDLOCK_ACCESS_FS_TRUNCATE;
+	const access_mask_t optional_access = LANDLOCK_ACCESS_FS_TRUNCATE |
+					      LANDLOCK_ACCESS_FS_IOCTL |
+					      IOCTL_GROUPS;
 	const struct landlock_ruleset *const dom = get_current_fs_domain();
 
 	if (!dom)
@@ -1406,6 +1554,25 @@ static int hook_file_truncate(struct file *const file)
 	return -EACCES;
 }
 
+static int hook_file_ioctl(struct file *file, unsigned int cmd,
+			   unsigned long arg)
+{
+	const access_mask_t required_access = required_ioctl_access(cmd);
+	const access_mask_t allowed_access =
+		landlock_file(file)->allowed_access;
+
+	/*
+	 * It is the access rights at the time of opening the file which
+	 * determine whether IOCTL can be used on the opened file later.
+	 *
+	 * The access right is attached to the opened file in hook_file_open().
+	 */
+	if ((allowed_access & required_access) == required_access)
+		return 0;
+
+	return -EACCES;
+}
+
 static struct security_hook_list landlock_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(inode_free_security, hook_inode_free_security),
 
@@ -1428,6 +1595,7 @@ static struct security_hook_list landlock_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(file_alloc_security, hook_file_alloc_security),
 	LSM_HOOK_INIT(file_open, hook_file_open),
 	LSM_HOOK_INIT(file_truncate, hook_file_truncate),
+	LSM_HOOK_INIT(file_ioctl, hook_file_ioctl),
 };
 
 __init void landlock_add_fs_hooks(void)
diff --git a/security/landlock/fs.h b/security/landlock/fs.h
index 488e4813680ab7..c88fe7bda37b69 100644
--- a/security/landlock/fs.h
+++ b/security/landlock/fs.h
@@ -92,4 +92,6 @@ int landlock_append_fs_rule(struct landlock_ruleset *const ruleset,
 			    const struct path *const path,
 			    access_mask_t access_hierarchy);
 
+access_mask_t landlock_expand_handled_access_fs(const access_mask_t handled);
+
 #endif /* _SECURITY_LANDLOCK_FS_H */
diff --git a/security/landlock/limits.h b/security/landlock/limits.h
index 93c9c6f915567e..296795f8a5c127 100644
--- a/security/landlock/limits.h
+++ b/security/landlock/limits.h
@@ -18,7 +18,16 @@
 #define LANDLOCK_MAX_NUM_LAYERS		16
 #define LANDLOCK_MAX_NUM_RULES		U32_MAX
 
-#define LANDLOCK_LAST_ACCESS_FS		LANDLOCK_ACCESS_FS_TRUNCATE
+/*
+ * For file system access rights, Landlock distinguishes between the publicly
+ * visible access rights (1 to LANDLOCK_LAST_PUBLIC_ACCESS_FS) and the private
+ * ones which are not exposed to userspace (LANDLOCK_LAST_PUBLIC_ACCESS_FS + 1
+ * to LANDLOCK_LAST_ACCESS_FS).  The private access rights are defined in fs.c.
+ */
+#define LANDLOCK_LAST_PUBLIC_ACCESS_FS	LANDLOCK_ACCESS_FS_IOCTL
+#define LANDLOCK_MASK_PUBLIC_ACCESS_FS	((LANDLOCK_LAST_PUBLIC_ACCESS_FS << 1) - 1)
+
+#define LANDLOCK_LAST_ACCESS_FS		(LANDLOCK_LAST_PUBLIC_ACCESS_FS << 4)
 #define LANDLOCK_MASK_ACCESS_FS		((LANDLOCK_LAST_ACCESS_FS << 1) - 1)
 #define LANDLOCK_NUM_ACCESS_FS		__const_hweight64(LANDLOCK_MASK_ACCESS_FS)
 #define LANDLOCK_SHIFT_ACCESS_FS	0
diff --git a/security/landlock/ruleset.h b/security/landlock/ruleset.h
index c7f1526784fd10..5a28ea8e1c3d50 100644
--- a/security/landlock/ruleset.h
+++ b/security/landlock/ruleset.h
@@ -30,7 +30,7 @@
 	LANDLOCK_ACCESS_FS_REFER)
 /* clang-format on */
 
-typedef u16 access_mask_t;
+typedef u32 access_mask_t;
 /* Makes sure all filesystem access rights can be stored. */
 static_assert(BITS_PER_TYPE(access_mask_t) >= LANDLOCK_NUM_ACCESS_FS);
 /* Makes sure all network access rights can be stored. */
diff --git a/security/landlock/syscalls.c b/security/landlock/syscalls.c
index 898358f57fa085..f0bc50003b4684 100644
--- a/security/landlock/syscalls.c
+++ b/security/landlock/syscalls.c
@@ -137,7 +137,7 @@ static const struct file_operations ruleset_fops = {
 	.write = fop_dummy_write,
 };
 
-#define LANDLOCK_ABI_VERSION 4
+#define LANDLOCK_ABI_VERSION 5
 
 /**
  * sys_landlock_create_ruleset - Create a new ruleset
@@ -192,8 +192,8 @@ SYSCALL_DEFINE3(landlock_create_ruleset,
 		return err;
 
 	/* Checks content (and 32-bits cast). */
-	if ((ruleset_attr.handled_access_fs | LANDLOCK_MASK_ACCESS_FS) !=
-	    LANDLOCK_MASK_ACCESS_FS)
+	if ((ruleset_attr.handled_access_fs | LANDLOCK_MASK_PUBLIC_ACCESS_FS) !=
+	    LANDLOCK_MASK_PUBLIC_ACCESS_FS)
 		return -EINVAL;
 
 	/* Checks network content (and 32-bits cast). */
@@ -201,6 +201,10 @@ SYSCALL_DEFINE3(landlock_create_ruleset,
 	    LANDLOCK_MASK_ACCESS_NET)
 		return -EINVAL;
 
+	/* Expands synthetic IOCTL groups. */
+	ruleset_attr.handled_access_fs = landlock_expand_handled_access_fs(
+		ruleset_attr.handled_access_fs);
+
 	/* Checks arguments and transforms to kernel struct. */
 	ruleset = landlock_create_ruleset(ruleset_attr.handled_access_fs,
 					  ruleset_attr.handled_access_net);
@@ -309,8 +313,13 @@ static int add_rule_path_beneath(struct landlock_ruleset *const ruleset,
 	if (!path_beneath_attr.allowed_access)
 		return -ENOMSG;
 
-	/* Checks that allowed_access matches the @ruleset constraints. */
-	mask = landlock_get_raw_fs_access_mask(ruleset, 0);
+	/*
+	 * Checks that allowed_access matches the @ruleset constraints and only
+	 * consists of publicly visible access rights (as opposed to synthetic
+	 * ones).
+	 */
+	mask = landlock_get_raw_fs_access_mask(ruleset, 0) &
+	       LANDLOCK_MASK_PUBLIC_ACCESS_FS;
 	if ((path_beneath_attr.allowed_access | mask) != mask)
 		return -EINVAL;
 
diff --git a/tools/testing/selftests/landlock/base_test.c b/tools/testing/selftests/landlock/base_test.c
index 646f778dfb1eee..d292b419ccba40 100644
--- a/tools/testing/selftests/landlock/base_test.c
+++ b/tools/testing/selftests/landlock/base_test.c
@@ -75,7 +75,7 @@ TEST(abi_version)
 	const struct landlock_ruleset_attr ruleset_attr = {
 		.handled_access_fs = LANDLOCK_ACCESS_FS_READ_FILE,
 	};
-	ASSERT_EQ(4, landlock_create_ruleset(NULL, 0,
+	ASSERT_EQ(5, landlock_create_ruleset(NULL, 0,
 					     LANDLOCK_CREATE_RULESET_VERSION));
 
 	ASSERT_EQ(-1, landlock_create_ruleset(&ruleset_attr, 0,
diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c
index 2d6d9b43d958cf..3203f4a5bc8595 100644
--- a/tools/testing/selftests/landlock/fs_test.c
+++ b/tools/testing/selftests/landlock/fs_test.c
@@ -527,9 +527,10 @@ TEST_F_FORK(layout1, inval)
 	LANDLOCK_ACCESS_FS_EXECUTE | \
 	LANDLOCK_ACCESS_FS_WRITE_FILE | \
 	LANDLOCK_ACCESS_FS_READ_FILE | \
-	LANDLOCK_ACCESS_FS_TRUNCATE)
+	LANDLOCK_ACCESS_FS_TRUNCATE | \
+	LANDLOCK_ACCESS_FS_IOCTL)
 
-#define ACCESS_LAST LANDLOCK_ACCESS_FS_TRUNCATE
+#define ACCESS_LAST LANDLOCK_ACCESS_FS_IOCTL
 
 #define ACCESS_ALL ( \
 	ACCESS_FILE | \

From e14aeb6cda036f78bd6a1eba2eb04b6d8beb2814 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <florian.fainelli@broadcom.com>
Date: Tue, 18 Oct 2022 16:23:53 -0700
Subject: [PATCH 299/707] ARM: brcmstb: Add debug UART entry for 74165

BCM74165 uses the same address map as the 7278 family (v7 memory map)
therefore re-use that constant and shit down the other labels to keep
numerical ordering.

Signed-off-by: Florian Fainelli <florian.fainelli@broadcom.com>
---
 arch/arm/include/debug/brcmstb.S | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/arm/include/debug/brcmstb.S b/arch/arm/include/debug/brcmstb.S
index f6175e6e28cd22..3f7d68740ed4c9 100644
--- a/arch/arm/include/debug/brcmstb.S
+++ b/arch/arm/include/debug/brcmstb.S
@@ -27,6 +27,7 @@
 #define UARTA_72165		UARTA_7278
 #define UARTA_7364		REG_PHYS_ADDR(0x40b000)
 #define UARTA_7366		UARTA_7364
+#define UARTA_74165		UARTA_7278
 #define UARTA_74371		REG_PHYS_ADDR(0x406b00)
 #define UARTA_7439		REG_PHYS_ADDR(0x40a900)
 #define UARTA_7445		REG_PHYS_ADDR(0x40ab00)
@@ -88,9 +89,10 @@ ARM_BE8(	rev	\rv, \rv )
 30:		checkuart(\rp, \rv, 0x72780000, 7278)
 31:		checkuart(\rp, \rv, 0x73640000, 7364)
 32:		checkuart(\rp, \rv, 0x73660000, 7366)
-33:		checkuart(\rp, \rv, 0x07437100, 74371)
-34:		checkuart(\rp, \rv, 0x74390000, 7439)
-35:		checkuart(\rp, \rv, 0x74450000, 7445)
+33:		checkuart(\rp, \rv, 0x07416500, 74165)
+34:		checkuart(\rp, \rv, 0x07437100, 74371)
+35:		checkuart(\rp, \rv, 0x74390000, 7439)
+36:		checkuart(\rp, \rv, 0x74450000, 7445)
 
 		/* No valid UART found */
 90:		mov	\rp, #0

From cd3ac0d15df74727044a15e6c6954d9a11a59860 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao@kernel.org>
Date: Mon, 29 Jan 2024 19:27:40 +0800
Subject: [PATCH 300/707] f2fs: zone: fix to wait completion of last bio in
 zone correctly

It needs to check last zone_pending_bio and wait IO completion before
traverse next fio in io->io_list, otherwise, bio in next zone may be
submitted before all IO completion in current zone.

Fixes: e067dc3c6b9c ("f2fs: maintain six open zones for zoned devices")
Cc: Daeho Jeong <daehojeong@google.com>
Signed-off-by: Chao Yu <chao@kernel.org>
Reviewed-by: Daeho Jeong <daehojeong@google.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 65fe48bb17d16b..ac82e69a9f5fda 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1010,7 +1010,7 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio)
 	f2fs_bug_on(sbi, is_read_io(fio->op));
 
 	f2fs_down_write(&io->io_rwsem);
-
+next:
 #ifdef CONFIG_BLK_DEV_ZONED
 	if (f2fs_sb_has_blkzoned(sbi) && btype < META && io->zone_pending_bio) {
 		wait_for_completion_io(&io->zone_wait);
@@ -1020,7 +1020,6 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio)
 	}
 #endif
 
-next:
 	if (fio->in_list) {
 		spin_lock(&io->io_lock);
 		if (list_empty(&io->io_list)) {

From 52434fdc4f86c261e986e7b24035b5ab20d32145 Mon Sep 17 00:00:00 2001
From: Wenjie Qi <qwjhust@gmail.com>
Date: Tue, 16 Jan 2024 22:11:38 +0800
Subject: [PATCH 301/707] f2fs: fix NULL pointer dereference in
 f2fs_submit_page_write()

BUG: kernel NULL pointer dereference, address: 0000000000000014
RIP: 0010:f2fs_submit_page_write+0x6cf/0x780 [f2fs]
Call Trace:
<TASK>
? show_regs+0x6e/0x80
? __die+0x29/0x70
? page_fault_oops+0x154/0x4a0
? prb_read_valid+0x20/0x30
? __irq_work_queue_local+0x39/0xd0
? irq_work_queue+0x36/0x70
? do_user_addr_fault+0x314/0x6c0
? exc_page_fault+0x7d/0x190
? asm_exc_page_fault+0x2b/0x30
? f2fs_submit_page_write+0x6cf/0x780 [f2fs]
? f2fs_submit_page_write+0x736/0x780 [f2fs]
do_write_page+0x50/0x170 [f2fs]
f2fs_outplace_write_data+0x61/0xb0 [f2fs]
f2fs_do_write_data_page+0x3f8/0x660 [f2fs]
f2fs_write_single_data_page+0x5bb/0x7a0 [f2fs]
f2fs_write_cache_pages+0x3da/0xbe0 [f2fs]
...
It is possible that other threads have added this fio to io->bio
and submitted the io->bio before entering f2fs_submit_page_write().
At this point io->bio = NULL.
If is_end_zone_blkaddr(sbi, fio->new_blkaddr) of this fio is true,
then an NULL pointer dereference error occurs at bio_get(io->bio).
The original code for determining zone end was after "out:",
which would have missed some fio who is zone end. I've moved
 this code before "skip:" to make sure it's done for each fio.

Fixes: e067dc3c6b9c ("f2fs: maintain six open zones for zoned devices")
Signed-off-by: Wenjie Qi <qwjhust@gmail.com>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index ac82e69a9f5fda..05158f89ef32db 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1080,10 +1080,6 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio)
 	io->last_block_in_bio = fio->new_blkaddr;
 
 	trace_f2fs_submit_page_write(fio->page, fio);
-skip:
-	if (fio->in_list)
-		goto next;
-out:
 #ifdef CONFIG_BLK_DEV_ZONED
 	if (f2fs_sb_has_blkzoned(sbi) && btype < META &&
 			is_end_zone_blkaddr(sbi, fio->new_blkaddr)) {
@@ -1096,6 +1092,10 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio)
 		__submit_merged_bio(io);
 	}
 #endif
+skip:
+	if (fio->in_list)
+		goto next;
+out:
 	if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) ||
 				!f2fs_is_checkpoint_ready(sbi))
 		__submit_merged_bio(io);

From ed4c142b8c136481123f4ee140164e5023a536ac Mon Sep 17 00:00:00 2001
From: David Lechner <dlechner@baylibre.com>
Date: Wed, 10 Jan 2024 14:28:41 -0600
Subject: [PATCH 302/707] iio: adc: ad7380: new driver for AD7380 ADCs

This adds a new driver for the AD7380 family ADCs.

The driver currently implements basic support for the AD7380, AD7381,
AD7383, and AD7384 2-channel differential ADCs. Support for additional
single-ended and 4-channel chips that use the same register map as well
as additional features of the chip will be added in future patches.

Co-developed-by: Stefan Popa <stefan.popa@analog.com>
Signed-off-by: Stefan Popa <stefan.popa@analog.com>
Reviewed-by: Nuno Sa <nuno.sa@analog.com>
Signed-off-by: David Lechner <dlechner@baylibre.com>
Link: https://lore.kernel.org/r/20240110-ad7380-mainline-v4-2-93a1d96b50fa@baylibre.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 MAINTAINERS              |   1 +
 drivers/iio/adc/Kconfig  |  16 ++
 drivers/iio/adc/Makefile |   1 +
 drivers/iio/adc/ad7380.c | 462 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 480 insertions(+)
 create mode 100644 drivers/iio/adc/ad7380.c

diff --git a/MAINTAINERS b/MAINTAINERS
index da83fdae811b79..c1a7f85fdbacfe 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -435,6 +435,7 @@ S:	Supported
 W:	https://wiki.analog.com/resources/tools-software/linux-drivers/iio-adc/ad738x
 W:	https://ez.analog.com/linux-software-drivers
 F:	Documentation/devicetree/bindings/iio/adc/adi,ad7380.yaml
+F:	drivers/iio/adc/ad7380.c
 
 AD7877 TOUCHSCREEN DRIVER
 M:	Michael Hennerich <michael.hennerich@analog.com>
diff --git a/drivers/iio/adc/Kconfig b/drivers/iio/adc/Kconfig
index 3b73c509bd68ef..59ae1d17b50d44 100644
--- a/drivers/iio/adc/Kconfig
+++ b/drivers/iio/adc/Kconfig
@@ -138,6 +138,22 @@ config AD7298
 	  To compile this driver as a module, choose M here: the
 	  module will be called ad7298.
 
+config AD7380
+	tristate "Analog Devices AD7380 ADC driver"
+	depends on SPI_MASTER
+	select IIO_BUFFER
+	select IIO_TRIGGER
+	select IIO_TRIGGERED_BUFFER
+	help
+	  AD7380 is a family of simultaneous sampling ADCs that share the same
+	  SPI register map and have similar pinouts.
+
+	  Say yes here to build support for Analog Devices AD7380 ADC and
+	  similar chips.
+
+	  To compile this driver as a module, choose M here: the module will be
+	  called ad7380.
+
 config AD7476
 	tristate "Analog Devices AD7476 1-channel ADCs driver and other similar devices from AD and TI"
 	depends on SPI
diff --git a/drivers/iio/adc/Makefile b/drivers/iio/adc/Makefile
index d2fda54a3259c9..5a26ab6f110952 100644
--- a/drivers/iio/adc/Makefile
+++ b/drivers/iio/adc/Makefile
@@ -18,6 +18,7 @@ obj-$(CONFIG_AD7291) += ad7291.o
 obj-$(CONFIG_AD7292) += ad7292.o
 obj-$(CONFIG_AD7298) += ad7298.o
 obj-$(CONFIG_AD7923) += ad7923.o
+obj-$(CONFIG_AD7380) += ad7380.o
 obj-$(CONFIG_AD7476) += ad7476.o
 obj-$(CONFIG_AD7606_IFACE_PARALLEL) += ad7606_par.o
 obj-$(CONFIG_AD7606_IFACE_SPI) += ad7606_spi.o
diff --git a/drivers/iio/adc/ad7380.c b/drivers/iio/adc/ad7380.c
new file mode 100644
index 00000000000000..44b8b18ab213ac
--- /dev/null
+++ b/drivers/iio/adc/ad7380.c
@@ -0,0 +1,462 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Analog Devices AD738x Simultaneous Sampling SAR ADCs
+ *
+ * Copyright 2017 Analog Devices Inc.
+ * Copyright 2023 BayLibre, SAS
+ */
+
+#include <linux/bitfield.h>
+#include <linux/bitops.h>
+#include <linux/cleanup.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/regmap.h>
+#include <linux/regulator/consumer.h>
+#include <linux/slab.h>
+#include <linux/spi/spi.h>
+#include <linux/sysfs.h>
+
+#include <linux/iio/buffer.h>
+#include <linux/iio/iio.h>
+#include <linux/iio/sysfs.h>
+#include <linux/iio/trigger_consumer.h>
+#include <linux/iio/triggered_buffer.h>
+
+/* 2.5V internal reference voltage */
+#define AD7380_INTERNAL_REF_MV		2500
+
+/* reading and writing registers is more reliable at lower than max speed */
+#define AD7380_REG_WR_SPEED_HZ		10000000
+
+#define AD7380_REG_WR			BIT(15)
+#define AD7380_REG_REGADDR		GENMASK(14, 12)
+#define AD7380_REG_DATA			GENMASK(11, 0)
+
+#define AD7380_REG_ADDR_NOP		0x0
+#define AD7380_REG_ADDR_CONFIG1		0x1
+#define AD7380_REG_ADDR_CONFIG2		0x2
+#define AD7380_REG_ADDR_ALERT		0x3
+#define AD7380_REG_ADDR_ALERT_LOW_TH	0x4
+#define AD7380_REG_ADDR_ALERT_HIGH_TH	0x5
+
+#define AD7380_CONFIG1_OS_MODE		BIT(9)
+#define AD7380_CONFIG1_OSR		GENMASK(8, 6)
+#define AD7380_CONFIG1_CRC_W		BIT(5)
+#define AD7380_CONFIG1_CRC_R		BIT(4)
+#define AD7380_CONFIG1_ALERTEN		BIT(3)
+#define AD7380_CONFIG1_RES		BIT(2)
+#define AD7380_CONFIG1_REFSEL		BIT(1)
+#define AD7380_CONFIG1_PMODE		BIT(0)
+
+#define AD7380_CONFIG2_SDO2		GENMASK(9, 8)
+#define AD7380_CONFIG2_SDO		BIT(8)
+#define AD7380_CONFIG2_RESET		GENMASK(7, 0)
+
+#define AD7380_CONFIG2_RESET_SOFT	0x3C
+#define AD7380_CONFIG2_RESET_HARD	0xFF
+
+#define AD7380_ALERT_LOW_TH		GENMASK(11, 0)
+#define AD7380_ALERT_HIGH_TH		GENMASK(11, 0)
+
+struct ad7380_chip_info {
+	const char *name;
+	const struct iio_chan_spec *channels;
+	unsigned int num_channels;
+};
+
+#define AD7380_DIFFERENTIAL_CHANNEL(index, bits) {		\
+	.type = IIO_VOLTAGE,					\
+	.info_mask_separate = BIT(IIO_CHAN_INFO_RAW),		\
+	.info_mask_shared_by_type = BIT(IIO_CHAN_INFO_SCALE),	\
+	.indexed = 1,						\
+	.differential = 1,					\
+	.channel = 2 * (index),					\
+	.channel2 = 2 * (index) + 1,				\
+	.scan_index = (index),					\
+	.scan_type = {						\
+		.sign = 's',					\
+		.realbits = (bits),				\
+		.storagebits = 16,				\
+		.endianness = IIO_CPU,				\
+	},							\
+}
+
+#define DEFINE_AD7380_DIFFERENTIAL_2_CHANNEL(name, bits)	\
+static const struct iio_chan_spec name[] = {			\
+	AD7380_DIFFERENTIAL_CHANNEL(0, bits),			\
+	AD7380_DIFFERENTIAL_CHANNEL(1, bits),			\
+	IIO_CHAN_SOFT_TIMESTAMP(2),				\
+}
+
+/* fully differential */
+DEFINE_AD7380_DIFFERENTIAL_2_CHANNEL(ad7380_channels, 16);
+DEFINE_AD7380_DIFFERENTIAL_2_CHANNEL(ad7381_channels, 14);
+/* pseudo differential */
+DEFINE_AD7380_DIFFERENTIAL_2_CHANNEL(ad7383_channels, 16);
+DEFINE_AD7380_DIFFERENTIAL_2_CHANNEL(ad7384_channels, 14);
+
+/* Since this is simultaneous sampling, we don't allow individual channels. */
+static const unsigned long ad7380_2_channel_scan_masks[] = {
+	GENMASK(1, 0),
+	0
+};
+
+static const struct ad7380_chip_info ad7380_chip_info = {
+	.name = "ad7380",
+	.channels = ad7380_channels,
+	.num_channels = ARRAY_SIZE(ad7380_channels),
+};
+
+static const struct ad7380_chip_info ad7381_chip_info = {
+	.name = "ad7381",
+	.channels = ad7381_channels,
+	.num_channels = ARRAY_SIZE(ad7381_channels),
+};
+
+static const struct ad7380_chip_info ad7383_chip_info = {
+	.name = "ad7383",
+	.channels = ad7383_channels,
+	.num_channels = ARRAY_SIZE(ad7383_channels),
+};
+
+static const struct ad7380_chip_info ad7384_chip_info = {
+	.name = "ad7384",
+	.channels = ad7384_channels,
+	.num_channels = ARRAY_SIZE(ad7384_channels),
+};
+
+struct ad7380_state {
+	const struct ad7380_chip_info *chip_info;
+	struct spi_device *spi;
+	struct regulator *vref;
+	struct regmap *regmap;
+	/*
+	 * DMA (thus cache coherency maintenance) requires the
+	 * transfer buffers to live in their own cache lines.
+	 * Make the buffer large enough for 2 16-bit samples and one 64-bit
+	 * aligned 64 bit timestamp.
+	 */
+	struct {
+		u16 raw[2];
+		s64 ts __aligned(8);
+	} scan_data __aligned(IIO_DMA_MINALIGN);
+	u16 tx[2];
+	u16 rx[2];
+};
+
+static int ad7380_regmap_reg_write(void *context, unsigned int reg,
+				   unsigned int val)
+{
+	struct ad7380_state *st = context;
+	struct spi_transfer xfer = {
+		.speed_hz = AD7380_REG_WR_SPEED_HZ,
+		.bits_per_word = 16,
+		.len = 2,
+		.tx_buf = &st->tx[0],
+	};
+
+	st->tx[0] = FIELD_PREP(AD7380_REG_WR, 1) |
+		    FIELD_PREP(AD7380_REG_REGADDR, reg) |
+		    FIELD_PREP(AD7380_REG_DATA, val);
+
+	return spi_sync_transfer(st->spi, &xfer, 1);
+}
+
+static int ad7380_regmap_reg_read(void *context, unsigned int reg,
+				  unsigned int *val)
+{
+	struct ad7380_state *st = context;
+	struct spi_transfer xfers[] = {
+		{
+			.speed_hz = AD7380_REG_WR_SPEED_HZ,
+			.bits_per_word = 16,
+			.len = 2,
+			.tx_buf = &st->tx[0],
+			.cs_change = 1,
+			.cs_change_delay = {
+				.value = 10, /* t[CSH] */
+				.unit = SPI_DELAY_UNIT_NSECS,
+			},
+		}, {
+			.speed_hz = AD7380_REG_WR_SPEED_HZ,
+			.bits_per_word = 16,
+			.len = 2,
+			.rx_buf = &st->rx[0],
+		},
+	};
+	int ret;
+
+	st->tx[0] = FIELD_PREP(AD7380_REG_WR, 0) |
+		    FIELD_PREP(AD7380_REG_REGADDR, reg) |
+		    FIELD_PREP(AD7380_REG_DATA, 0);
+
+	ret = spi_sync_transfer(st->spi, xfers, ARRAY_SIZE(xfers));
+	if (ret < 0)
+		return ret;
+
+	*val = FIELD_GET(AD7380_REG_DATA, st->rx[0]);
+
+	return 0;
+}
+
+static const struct regmap_config ad7380_regmap_config = {
+	.reg_bits = 3,
+	.val_bits = 12,
+	.reg_read = ad7380_regmap_reg_read,
+	.reg_write = ad7380_regmap_reg_write,
+	.max_register = AD7380_REG_ADDR_ALERT_HIGH_TH,
+	.can_sleep = true,
+};
+
+static int ad7380_debugfs_reg_access(struct iio_dev *indio_dev, u32 reg,
+				     u32 writeval, u32 *readval)
+{
+	struct ad7380_state *st = iio_priv(indio_dev);
+	int ret;
+
+	ret = iio_device_claim_direct_mode(indio_dev);
+	if (ret)
+		return ret;
+
+	if (readval)
+		ret = regmap_read(st->regmap, reg, readval);
+	else
+		ret = regmap_write(st->regmap, reg, writeval);
+
+	iio_device_release_direct_mode(indio_dev);
+
+	return ret;
+}
+
+static irqreturn_t ad7380_trigger_handler(int irq, void *p)
+{
+	struct iio_poll_func *pf = p;
+	struct iio_dev *indio_dev = pf->indio_dev;
+	struct ad7380_state *st = iio_priv(indio_dev);
+	struct spi_transfer xfer = {
+		.bits_per_word = st->chip_info->channels[0].scan_type.realbits,
+		.len = 4,
+		.rx_buf = st->scan_data.raw,
+	};
+	int ret;
+
+	ret = spi_sync_transfer(st->spi, &xfer, 1);
+	if (ret)
+		goto out;
+
+	iio_push_to_buffers_with_timestamp(indio_dev, &st->scan_data,
+					   pf->timestamp);
+
+out:
+	iio_trigger_notify_done(indio_dev->trig);
+
+	return IRQ_HANDLED;
+}
+
+static int ad7380_read_direct(struct ad7380_state *st,
+			      struct iio_chan_spec const *chan, int *val)
+{
+	struct spi_transfer xfers[] = {
+		/* toggle CS (no data xfer) to trigger a conversion */
+		{
+			.speed_hz = AD7380_REG_WR_SPEED_HZ,
+			.bits_per_word = chan->scan_type.realbits,
+			.delay = {
+				.value = 190, /* t[CONVERT] */
+				.unit = SPI_DELAY_UNIT_NSECS,
+			},
+			.cs_change = 1,
+			.cs_change_delay = {
+				.value = 10, /* t[CSH] */
+				.unit = SPI_DELAY_UNIT_NSECS,
+			},
+		},
+		/* then read both channels */
+		{
+			.speed_hz = AD7380_REG_WR_SPEED_HZ,
+			.bits_per_word = chan->scan_type.realbits,
+			.rx_buf = &st->rx[0],
+			.len = 4,
+		},
+	};
+	int ret;
+
+	ret = spi_sync_transfer(st->spi, xfers, ARRAY_SIZE(xfers));
+	if (ret < 0)
+		return ret;
+
+	*val = sign_extend32(st->rx[chan->scan_index],
+			     chan->scan_type.realbits - 1);
+
+	return IIO_VAL_INT;
+}
+
+static int ad7380_read_raw(struct iio_dev *indio_dev,
+			   struct iio_chan_spec const *chan,
+			   int *val, int *val2, long info)
+{
+	struct ad7380_state *st = iio_priv(indio_dev);
+	int ret;
+
+	switch (info) {
+	case IIO_CHAN_INFO_RAW:
+		ret = iio_device_claim_direct_mode(indio_dev);
+		if (ret)
+			return ret;
+
+		ret = ad7380_read_direct(st, chan, val);
+		iio_device_release_direct_mode(indio_dev);
+
+		return ret;
+	case IIO_CHAN_INFO_SCALE:
+		if (st->vref) {
+			ret = regulator_get_voltage(st->vref);
+			if (ret < 0)
+				return ret;
+
+			*val = ret / 1000;
+		} else {
+			*val = AD7380_INTERNAL_REF_MV;
+		}
+
+		*val2 = chan->scan_type.realbits;
+
+		return IIO_VAL_FRACTIONAL_LOG2;
+	}
+
+	return -EINVAL;
+}
+
+static const struct iio_info ad7380_info = {
+	.read_raw = &ad7380_read_raw,
+	.debugfs_reg_access = &ad7380_debugfs_reg_access,
+};
+
+static int ad7380_init(struct ad7380_state *st)
+{
+	int ret;
+
+	/* perform hard reset */
+	ret = regmap_update_bits(st->regmap, AD7380_REG_ADDR_CONFIG2,
+				 AD7380_CONFIG2_RESET,
+				 FIELD_PREP(AD7380_CONFIG2_RESET,
+					    AD7380_CONFIG2_RESET_HARD));
+	if (ret < 0)
+		return ret;
+
+	/* select internal or external reference voltage */
+	ret = regmap_update_bits(st->regmap, AD7380_REG_ADDR_CONFIG1,
+				 AD7380_CONFIG1_REFSEL,
+				 FIELD_PREP(AD7380_CONFIG1_REFSEL, !!st->vref));
+	if (ret < 0)
+		return ret;
+
+	/* SPI 1-wire mode */
+	return regmap_update_bits(st->regmap, AD7380_REG_ADDR_CONFIG2,
+				  AD7380_CONFIG2_SDO,
+				  FIELD_PREP(AD7380_CONFIG2_SDO, 1));
+}
+
+static void ad7380_regulator_disable(void *p)
+{
+	regulator_disable(p);
+}
+
+static int ad7380_probe(struct spi_device *spi)
+{
+	struct iio_dev *indio_dev;
+	struct ad7380_state *st;
+	int ret;
+
+	indio_dev = devm_iio_device_alloc(&spi->dev, sizeof(*st));
+	if (!indio_dev)
+		return -ENOMEM;
+
+	st = iio_priv(indio_dev);
+	st->spi = spi;
+	st->chip_info = spi_get_device_match_data(spi);
+	if (!st->chip_info)
+		return dev_err_probe(&spi->dev, -EINVAL, "missing match data\n");
+
+	st->vref = devm_regulator_get_optional(&spi->dev, "refio");
+	if (IS_ERR(st->vref)) {
+		/*
+		 * If there is no REFIO supply, then it means that we are using
+		 * the internal 2.5V reference.
+		 */
+		if (PTR_ERR(st->vref) == -ENODEV)
+			st->vref = NULL;
+		else
+			return dev_err_probe(&spi->dev, PTR_ERR(st->vref),
+					     "Failed to get refio regulator\n");
+	}
+
+	if (st->vref) {
+		ret = regulator_enable(st->vref);
+		if (ret)
+			return ret;
+
+		ret = devm_add_action_or_reset(&spi->dev, ad7380_regulator_disable,
+					       st->vref);
+		if (ret)
+			return ret;
+	}
+
+	st->regmap = devm_regmap_init(&spi->dev, NULL, st, &ad7380_regmap_config);
+	if (IS_ERR(st->regmap))
+		return dev_err_probe(&spi->dev, PTR_ERR(st->regmap),
+				     "failed to allocate register map\n");
+
+	indio_dev->channels = st->chip_info->channels;
+	indio_dev->num_channels = st->chip_info->num_channels;
+	indio_dev->name = st->chip_info->name;
+	indio_dev->info = &ad7380_info;
+	indio_dev->modes = INDIO_DIRECT_MODE;
+	indio_dev->available_scan_masks = ad7380_2_channel_scan_masks;
+
+	ret = devm_iio_triggered_buffer_setup(&spi->dev, indio_dev,
+					      iio_pollfunc_store_time,
+					      ad7380_trigger_handler, NULL);
+	if (ret)
+		return ret;
+
+	ret = ad7380_init(st);
+	if (ret)
+		return ret;
+
+	return devm_iio_device_register(&spi->dev, indio_dev);
+}
+
+static const struct of_device_id ad7380_of_match_table[] = {
+	{ .compatible = "adi,ad7380", .data = &ad7380_chip_info },
+	{ .compatible = "adi,ad7381", .data = &ad7381_chip_info },
+	{ .compatible = "adi,ad7383", .data = &ad7383_chip_info },
+	{ .compatible = "adi,ad7384", .data = &ad7384_chip_info },
+	{ }
+};
+
+static const struct spi_device_id ad7380_id_table[] = {
+	{ "ad7380", (kernel_ulong_t)&ad7380_chip_info },
+	{ "ad7381", (kernel_ulong_t)&ad7381_chip_info },
+	{ "ad7383", (kernel_ulong_t)&ad7383_chip_info },
+	{ "ad7384", (kernel_ulong_t)&ad7384_chip_info },
+	{ }
+};
+MODULE_DEVICE_TABLE(spi, ad7380_id_table);
+
+static struct spi_driver ad7380_driver = {
+	.driver = {
+		.name = "ad7380",
+		.of_match_table = ad7380_of_match_table,
+	},
+	.probe = ad7380_probe,
+	.id_table = ad7380_id_table,
+};
+module_spi_driver(ad7380_driver);
+
+MODULE_AUTHOR("Stefan Popa <stefan.popa@analog.com>");
+MODULE_DESCRIPTION("Analog Devices AD738x ADC driver");
+MODULE_LICENSE("GPL");

From 4fef5d3feab7c7bdebec58f26d2856a8734fec50 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sat, 20 Jan 2024 21:50:05 -0800
Subject: [PATCH 303/707] iio: dummy_evgen: remove Excess kernel-doc comments

Drop kernel-doc comments for struct fields that were removed to
prevent kernel-doc warnings:

iio_dummy_evgen.c:43: warning: Excess struct member 'irq_sim' description in 'iio_dummy_eventgen'
iio_dummy_evgen.c:43: warning: Excess struct member 'base' description in 'iio_dummy_eventgen'

Fixes: 337cbeb2c13e ("genirq/irq_sim: Simplify the API")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Link: https://lore.kernel.org/r/20240121055005.20042-1-rdunlap@infradead.org
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/dummy/iio_dummy_evgen.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/iio/dummy/iio_dummy_evgen.c b/drivers/iio/dummy/iio_dummy_evgen.c
index 5a0072727ba4b1..16d3f144dda040 100644
--- a/drivers/iio/dummy/iio_dummy_evgen.c
+++ b/drivers/iio/dummy/iio_dummy_evgen.c
@@ -31,8 +31,6 @@
  * @regs: irq regs we are faking
  * @lock: protect the evgen state
  * @inuse: mask of which irqs are connected
- * @irq_sim: interrupt simulator
- * @base: base of irq range
  * @irq_sim_domain: irq simulator domain
  */
 struct iio_dummy_eventgen {

From 34c9ee68cd4c33921471fc956476255fd6ab8b86 Mon Sep 17 00:00:00 2001
From: Nuno Sa <nuno.sa@analog.com>
Date: Wed, 17 Jan 2024 14:10:50 +0100
Subject: [PATCH 304/707] iio: imu: adis16475: make use of
 irq_get_trigger_type()

There's no need to call both irq_get_irq_data() and
irqd_get_trigger_type() as we already have an helper for that. This
allows for code simplification.

Signed-off-by: Nuno Sa <nuno.sa@analog.com>
Link: https://lore.kernel.org/r/20240117-adis-improv-v1-2-7f90e9fad200@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/imu/adis16475.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/drivers/iio/imu/adis16475.c b/drivers/iio/imu/adis16475.c
index 64be656f0b8052..01f55cc902faad 100644
--- a/drivers/iio/imu/adis16475.c
+++ b/drivers/iio/imu/adis16475.c
@@ -1363,22 +1363,16 @@ static int adis16475_config_sync_mode(struct adis16475 *st)
 static int adis16475_config_irq_pin(struct adis16475 *st)
 {
 	int ret;
-	struct irq_data *desc;
 	u32 irq_type;
 	u16 val = 0;
 	u8 polarity;
 	struct spi_device *spi = st->adis.spi;
 
-	desc = irq_get_irq_data(spi->irq);
-	if (!desc) {
-		dev_err(&spi->dev, "Could not find IRQ %d\n", spi->irq);
-		return -EINVAL;
-	}
 	/*
 	 * It is possible to configure the data ready polarity. Furthermore, we
 	 * need to update the adis struct if we want data ready as active low.
 	 */
-	irq_type = irqd_get_trigger_type(desc);
+	irq_type = irq_get_trigger_type(spi->irq);
 	if (irq_type == IRQ_TYPE_EDGE_RISING) {
 		polarity = 1;
 		st->adis.irq_flag = IRQF_TRIGGER_RISING;

From 8ed2aa0a1b56695989b1fd5677d2d66d5e6d2406 Mon Sep 17 00:00:00 2001
From: Nuno Sa <nuno.sa@analog.com>
Date: Wed, 17 Jan 2024 14:10:51 +0100
Subject: [PATCH 305/707] iio: imu: adis16480: make use of
 irq_get_trigger_type()

There's no need to call both irq_get_irq_data() and
irqd_get_trigger_type() as we already have an helper for that. This
allows for code simplification.

Signed-off-by: Nuno Sa <nuno.sa@analog.com>
Link: https://lore.kernel.org/r/20240117-adis-improv-v1-3-7f90e9fad200@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/imu/adis16480.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/drivers/iio/imu/adis16480.c b/drivers/iio/imu/adis16480.c
index fe520194a83717..b40a55bba30c19 100644
--- a/drivers/iio/imu/adis16480.c
+++ b/drivers/iio/imu/adis16480.c
@@ -1246,18 +1246,11 @@ static int adis16480_config_irq_pin(struct adis16480 *st)
 {
 	struct device *dev = &st->adis.spi->dev;
 	struct fwnode_handle *fwnode = dev_fwnode(dev);
-	struct irq_data *desc;
 	enum adis16480_int_pin pin;
 	unsigned int irq_type;
 	uint16_t val;
 	int i, irq = 0;
 
-	desc = irq_get_irq_data(st->adis.spi->irq);
-	if (!desc) {
-		dev_err(dev, "Could not find IRQ %d\n", irq);
-		return -EINVAL;
-	}
-
 	/* Disable data ready since the default after reset is on */
 	val = ADIS16480_DRDY_EN(0);
 
@@ -1285,7 +1278,7 @@ static int adis16480_config_irq_pin(struct adis16480 *st)
 	 * configured as positive or negative, corresponding to
 	 * IRQ_TYPE_EDGE_RISING or IRQ_TYPE_EDGE_FALLING respectively.
 	 */
-	irq_type = irqd_get_trigger_type(desc);
+	irq_type = irq_get_trigger_type(st->adis.spi->irq);
 	if (irq_type == IRQ_TYPE_EDGE_RISING) { /* Default */
 		val |= ADIS16480_DRDY_POL(1);
 	} else if (irq_type == IRQ_TYPE_EDGE_FALLING) {

From 8ad16754891d0ea6654126515603c3c389d446dd Mon Sep 17 00:00:00 2001
From: Nuno Sa <nuno.sa@analog.com>
Date: Wed, 17 Jan 2024 13:41:04 +0100
Subject: [PATCH 306/707] iio: adc: ad_sigma_delta: allow overwriting the IRQ
 flags

Make sure we can specify the IRQ trigger type from firmware and drivers
won't ignore it. In fact, this how it should be done but since someone
might be already depending on the driver to hardcode the trigger type
(and not specifying it in firmware), let's do it like this so there's
no possible breakage.

Signed-off-by: Nuno Sa <nuno.sa@analog.com>
Link: https://lore.kernel.org/r/20240117-dev_sigma_delta_no_irq_flags-v1-2-db39261592cf@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/adc/ad_sigma_delta.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/iio/adc/ad_sigma_delta.c b/drivers/iio/adc/ad_sigma_delta.c
index 7e21928707437b..fbba3f4a118983 100644
--- a/drivers/iio/adc/ad_sigma_delta.c
+++ b/drivers/iio/adc/ad_sigma_delta.c
@@ -568,6 +568,7 @@ EXPORT_SYMBOL_NS_GPL(ad_sd_validate_trigger, IIO_AD_SIGMA_DELTA);
 static int devm_ad_sd_probe_trigger(struct device *dev, struct iio_dev *indio_dev)
 {
 	struct ad_sigma_delta *sigma_delta = iio_device_get_drvdata(indio_dev);
+	unsigned long irq_flags = irq_get_trigger_type(sigma_delta->spi->irq);
 	int ret;
 
 	if (dev != &sigma_delta->spi->dev) {
@@ -588,9 +589,13 @@ static int devm_ad_sd_probe_trigger(struct device *dev, struct iio_dev *indio_de
 	/* the IRQ core clears IRQ_DISABLE_UNLAZY flag when freeing an IRQ */
 	irq_set_status_flags(sigma_delta->spi->irq, IRQ_DISABLE_UNLAZY);
 
+	/* Allow overwriting the flags from firmware */
+	if (!irq_flags)
+		irq_flags = sigma_delta->info->irq_flags;
+
 	ret = devm_request_irq(dev, sigma_delta->spi->irq,
 			       ad_sd_data_rdy_trig_poll,
-			       sigma_delta->info->irq_flags | IRQF_NO_AUTOEN,
+			       irq_flags | IRQF_NO_AUTOEN,
 			       indio_dev->name,
 			       sigma_delta);
 	if (ret)

From 28065671fb19ccf932edbbc4e25f3e328079b0a1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A5rten=20Lindahl?= <marten.lindahl@axis.com>
Date: Mon, 15 Jan 2024 12:44:36 +0100
Subject: [PATCH 307/707] iio: light: vcnl4000: Set ps high definition for
 4040/4200
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The vcnl4040/vcnl4200 proximity sensor defaults to 12 bit data
resolution, but the chip also supports 16 bit data resolution, which is
called proximity high definition (PS_HD).

Make the vcnl4040/vcnl4200 proximity sensor use the high definition for
all data readings. Please note that in order to preserve the 12 bit
integer part of the in_proximity_raw output, the format is changed from
integer to fixed point.

Signed-off-by: Mårten Lindahl <marten.lindahl@axis.com>
Link: https://lore.kernel.org/r/20231221-vcnl4000-ps-hd-v3-1-6dcc889372be@axis.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/light/vcnl4000.c | 36 +++++++++++++++++++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/drivers/iio/light/vcnl4000.c b/drivers/iio/light/vcnl4000.c
index fdf763a04b0bfa..4e3641ff2ed446 100644
--- a/drivers/iio/light/vcnl4000.c
+++ b/drivers/iio/light/vcnl4000.c
@@ -90,6 +90,7 @@
 #define VCNL4040_PS_CONF1_PS_SHUTDOWN	BIT(0)
 #define VCNL4040_PS_CONF2_PS_IT	GENMASK(3, 1) /* Proximity integration time */
 #define VCNL4040_CONF1_PS_PERS	GENMASK(5, 4) /* Proximity interrupt persistence setting */
+#define VCNL4040_PS_CONF2_PS_HD		BIT(11)	/* Proximity high definition */
 #define VCNL4040_PS_CONF2_PS_INT	GENMASK(9, 8) /* Proximity interrupt mode */
 #define VCNL4040_PS_CONF3_MPS		GENMASK(6, 5) /* Proximity multi pulse number */
 #define VCNL4040_PS_MS_LED_I		GENMASK(10, 8) /* Proximity current */
@@ -114,6 +115,13 @@
 #define VCNL4010_INT_DRDY \
 	(BIT(VCNL4010_INT_PROXIMITY) | BIT(VCNL4010_INT_ALS))
 
+#define VCNL4040_CONF3_PS_MPS_16BITS	3	/* 8 multi pulses */
+#define VCNL4040_CONF3_PS_LED_I_16BITS	3	/* 120 mA */
+
+#define VCNL4040_CONF3_PS_SAMPLE_16BITS \
+	(FIELD_PREP(VCNL4040_PS_CONF3_MPS, VCNL4040_CONF3_PS_MPS_16BITS) | \
+	 FIELD_PREP(VCNL4040_PS_MS_LED_I, VCNL4040_CONF3_PS_LED_I_16BITS))
+
 static const int vcnl4010_prox_sampling_frequency[][2] = {
 	{1, 950000},
 	{3, 906250},
@@ -195,6 +203,7 @@ struct vcnl4000_data {
 	enum vcnl4000_device_ids id;
 	int rev;
 	int al_scale;
+	int ps_scale;
 	u8 ps_int;		/* proximity interrupt mode */
 	u8 als_int;		/* ambient light interrupt mode*/
 	const struct vcnl4000_chip_spec *chip_spec;
@@ -345,6 +354,7 @@ static int vcnl4200_set_power_state(struct vcnl4000_data *data, bool on)
 static int vcnl4200_init(struct vcnl4000_data *data)
 {
 	int ret, id;
+	u16 regval;
 
 	ret = i2c_smbus_read_word_data(data->client, VCNL4200_DEV_ID);
 	if (ret < 0)
@@ -386,9 +396,32 @@ static int vcnl4200_init(struct vcnl4000_data *data)
 		break;
 	}
 	data->al_scale = data->chip_spec->ulux_step;
+	data->ps_scale = 16;
 	mutex_init(&data->vcnl4200_al.lock);
 	mutex_init(&data->vcnl4200_ps.lock);
 
+	/* Use 16 bits proximity sensor readings */
+	ret = i2c_smbus_read_word_data(data->client, VCNL4200_PS_CONF1);
+	if (ret < 0)
+		return ret;
+
+	regval = ret | VCNL4040_PS_CONF2_PS_HD;
+	ret = i2c_smbus_write_word_data(data->client, VCNL4200_PS_CONF1,
+					regval);
+	if (ret < 0)
+		return ret;
+
+	/* Align proximity sensor sample rate to 16 bits data width */
+	ret = i2c_smbus_read_word_data(data->client, VCNL4200_PS_CONF3);
+	if (ret < 0)
+		return ret;
+
+	regval = ret | VCNL4040_CONF3_PS_SAMPLE_16BITS;
+	ret = i2c_smbus_write_word_data(data->client, VCNL4200_PS_CONF3,
+					regval);
+	if (ret < 0)
+		return ret;
+
 	ret = data->chip_spec->set_power_state(data, true);
 	if (ret < 0)
 		return ret;
@@ -901,8 +934,9 @@ static int vcnl4000_read_raw(struct iio_dev *indio_dev,
 			break;
 		case IIO_PROXIMITY:
 			ret = data->chip_spec->measure_proximity(data, val);
+			*val2 = data->ps_scale;
 			if (!ret)
-				ret = IIO_VAL_INT;
+				ret = IIO_VAL_FRACTIONAL;
 			break;
 		default:
 			ret = -EINVAL;

From 0ff5430cb47cc98bb0a8da65a1c9ea79b69145c6 Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <mazziesaccount@gmail.com>
Date: Mon, 15 Jan 2024 16:26:07 +0200
Subject: [PATCH 308/707] iio: test: test gain-time-scale helpers

Some light sensors can adjust both the HW-gain and integration time.
There are cases where adjusting the integration time has similar impact
to the scale of the reported values as gain setting has.

IIO users do typically expect to handle scale by a single writable 'scale'
entry. Driver should then adjust the gain/time accordingly.

It however is difficult for a driver to know whether it should change
gain or integration time to meet the requested scale. Usually it is
preferred to have longer integration time which usually improves
accuracy, but there may be use-cases where long measurement times can be
an issue. Thus it can be preferable to allow also changing the
integration time - but mitigate the scale impact by also changing the gain
underneath. Eg, if integration time change doubles the measured values,
the driver can reduce the HW-gain to half.

The theory of the computations of gain-time-scale is simple. However,
some people (undersigned) got that implemented wrong for more than once.
Hence some gain-time-scale helpers were introduced.

Add some simple tests to verify the most hairy functions.

Signed-off-by: Matti Vaittinen <mazziesaccount@gmail.com>
Link: https://lore.kernel.org/r/0f7505b43f91394dc3bb636369489c897b7e01a7.1705328293.git.mazziesaccount@gmail.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/test/Kconfig        |  14 +
 drivers/iio/test/Makefile       |   1 +
 drivers/iio/test/iio-test-gts.c | 513 ++++++++++++++++++++++++++++++++
 3 files changed, 528 insertions(+)
 create mode 100644 drivers/iio/test/iio-test-gts.c

diff --git a/drivers/iio/test/Kconfig b/drivers/iio/test/Kconfig
index 0b6e4e278a2f61..33cca49c8058ae 100644
--- a/drivers/iio/test/Kconfig
+++ b/drivers/iio/test/Kconfig
@@ -4,6 +4,20 @@
 #
 
 # Keep in alphabetical order
+config IIO_GTS_KUNIT_TEST
+	tristate "Test IIO formatting functions" if !KUNIT_ALL_TESTS
+	depends on KUNIT
+	select IIO_GTS_HELPER
+	select TEST_KUNIT_DEVICE_HELPERS
+	default KUNIT_ALL_TESTS
+	help
+	  build unit tests for the IIO light sensor gain-time-scale helpers.
+
+	  For more information on KUnit and unit tests in general, please refer
+	  to the KUnit documentation in Documentation/dev-tools/kunit/.
+
+	  If unsure, say N. Keep in alphabetical order
+
 config IIO_RESCALE_KUNIT_TEST
 	tristate "Test IIO rescale conversion functions" if !KUNIT_ALL_TESTS
 	depends on KUNIT && IIO_RESCALE
diff --git a/drivers/iio/test/Makefile b/drivers/iio/test/Makefile
index d76eaf36da8206..e9a4cf1ff57f01 100644
--- a/drivers/iio/test/Makefile
+++ b/drivers/iio/test/Makefile
@@ -6,4 +6,5 @@
 # Keep in alphabetical order
 obj-$(CONFIG_IIO_RESCALE_KUNIT_TEST) += iio-test-rescale.o
 obj-$(CONFIG_IIO_FORMAT_KUNIT_TEST) += iio-test-format.o
+obj-$(CONFIG_IIO_GTS_KUNIT_TEST) += iio-test-gts.o
 CFLAGS_iio-test-format.o += $(DISABLE_STRUCTLEAK_PLUGIN)
diff --git a/drivers/iio/test/iio-test-gts.c b/drivers/iio/test/iio-test-gts.c
new file mode 100644
index 00000000000000..cf7ab773ea0b15
--- /dev/null
+++ b/drivers/iio/test/iio-test-gts.c
@@ -0,0 +1,513 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Unit tests for IIO light sensor gain-time-scale helpers
+ *
+ * Copyright (c) 2023 Matti Vaittinen <mazziesaccount@gmail.com>
+ */
+
+#include <kunit/device.h>
+#include <kunit/test.h>
+#include <linux/device.h>
+#include <linux/iio/iio-gts-helper.h>
+#include <linux/iio/types.h>
+
+/*
+ * Please, read the "rant" from the top of the lib/test_linear_ranges.c if
+ * you see a line of helper code which is not being tested.
+ *
+ * Then, please look at the line which is not being tested. Is this line
+ * somehow unusually complex? If answer is "no", then chances are that the
+ * "development inertia" caused by adding a test exceeds the benefits.
+ *
+ * If yes, then adding a test is probably a good idea but please stop for a
+ * moment and consider the effort of changing all the tests when code gets
+ * refactored. Eventually it neeeds to be.
+ */
+
+#define TEST_TSEL_50		1
+#define TEST_TSEL_X_MIN		TEST_TSEL_50
+#define TEST_TSEL_100		0
+#define TEST_TSEL_200		2
+#define TEST_TSEL_400		4
+#define TEST_TSEL_X_MAX		TEST_TSEL_400
+
+#define TEST_GSEL_1		0x00
+#define TEST_GSEL_X_MIN		TEST_GSEL_1
+#define TEST_GSEL_4		0x08
+#define TEST_GSEL_16		0x0a
+#define TEST_GSEL_32		0x0b
+#define TEST_GSEL_64		0x0c
+#define TEST_GSEL_256		0x18
+#define TEST_GSEL_512		0x19
+#define TEST_GSEL_1024		0x1a
+#define TEST_GSEL_2048		0x1b
+#define TEST_GSEL_4096		0x1c
+#define TEST_GSEL_X_MAX		TEST_GSEL_4096
+
+#define TEST_SCALE_1X		64
+#define TEST_SCALE_MIN_X	TEST_SCALE_1X
+#define TEST_SCALE_2X		32
+#define TEST_SCALE_4X		16
+#define TEST_SCALE_8X		8
+#define TEST_SCALE_16X		4
+#define TEST_SCALE_32X		2
+#define TEST_SCALE_64X		1
+
+#define TEST_SCALE_NANO_128X	500000000
+#define TEST_SCALE_NANO_256X	250000000
+#define TEST_SCALE_NANO_512X	125000000
+#define TEST_SCALE_NANO_1024X	62500000
+#define TEST_SCALE_NANO_2048X	31250000
+#define TEST_SCALE_NANO_4096X	15625000
+#define TEST_SCALE_NANO_4096X2	7812500
+#define TEST_SCALE_NANO_4096X4	3906250
+#define TEST_SCALE_NANO_4096X8	1953125
+
+#define TEST_SCALE_NANO_MAX_X TEST_SCALE_NANO_4096X8
+
+/*
+ * Can't have this allocated from stack because the kunit clean-up will
+ * happen only after the test function has already gone
+ */
+static struct iio_gts gts;
+
+static const struct iio_gain_sel_pair gts_test_gains[] = {
+	GAIN_SCALE_GAIN(1, TEST_GSEL_1),
+	GAIN_SCALE_GAIN(4, TEST_GSEL_4),
+	GAIN_SCALE_GAIN(16, TEST_GSEL_16),
+	GAIN_SCALE_GAIN(32, TEST_GSEL_32),
+	GAIN_SCALE_GAIN(64, TEST_GSEL_64),
+	GAIN_SCALE_GAIN(256, TEST_GSEL_256),
+	GAIN_SCALE_GAIN(512, TEST_GSEL_512),
+	GAIN_SCALE_GAIN(1024, TEST_GSEL_1024),
+	GAIN_SCALE_GAIN(2048, TEST_GSEL_2048),
+	GAIN_SCALE_GAIN(4096, TEST_GSEL_4096),
+#define HWGAIN_MAX 4096
+};
+
+static const struct iio_itime_sel_mul gts_test_itimes[] = {
+	GAIN_SCALE_ITIME_US(400 * 1000, TEST_TSEL_400, 8),
+	GAIN_SCALE_ITIME_US(200 * 1000, TEST_TSEL_200, 4),
+	GAIN_SCALE_ITIME_US(100 * 1000, TEST_TSEL_100, 2),
+	GAIN_SCALE_ITIME_US(50 * 1000, TEST_TSEL_50, 1),
+#define TIMEGAIN_MAX 8
+};
+#define TOTAL_GAIN_MAX	(HWGAIN_MAX * TIMEGAIN_MAX)
+#define IIO_GTS_TEST_DEV "iio-gts-test-dev"
+
+static struct device *__test_init_iio_gain_scale(struct kunit *test,
+		struct iio_gts *gts, const struct iio_gain_sel_pair *g_table,
+		int num_g, const struct iio_itime_sel_mul *i_table, int num_i)
+{
+	struct device *dev;
+	int ret;
+
+	dev = kunit_device_register(test, IIO_GTS_TEST_DEV);
+
+	KUNIT_EXPECT_NOT_ERR_OR_NULL(test, dev);
+	if (IS_ERR_OR_NULL(dev))
+		return NULL;
+
+	ret = devm_iio_init_iio_gts(dev, TEST_SCALE_1X, 0, g_table, num_g,
+				    i_table, num_i, gts);
+	KUNIT_EXPECT_EQ(test, 0, ret);
+	if (ret)
+		return NULL;
+
+	return dev;
+}
+
+#define test_init_iio_gain_scale(test, gts)	\
+	__test_init_iio_gain_scale(test, gts, gts_test_gains, \
+				   ARRAY_SIZE(gts_test_gains), gts_test_itimes, \
+				   ARRAY_SIZE(gts_test_itimes))
+
+static void test_init_iio_gts_invalid(struct kunit *test)
+{
+	struct device *dev;
+	int ret;
+	const struct iio_itime_sel_mul itimes_neg[] = {
+		GAIN_SCALE_ITIME_US(-10, TEST_TSEL_400, 8),
+		GAIN_SCALE_ITIME_US(200 * 1000, TEST_TSEL_200, 4),
+	};
+	const struct iio_gain_sel_pair gains_neg[] = {
+		GAIN_SCALE_GAIN(1, TEST_GSEL_1),
+		GAIN_SCALE_GAIN(2, TEST_GSEL_4),
+		GAIN_SCALE_GAIN(-2, TEST_GSEL_16),
+	};
+	/* 55555 * 38656 = 2147534080 => overflows 32bit int */
+	const struct iio_itime_sel_mul itimes_overflow[] = {
+		GAIN_SCALE_ITIME_US(400 * 1000, TEST_TSEL_400, 55555),
+		GAIN_SCALE_ITIME_US(200 * 1000, TEST_TSEL_200, 4),
+	};
+	const struct iio_gain_sel_pair gains_overflow[] = {
+		GAIN_SCALE_GAIN(1, TEST_GSEL_1),
+		GAIN_SCALE_GAIN(2, TEST_GSEL_4),
+		GAIN_SCALE_GAIN(38656, TEST_GSEL_16),
+	};
+
+	dev = kunit_device_register(test, IIO_GTS_TEST_DEV);
+	KUNIT_EXPECT_NOT_ERR_OR_NULL(test, dev);
+	if (!dev)
+		return;
+
+	/* Ok gains, negative time */
+	ret = devm_iio_init_iio_gts(dev, TEST_SCALE_1X, 0, gts_test_gains,
+				    ARRAY_SIZE(gts_test_gains), itimes_neg,
+				    ARRAY_SIZE(itimes_neg), &gts);
+	KUNIT_EXPECT_EQ(test, -EINVAL, ret);
+
+	/* Ok times, negative gain */
+	ret = devm_iio_init_iio_gts(dev, TEST_SCALE_1X, 0, gains_neg,
+				    ARRAY_SIZE(gains_neg), gts_test_itimes,
+				    ARRAY_SIZE(gts_test_itimes), &gts);
+	KUNIT_EXPECT_EQ(test, -EINVAL, ret);
+
+	/* gain * time overflow int */
+	ret = devm_iio_init_iio_gts(dev, TEST_SCALE_1X, 0, gains_overflow,
+				    ARRAY_SIZE(gains_overflow), itimes_overflow,
+				    ARRAY_SIZE(itimes_overflow), &gts);
+	KUNIT_EXPECT_EQ(test, -EOVERFLOW, ret);
+}
+
+static void test_iio_gts_find_gain_for_scale_using_time(struct kunit *test)
+{
+	struct device *dev;
+	int ret, gain_sel;
+
+	dev = test_init_iio_gain_scale(test, &gts);
+	if (!dev)
+		return;
+
+	ret = iio_gts_find_gain_sel_for_scale_using_time(&gts, TEST_TSEL_100,
+						TEST_SCALE_8X, 0, &gain_sel);
+	/*
+	 * Meas time 100 => gain by time 2x
+	 * TEST_SCALE_8X matches total gain 8x
+	 * => required HWGAIN 4x
+	 */
+	KUNIT_EXPECT_EQ(test, 0, ret);
+	KUNIT_EXPECT_EQ(test, TEST_GSEL_4, gain_sel);
+
+	ret = iio_gts_find_gain_sel_for_scale_using_time(&gts, TEST_TSEL_200, 0,
+						TEST_SCALE_NANO_256X, &gain_sel);
+	/*
+	 * Meas time 200 => gain by time 4x
+	 * TEST_SCALE_256X matches total gain 256x
+	 * => required HWGAIN 256/4 => 64x
+	 */
+	KUNIT_EXPECT_EQ(test, 0, ret);
+	KUNIT_EXPECT_EQ(test, TEST_GSEL_64, gain_sel);
+
+	/* Min time, Min gain */
+	ret = iio_gts_find_gain_sel_for_scale_using_time(&gts, TEST_TSEL_X_MIN,
+						TEST_SCALE_MIN_X, 0, &gain_sel);
+	KUNIT_EXPECT_EQ(test, 0, ret);
+	KUNIT_EXPECT_EQ(test, TEST_GSEL_1, gain_sel);
+
+	/* Max time, Max gain */
+	ret = iio_gts_find_gain_sel_for_scale_using_time(&gts, TEST_TSEL_X_MAX,
+					0, TEST_SCALE_NANO_MAX_X, &gain_sel);
+	KUNIT_EXPECT_EQ(test, 0, ret);
+	KUNIT_EXPECT_EQ(test, TEST_GSEL_4096, gain_sel);
+
+	ret = iio_gts_find_gain_sel_for_scale_using_time(&gts, TEST_TSEL_100, 0,
+						TEST_SCALE_NANO_256X, &gain_sel);
+	/*
+	 * Meas time 100 => gain by time 2x
+	 * TEST_SCALE_256X matches total gain 256x
+	 * => required HWGAIN 256/2 => 128x (not in gain-table - unsupported)
+	 */
+	KUNIT_EXPECT_NE(test, 0, ret);
+
+	ret = iio_gts_find_gain_sel_for_scale_using_time(&gts, TEST_TSEL_200, 0,
+						TEST_SCALE_NANO_MAX_X, &gain_sel);
+	/* We can't reach the max gain with integration time smaller than MAX */
+	KUNIT_EXPECT_NE(test, 0, ret);
+
+	ret = iio_gts_find_gain_sel_for_scale_using_time(&gts, TEST_TSEL_50, 0,
+						TEST_SCALE_NANO_MAX_X, &gain_sel);
+	/* We can't reach the max gain with integration time smaller than MAX */
+	KUNIT_EXPECT_NE(test, 0, ret);
+}
+
+static void test_iio_gts_find_new_gain_sel_by_old_gain_time(struct kunit *test)
+{
+	struct device *dev;
+	int ret, old_gain, new_gain, old_time_sel, new_time_sel;
+
+	dev = test_init_iio_gain_scale(test, &gts);
+	if (!dev)
+		return;
+
+	old_gain = 32;
+	old_time_sel = TEST_TSEL_200;
+	new_time_sel = TEST_TSEL_400;
+
+	ret = iio_gts_find_new_gain_sel_by_old_gain_time(&gts, old_gain,
+					old_time_sel, new_time_sel, &new_gain);
+	KUNIT_EXPECT_EQ(test, 0, ret);
+	/*
+	 * Doubling the integration time doubles the total gain - so old
+	 * (hw)gain must be divided by two to compensate. => 32 / 2 => 16
+	 */
+	KUNIT_EXPECT_EQ(test, 16, new_gain);
+
+	old_gain = 4;
+	old_time_sel = TEST_TSEL_50;
+	new_time_sel = TEST_TSEL_200;
+	ret = iio_gts_find_new_gain_sel_by_old_gain_time(&gts, old_gain,
+					old_time_sel, new_time_sel, &new_gain);
+	KUNIT_EXPECT_EQ(test, 0, ret);
+	/*
+	 * gain by time 1x => 4x - (hw)gain 4x => 1x
+	 */
+	KUNIT_EXPECT_EQ(test, 1, new_gain);
+
+	old_gain = 512;
+	old_time_sel = TEST_TSEL_400;
+	new_time_sel = TEST_TSEL_50;
+	ret = iio_gts_find_new_gain_sel_by_old_gain_time(&gts, old_gain,
+					old_time_sel, new_time_sel, &new_gain);
+	KUNIT_EXPECT_EQ(test, 0, ret);
+	/*
+	 * gain by time 8x => 1x - (hw)gain 512x => 4096x)
+	 */
+	KUNIT_EXPECT_EQ(test, 4096, new_gain);
+
+	/* Unsupported gain 2x */
+	old_gain = 4;
+	old_time_sel = TEST_TSEL_200;
+	new_time_sel = TEST_TSEL_400;
+	ret = iio_gts_find_new_gain_sel_by_old_gain_time(&gts, old_gain,
+					old_time_sel, new_time_sel, &new_gain);
+	KUNIT_EXPECT_NE(test, 0, ret);
+
+	/* Too small gain */
+	old_gain = 4;
+	old_time_sel = TEST_TSEL_50;
+	new_time_sel = TEST_TSEL_400;
+	ret = iio_gts_find_new_gain_sel_by_old_gain_time(&gts, old_gain,
+					old_time_sel, new_time_sel, &new_gain);
+	KUNIT_EXPECT_NE(test, 0, ret);
+
+	/* Too big gain */
+	old_gain = 1024;
+	old_time_sel = TEST_TSEL_400;
+	new_time_sel = TEST_TSEL_50;
+	ret = iio_gts_find_new_gain_sel_by_old_gain_time(&gts, old_gain,
+					old_time_sel, new_time_sel, &new_gain);
+	KUNIT_EXPECT_NE(test, 0, ret);
+
+}
+
+static void test_iio_find_closest_gain_low(struct kunit *test)
+{
+	struct device *dev;
+	bool in_range;
+	int ret;
+
+	const struct iio_gain_sel_pair gts_test_gains_gain_low[] = {
+		GAIN_SCALE_GAIN(4, TEST_GSEL_4),
+		GAIN_SCALE_GAIN(16, TEST_GSEL_16),
+		GAIN_SCALE_GAIN(32, TEST_GSEL_32),
+	};
+
+	dev = test_init_iio_gain_scale(test, &gts);
+	if (!dev)
+		return;
+
+	ret = iio_find_closest_gain_low(&gts, 2, &in_range);
+	KUNIT_EXPECT_EQ(test, 1, ret);
+	KUNIT_EXPECT_EQ(test, true, in_range);
+
+	ret = iio_find_closest_gain_low(&gts, 1, &in_range);
+	KUNIT_EXPECT_EQ(test, 1, ret);
+	KUNIT_EXPECT_EQ(test, true, in_range);
+
+	ret = iio_find_closest_gain_low(&gts, 4095, &in_range);
+	KUNIT_EXPECT_EQ(test, 2048, ret);
+	KUNIT_EXPECT_EQ(test, true, in_range);
+
+	ret = iio_find_closest_gain_low(&gts, 4097, &in_range);
+	KUNIT_EXPECT_EQ(test, 4096, ret);
+	KUNIT_EXPECT_EQ(test, false, in_range);
+
+	kunit_device_unregister(test, dev);
+
+	dev = __test_init_iio_gain_scale(test, &gts, gts_test_gains_gain_low,
+				ARRAY_SIZE(gts_test_gains_gain_low),
+				gts_test_itimes, ARRAY_SIZE(gts_test_itimes));
+	if (!dev)
+		return;
+
+	ret = iio_find_closest_gain_low(&gts, 3, &in_range);
+	KUNIT_EXPECT_EQ(test, -EINVAL, ret);
+	KUNIT_EXPECT_EQ(test, false, in_range);
+}
+
+static void test_iio_gts_total_gain_to_scale(struct kunit *test)
+{
+	struct device *dev;
+	int ret, scale_int, scale_nano;
+
+	dev = test_init_iio_gain_scale(test, &gts);
+	if (!dev)
+		return;
+
+	ret = iio_gts_total_gain_to_scale(&gts, 1, &scale_int, &scale_nano);
+	KUNIT_EXPECT_EQ(test, 0, ret);
+	KUNIT_EXPECT_EQ(test, TEST_SCALE_1X, scale_int);
+	KUNIT_EXPECT_EQ(test, 0, scale_nano);
+
+	ret = iio_gts_total_gain_to_scale(&gts, 1, &scale_int, &scale_nano);
+	KUNIT_EXPECT_EQ(test, 0, ret);
+	KUNIT_EXPECT_EQ(test, TEST_SCALE_1X, scale_int);
+	KUNIT_EXPECT_EQ(test, 0, scale_nano);
+
+	ret = iio_gts_total_gain_to_scale(&gts, 4096 * 8, &scale_int,
+					  &scale_nano);
+	KUNIT_EXPECT_EQ(test, 0, ret);
+	KUNIT_EXPECT_EQ(test, 0, scale_int);
+	KUNIT_EXPECT_EQ(test, TEST_SCALE_NANO_4096X8, scale_nano);
+}
+
+static void test_iio_gts_chk_times(struct kunit *test, const int *vals)
+{
+	static const int expected[] = {0, 50000, 0, 100000, 0, 200000, 0, 400000};
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(expected); i++)
+		KUNIT_EXPECT_EQ(test, expected[i], vals[i]);
+}
+
+static void test_iio_gts_chk_scales_all(struct kunit *test, struct iio_gts *gts,
+					const int *vals, int len)
+{
+	static const int gains[] = {1, 2, 4, 8, 16, 32, 64, 128, 256, 512,
+				    1024, 2048, 4096, 4096 * 2, 4096 * 4,
+				    4096 * 8};
+	int expected[ARRAY_SIZE(gains) * 2];
+	int i, ret;
+	int exp_len = ARRAY_SIZE(gains) * 2;
+
+	KUNIT_EXPECT_EQ(test, exp_len, len);
+	if (len != exp_len)
+		return;
+
+	for (i = 0; i < ARRAY_SIZE(gains); i++) {
+		ret = iio_gts_total_gain_to_scale(gts, gains[i],
+						  &expected[2 * i],
+						  &expected[2 * i + 1]);
+		KUNIT_EXPECT_EQ(test, 0, ret);
+		if (ret)
+			return;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(expected); i++)
+		KUNIT_EXPECT_EQ(test, expected[i], vals[i]);
+}
+
+static void test_iio_gts_chk_scales_t200(struct kunit *test, struct iio_gts *gts,
+					 const int *vals, int len)
+{
+	/* The gain caused by time 200 is 4x */
+	static const int gains[] = {
+		1 * 4,
+		4 * 4,
+		16 * 4,
+		32 * 4,
+		64 * 4,
+		256 * 4,
+		512 * 4,
+		1024 * 4,
+		2048 * 4,
+		4096 * 4
+	};
+	int expected[ARRAY_SIZE(gains) * 2];
+	int i, ret;
+
+	KUNIT_EXPECT_EQ(test, 2 * ARRAY_SIZE(gains), len);
+	if (len < 2 * ARRAY_SIZE(gains))
+		return;
+
+	for (i = 0; i < ARRAY_SIZE(gains); i++) {
+		ret = iio_gts_total_gain_to_scale(gts, gains[i],
+						  &expected[2 * i],
+						  &expected[2 * i + 1]);
+		KUNIT_EXPECT_EQ(test, 0, ret);
+		if (ret)
+			return;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(expected); i++)
+		KUNIT_EXPECT_EQ(test, expected[i], vals[i]);
+}
+
+static void test_iio_gts_avail_test(struct kunit *test)
+{
+	struct device *dev;
+	int ret;
+	int type, len;
+	const int *vals;
+
+	dev = test_init_iio_gain_scale(test, &gts);
+	if (!dev)
+		return;
+
+	/* test table building for times and iio_gts_avail_times() */
+	ret = iio_gts_avail_times(&gts, &vals, &type, &len);
+	KUNIT_EXPECT_EQ(test, IIO_AVAIL_LIST, ret);
+	if (ret)
+		return;
+
+	KUNIT_EXPECT_EQ(test, IIO_VAL_INT_PLUS_MICRO, type);
+	KUNIT_EXPECT_EQ(test, 8, len);
+	if (len < 8)
+		return;
+
+	test_iio_gts_chk_times(test, vals);
+
+	/* Test table building for all scales and iio_gts_all_avail_scales() */
+	ret = iio_gts_all_avail_scales(&gts, &vals, &type, &len);
+	KUNIT_EXPECT_EQ(test, IIO_AVAIL_LIST, ret);
+	if (ret)
+		return;
+
+	KUNIT_EXPECT_EQ(test, IIO_VAL_INT_PLUS_NANO, type);
+
+	test_iio_gts_chk_scales_all(test, &gts, vals, len);
+
+	/*
+	 * Test table building for scales/time and
+	 * iio_gts_avail_scales_for_time()
+	 */
+	ret = iio_gts_avail_scales_for_time(&gts, 200000, &vals, &type, &len);
+	KUNIT_EXPECT_EQ(test, IIO_AVAIL_LIST, ret);
+	if (ret)
+		return;
+
+	KUNIT_EXPECT_EQ(test, IIO_VAL_INT_PLUS_NANO, type);
+	test_iio_gts_chk_scales_t200(test, &gts, vals, len);
+}
+
+static struct kunit_case iio_gts_test_cases[] = {
+	KUNIT_CASE(test_init_iio_gts_invalid),
+	KUNIT_CASE(test_iio_gts_find_gain_for_scale_using_time),
+	KUNIT_CASE(test_iio_gts_find_new_gain_sel_by_old_gain_time),
+	KUNIT_CASE(test_iio_find_closest_gain_low),
+	KUNIT_CASE(test_iio_gts_total_gain_to_scale),
+	KUNIT_CASE(test_iio_gts_avail_test),
+	{}
+};
+
+static struct kunit_suite iio_gts_test_suite = {
+	.name = "iio-gain-time-scale",
+	.test_cases = iio_gts_test_cases,
+};
+
+kunit_test_suite(iio_gts_test_suite);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Matti Vaittinen <mazziesaccount@gmail.com>");
+MODULE_DESCRIPTION("Test IIO light sensor gain-time-scale helpers");
+MODULE_IMPORT_NS(IIO_GTS_HELPER);

From 724a827065cff0a65d57aa4b6a181504e13eff47 Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <mazziesaccount@gmail.com>
Date: Mon, 15 Jan 2024 16:26:33 +0200
Subject: [PATCH 309/707] MAINTAINERS: add IIO GTS tests

Add undersigned as a maintainer for IIO GTS helper's KUnit tests.

Signed-off-by: Matti Vaittinen <mazziesaccount@gmail.com>
Link: https://lore.kernel.org/r/52c66fe2798192529738ac2ab98a27230a6ad8cd.1705328293.git.mazziesaccount@gmail.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index c1a7f85fdbacfe..0d015d85e30c9f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10415,6 +10415,7 @@ L:	linux-iio@vger.kernel.org
 S:	Maintained
 F:	drivers/iio/industrialio-gts-helper.c
 F:	include/linux/iio/iio-gts-helper.h
+F:	drivers/iio/test/iio-test-gts.c
 
 IIO MULTIPLEXER
 M:	Peter Rosin <peda@axentia.se>

From 1d972256f02416ec05d8a126c36cc2fa0699af2a Mon Sep 17 00:00:00 2001
From: Andrew Davis <afd@ti.com>
Date: Tue, 23 Jan 2024 08:09:15 -0600
Subject: [PATCH 310/707] iio: health: afe4403: Use devm action helper for
 regulator disable

Use a device lifecycle managed action for regulator disable function.
This helps prevent mistakes like unregistering out of order in cleanup
functions and forgetting to unregister on error paths.

Signed-off-by: Andrew Davis <afd@ti.com>
Link: https://lore.kernel.org/r/20240123140918.215818-1-afd@ti.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/health/afe4403.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/drivers/iio/health/afe4403.c b/drivers/iio/health/afe4403.c
index df3bc5c3d3786d..9e9d6de2a7c83e 100644
--- a/drivers/iio/health/afe4403.c
+++ b/drivers/iio/health/afe4403.c
@@ -346,6 +346,13 @@ static irqreturn_t afe4403_trigger_handler(int irq, void *private)
 	return IRQ_HANDLED;
 }
 
+static void afe4403_regulator_disable(void *data)
+{
+	struct regulator *regulator = data;
+
+	regulator_disable(regulator);
+}
+
 #define AFE4403_TIMING_PAIRS			\
 	{ AFE440X_LED2STC,	0x000050 },	\
 	{ AFE440X_LED2ENDC,	0x0003e7 },	\
@@ -495,19 +502,24 @@ static int afe4403_probe(struct spi_device *spi)
 		dev_err(afe->dev, "Unable to enable regulator\n");
 		return ret;
 	}
+	ret = devm_add_action_or_reset(afe->dev, afe4403_regulator_disable, afe->regulator);
+	if (ret) {
+		dev_err(afe->dev, "Unable to add regulator disable action\n");
+		return ret;
+	}
 
 	ret = regmap_write(afe->regmap, AFE440X_CONTROL0,
 			   AFE440X_CONTROL0_SW_RESET);
 	if (ret) {
 		dev_err(afe->dev, "Unable to reset device\n");
-		goto err_disable_reg;
+		return ret;
 	}
 
 	ret = regmap_multi_reg_write(afe->regmap, afe4403_reg_sequences,
 				     ARRAY_SIZE(afe4403_reg_sequences));
 	if (ret) {
 		dev_err(afe->dev, "Unable to set register defaults\n");
-		goto err_disable_reg;
+		return ret;
 	}
 
 	indio_dev->modes = INDIO_DIRECT_MODE;
@@ -523,8 +535,7 @@ static int afe4403_probe(struct spi_device *spi)
 						   iio_device_id(indio_dev));
 		if (!afe->trig) {
 			dev_err(afe->dev, "Unable to allocate IIO trigger\n");
-			ret = -ENOMEM;
-			goto err_disable_reg;
+			return -ENOMEM;
 		}
 
 		iio_trigger_set_drvdata(afe->trig, indio_dev);
@@ -532,7 +543,7 @@ static int afe4403_probe(struct spi_device *spi)
 		ret = iio_trigger_register(afe->trig);
 		if (ret) {
 			dev_err(afe->dev, "Unable to register IIO trigger\n");
-			goto err_disable_reg;
+			return ret;
 		}
 
 		ret = devm_request_threaded_irq(afe->dev, afe->irq,
@@ -566,8 +577,6 @@ static int afe4403_probe(struct spi_device *spi)
 err_trig:
 	if (afe->irq > 0)
 		iio_trigger_unregister(afe->trig);
-err_disable_reg:
-	regulator_disable(afe->regulator);
 
 	return ret;
 }
@@ -584,10 +593,6 @@ static void afe4403_remove(struct spi_device *spi)
 
 	if (afe->irq > 0)
 		iio_trigger_unregister(afe->trig);
-
-	ret = regulator_disable(afe->regulator);
-	if (ret)
-		dev_warn(afe->dev, "Unable to disable regulator\n");
 }
 
 static const struct spi_device_id afe4403_ids[] = {

From 88fcff083c72af43a33e2c6de06852576afd20a8 Mon Sep 17 00:00:00 2001
From: Andrew Davis <afd@ti.com>
Date: Tue, 23 Jan 2024 08:09:16 -0600
Subject: [PATCH 311/707] iio: health: afe4403: Use devm IIO helpers

Use a device lifecycle managed IIO helper functions. This helps prevent
mistakes like unregistering and freeing out of order in cleanup functions
and forgetting to unregister and free on error paths.

Signed-off-by: Andrew Davis <afd@ti.com>
Link: https://lore.kernel.org/r/20240123140918.215818-2-afd@ti.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/health/afe4403.c | 38 ++++++++----------------------------
 1 file changed, 8 insertions(+), 30 deletions(-)

diff --git a/drivers/iio/health/afe4403.c b/drivers/iio/health/afe4403.c
index 9e9d6de2a7c83e..1dbe48dae74eed 100644
--- a/drivers/iio/health/afe4403.c
+++ b/drivers/iio/health/afe4403.c
@@ -540,7 +540,7 @@ static int afe4403_probe(struct spi_device *spi)
 
 		iio_trigger_set_drvdata(afe->trig, indio_dev);
 
-		ret = iio_trigger_register(afe->trig);
+		ret = devm_iio_trigger_register(afe->dev, afe->trig);
 		if (ret) {
 			dev_err(afe->dev, "Unable to register IIO trigger\n");
 			return ret;
@@ -553,46 +553,25 @@ static int afe4403_probe(struct spi_device *spi)
 						afe->trig);
 		if (ret) {
 			dev_err(afe->dev, "Unable to request IRQ\n");
-			goto err_trig;
+			return ret;
 		}
 	}
 
-	ret = iio_triggered_buffer_setup(indio_dev, &iio_pollfunc_store_time,
-					 afe4403_trigger_handler, NULL);
+	ret = devm_iio_triggered_buffer_setup(afe->dev, indio_dev,
+					      &iio_pollfunc_store_time,
+					      afe4403_trigger_handler, NULL);
 	if (ret) {
 		dev_err(afe->dev, "Unable to setup buffer\n");
-		goto err_trig;
+		return ret;
 	}
 
-	ret = iio_device_register(indio_dev);
+	ret = devm_iio_device_register(afe->dev, indio_dev);
 	if (ret) {
 		dev_err(afe->dev, "Unable to register IIO device\n");
-		goto err_buff;
+		return ret;
 	}
 
 	return 0;
-
-err_buff:
-	iio_triggered_buffer_cleanup(indio_dev);
-err_trig:
-	if (afe->irq > 0)
-		iio_trigger_unregister(afe->trig);
-
-	return ret;
-}
-
-static void afe4403_remove(struct spi_device *spi)
-{
-	struct iio_dev *indio_dev = spi_get_drvdata(spi);
-	struct afe4403_data *afe = iio_priv(indio_dev);
-	int ret;
-
-	iio_device_unregister(indio_dev);
-
-	iio_triggered_buffer_cleanup(indio_dev);
-
-	if (afe->irq > 0)
-		iio_trigger_unregister(afe->trig);
 }
 
 static const struct spi_device_id afe4403_ids[] = {
@@ -608,7 +587,6 @@ static struct spi_driver afe4403_spi_driver = {
 		.pm = pm_sleep_ptr(&afe4403_pm_ops),
 	},
 	.probe = afe4403_probe,
-	.remove = afe4403_remove,
 	.id_table = afe4403_ids,
 };
 module_spi_driver(afe4403_spi_driver);

From d07f4b7f0413bafd3a62c677ba064aad90c7aca4 Mon Sep 17 00:00:00 2001
From: Andrew Davis <afd@ti.com>
Date: Tue, 23 Jan 2024 08:09:17 -0600
Subject: [PATCH 312/707] iio: health: afe4404: Use devm action helper for
 regulator disable

Use a device lifecycle managed action for regulator disable function.
This helps prevent mistakes like unregistering out of order in cleanup
functions and forgetting to unregister on error paths.

Signed-off-by: Andrew Davis <afd@ti.com>
Link: https://lore.kernel.org/r/20240123140918.215818-3-afd@ti.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/health/afe4404.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/drivers/iio/health/afe4404.c b/drivers/iio/health/afe4404.c
index ede1e82013118c..75a513a92242c1 100644
--- a/drivers/iio/health/afe4404.c
+++ b/drivers/iio/health/afe4404.c
@@ -349,6 +349,13 @@ static irqreturn_t afe4404_trigger_handler(int irq, void *private)
 	return IRQ_HANDLED;
 }
 
+static void afe4404_regulator_disable(void *data)
+{
+	struct regulator *regulator = data;
+
+	regulator_disable(regulator);
+}
+
 /* Default timings from data-sheet */
 #define AFE4404_TIMING_PAIRS			\
 	{ AFE440X_PRPCOUNT,	39999	},	\
@@ -502,19 +509,24 @@ static int afe4404_probe(struct i2c_client *client)
 		dev_err(afe->dev, "Unable to enable regulator\n");
 		return ret;
 	}
+	ret = devm_add_action_or_reset(afe->dev, afe4404_regulator_disable, afe->regulator);
+	if (ret) {
+		dev_err(afe->dev, "Unable to enable regulator\n");
+		return ret;
+	}
 
 	ret = regmap_write(afe->regmap, AFE440X_CONTROL0,
 			   AFE440X_CONTROL0_SW_RESET);
 	if (ret) {
 		dev_err(afe->dev, "Unable to reset device\n");
-		goto disable_reg;
+		return ret;
 	}
 
 	ret = regmap_multi_reg_write(afe->regmap, afe4404_reg_sequences,
 				     ARRAY_SIZE(afe4404_reg_sequences));
 	if (ret) {
 		dev_err(afe->dev, "Unable to set register defaults\n");
-		goto disable_reg;
+		return ret;
 	}
 
 	indio_dev->modes = INDIO_DIRECT_MODE;
@@ -530,8 +542,7 @@ static int afe4404_probe(struct i2c_client *client)
 						   iio_device_id(indio_dev));
 		if (!afe->trig) {
 			dev_err(afe->dev, "Unable to allocate IIO trigger\n");
-			ret = -ENOMEM;
-			goto disable_reg;
+			return -ENOMEM;
 		}
 
 		iio_trigger_set_drvdata(afe->trig, indio_dev);
@@ -539,7 +550,7 @@ static int afe4404_probe(struct i2c_client *client)
 		ret = iio_trigger_register(afe->trig);
 		if (ret) {
 			dev_err(afe->dev, "Unable to register IIO trigger\n");
-			goto disable_reg;
+			return ret;
 		}
 
 		ret = devm_request_threaded_irq(afe->dev, afe->irq,
@@ -549,7 +560,7 @@ static int afe4404_probe(struct i2c_client *client)
 						afe->trig);
 		if (ret) {
 			dev_err(afe->dev, "Unable to request IRQ\n");
-			goto disable_reg;
+			return ret;
 		}
 	}
 
@@ -573,8 +584,6 @@ static int afe4404_probe(struct i2c_client *client)
 unregister_trigger:
 	if (afe->irq > 0)
 		iio_trigger_unregister(afe->trig);
-disable_reg:
-	regulator_disable(afe->regulator);
 
 	return ret;
 }
@@ -591,10 +600,6 @@ static void afe4404_remove(struct i2c_client *client)
 
 	if (afe->irq > 0)
 		iio_trigger_unregister(afe->trig);
-
-	ret = regulator_disable(afe->regulator);
-	if (ret)
-		dev_err(afe->dev, "Unable to disable regulator\n");
 }
 
 static const struct i2c_device_id afe4404_ids[] = {

From 07af2d40bf7a8632c1a6c74cf967a9290d830679 Mon Sep 17 00:00:00 2001
From: Andrew Davis <afd@ti.com>
Date: Tue, 23 Jan 2024 08:09:18 -0600
Subject: [PATCH 313/707] iio: health: afe4404: Use devm IIO helpers

Use a device lifecycle managed IIO helper functions. This helps prevent
mistakes like unregistering and freeing out of order in cleanup functions
and forgetting to unregister and free on error paths.

Signed-off-by: Andrew Davis <afd@ti.com>
Link: https://lore.kernel.org/r/20240123140918.215818-4-afd@ti.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/health/afe4404.c | 36 +++++++-----------------------------
 1 file changed, 7 insertions(+), 29 deletions(-)

diff --git a/drivers/iio/health/afe4404.c b/drivers/iio/health/afe4404.c
index 75a513a92242c1..7768b07ef7a6f8 100644
--- a/drivers/iio/health/afe4404.c
+++ b/drivers/iio/health/afe4404.c
@@ -547,7 +547,7 @@ static int afe4404_probe(struct i2c_client *client)
 
 		iio_trigger_set_drvdata(afe->trig, indio_dev);
 
-		ret = iio_trigger_register(afe->trig);
+		ret = devm_iio_trigger_register(afe->dev, afe->trig);
 		if (ret) {
 			dev_err(afe->dev, "Unable to register IIO trigger\n");
 			return ret;
@@ -564,42 +564,21 @@ static int afe4404_probe(struct i2c_client *client)
 		}
 	}
 
-	ret = iio_triggered_buffer_setup(indio_dev, &iio_pollfunc_store_time,
-					 afe4404_trigger_handler, NULL);
+	ret = devm_iio_triggered_buffer_setup(afe->dev, indio_dev,
+					      &iio_pollfunc_store_time,
+					      afe4404_trigger_handler, NULL);
 	if (ret) {
 		dev_err(afe->dev, "Unable to setup buffer\n");
-		goto unregister_trigger;
+		return ret;
 	}
 
-	ret = iio_device_register(indio_dev);
+	ret = devm_iio_device_register(afe->dev, indio_dev);
 	if (ret) {
 		dev_err(afe->dev, "Unable to register IIO device\n");
-		goto unregister_triggered_buffer;
+		return ret;
 	}
 
 	return 0;
-
-unregister_triggered_buffer:
-	iio_triggered_buffer_cleanup(indio_dev);
-unregister_trigger:
-	if (afe->irq > 0)
-		iio_trigger_unregister(afe->trig);
-
-	return ret;
-}
-
-static void afe4404_remove(struct i2c_client *client)
-{
-	struct iio_dev *indio_dev = i2c_get_clientdata(client);
-	struct afe4404_data *afe = iio_priv(indio_dev);
-	int ret;
-
-	iio_device_unregister(indio_dev);
-
-	iio_triggered_buffer_cleanup(indio_dev);
-
-	if (afe->irq > 0)
-		iio_trigger_unregister(afe->trig);
 }
 
 static const struct i2c_device_id afe4404_ids[] = {
@@ -615,7 +594,6 @@ static struct i2c_driver afe4404_i2c_driver = {
 		.pm = pm_sleep_ptr(&afe4404_pm_ops),
 	},
 	.probe = afe4404_probe,
-	.remove = afe4404_remove,
 	.id_table = afe4404_ids,
 };
 module_i2c_driver(afe4404_i2c_driver);

From f460d1951562aae64af600cab35619633e65b8c8 Mon Sep 17 00:00:00 2001
From: Kim Seer Paller <kimseer.paller@analog.com>
Date: Tue, 23 Jan 2024 16:10:58 +0800
Subject: [PATCH 314/707] dt-bindings: iio: frequency: add admfm2000

Dual microwave down converter module with input RF and LO frequency
ranges from 0.5 to 32 GHz and an output IF frequency range from 0.1 to
8 GHz. It consists of a LNA, mixer, IF filter, DSA, and IF amplifier
for each down conversion path.

Signed-off-by: Kim Seer Paller <kimseer.paller@analog.com>
Reviewed-by: Conor Dooley <conor.dooley@microchip.com>
Link: https://lore.kernel.org/r/20240123081059.5746-1-kimseer.paller@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 .../bindings/iio/frequency/adi,admfm2000.yaml | 127 ++++++++++++++++++
 MAINTAINERS                                   |   7 +
 2 files changed, 134 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/iio/frequency/adi,admfm2000.yaml

diff --git a/Documentation/devicetree/bindings/iio/frequency/adi,admfm2000.yaml b/Documentation/devicetree/bindings/iio/frequency/adi,admfm2000.yaml
new file mode 100644
index 00000000000000..2bcf4bbc12e41f
--- /dev/null
+++ b/Documentation/devicetree/bindings/iio/frequency/adi,admfm2000.yaml
@@ -0,0 +1,127 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+# Copyright 2024 Analog Devices Inc.
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/iio/frequency/adi,admfm2000.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ADMFM2000 Dual Microwave Down Converter
+
+maintainers:
+  - Kim Seer Paller <kimseer.paller@analog.com>
+
+description:
+  Dual microwave down converter module with input RF and LO frequency ranges
+  from 0.5 to 32 GHz and an output IF frequency range from 0.1 to 8 GHz.
+  It consists of a LNA, mixer, IF filter, DSA, and IF amplifier for each down
+  conversion path.
+
+properties:
+  compatible:
+    enum:
+      - adi,admfm2000
+
+  '#address-cells':
+    const: 1
+
+  '#size-cells':
+    const: 0
+
+patternProperties:
+  "^channel@[0-1]$":
+    type: object
+    description: Represents a channel of the device.
+
+    additionalProperties: false
+
+    properties:
+      reg:
+        description:
+          The channel number.
+        minimum: 0
+        maximum: 1
+
+      adi,mixer-mode:
+        description:
+          Enable mixer mode for the channel. It downconverts RF between 5 GHz
+          and 32 GHz to IF between 0.5 GHz and 8 GHz. If not present, the channel
+          is in direct IF mode which bypasses the mixer and downconverts RF
+          between 2 GHz and 8 GHz to IF between 0.5 GHz and 8 GHz.
+        type: boolean
+
+      switch-gpios:
+        description: |
+          GPIOs to select the RF path for the channel. The same state of CTRL-A
+          and CTRL-B GPIOs is not permitted.
+          CTRL-A   CTRL-B    CH1 Status        CH2 Status
+          1        0         Direct IF mode    Mixer mode
+          0        1         Mixer mode        Direct IF mode
+
+        items:
+          - description: CTRL-A GPIO
+          - description: CTRL-B GPIO
+
+      attenuation-gpios:
+        description: |
+          Choice of attenuation:
+          DSA-V4  DSA-V3  DSA-V2  DSA-V1  DSA-V0
+          1       1       1       1       1        0 dB
+          1       1       1       1       0        -1 dB
+          1       1       1       0       1        -2 dB
+          1       1       0       1       1        -4 dB
+          1       0       1       1       1        -8 dB
+          0       1       1       1       1        -16 dB
+          0       0       0       0       0        -31 dB
+
+        items:
+          - description: DSA-V0 GPIO
+          - description: DSA-V1 GPIO
+          - description: DSA-V2 GPIO
+          - description: DSA-V3 GPIO
+          - description: DSA-V4 GPIO
+
+    required:
+      - reg
+      - switch-gpios
+      - attenuation-gpios
+
+required:
+  - compatible
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/gpio/gpio.h>
+    converter {
+      compatible = "adi,admfm2000";
+
+      #address-cells = <1>;
+      #size-cells = <0>;
+
+      channel@0 {
+        reg = <0>;
+        switch-gpios = <&gpio 1 GPIO_ACTIVE_LOW>,
+                       <&gpio 2 GPIO_ACTIVE_HIGH>;
+
+        attenuation-gpios = <&gpio 17 GPIO_ACTIVE_LOW>,
+                            <&gpio 22 GPIO_ACTIVE_LOW>,
+                            <&gpio 23 GPIO_ACTIVE_LOW>,
+                            <&gpio 24 GPIO_ACTIVE_LOW>,
+                            <&gpio 25 GPIO_ACTIVE_LOW>;
+      };
+
+      channel@1 {
+        reg = <1>;
+        adi,mixer-mode;
+        switch-gpios = <&gpio 3 GPIO_ACTIVE_LOW>,
+                       <&gpio 4 GPIO_ACTIVE_HIGH>;
+
+        attenuation-gpios = <&gpio 0 GPIO_ACTIVE_LOW>,
+                            <&gpio 5 GPIO_ACTIVE_LOW>,
+                            <&gpio 6 GPIO_ACTIVE_LOW>,
+                            <&gpio 16 GPIO_ACTIVE_LOW>,
+                            <&gpio 26 GPIO_ACTIVE_LOW>;
+      };
+    };
+...
diff --git a/MAINTAINERS b/MAINTAINERS
index 0d015d85e30c9f..10c8048771b48f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1277,6 +1277,13 @@ W:	https://ez.analog.com/linux-software-drivers
 F:	Documentation/devicetree/bindings/hwmon/adi,adm1177.yaml
 F:	drivers/hwmon/adm1177.c
 
+ANALOG DEVICES INC ADMFM2000 DRIVER
+M:	Kim Seer Paller <kimseer.paller@analog.com>
+L:	linux-iio@vger.kernel.org
+S:	Supported
+W:	https://ez.analog.com/linux-software-drivers
+F:	Documentation/devicetree/bindings/iio/frequency/adi,admfm2000.yaml
+
 ANALOG DEVICES INC ADMV1013 DRIVER
 M:	Antoniu Miclaus <antoniu.miclaus@analog.com>
 L:	linux-iio@vger.kernel.org

From a0295c1bd4a79461f291c9b0df0523cbbeb75560 Mon Sep 17 00:00:00 2001
From: Kim Seer Paller <kimseer.paller@analog.com>
Date: Tue, 23 Jan 2024 16:10:59 +0800
Subject: [PATCH 315/707] iio: frequency: admfm2000: New driver

Dual microwave down converter module with input RF and LO frequency
ranges from 0.5 to 32 GHz and an output IF frequency range from 0.1 to
8 GHz. It consists of a LNA, mixer, IF filter, DSA, and IF amplifier
for each down conversion path.

Signed-off-by: Kim Seer Paller <kimseer.paller@analog.com>
Link: https://lore.kernel.org/r/20240123081059.5746-2-kimseer.paller@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 MAINTAINERS                       |   1 +
 drivers/iio/frequency/Kconfig     |  10 ++
 drivers/iio/frequency/Makefile    |   1 +
 drivers/iio/frequency/admfm2000.c | 282 ++++++++++++++++++++++++++++++
 4 files changed, 294 insertions(+)
 create mode 100644 drivers/iio/frequency/admfm2000.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 10c8048771b48f..00d354af10f5dd 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1283,6 +1283,7 @@ L:	linux-iio@vger.kernel.org
 S:	Supported
 W:	https://ez.analog.com/linux-software-drivers
 F:	Documentation/devicetree/bindings/iio/frequency/adi,admfm2000.yaml
+F:	drivers/iio/frequency/admfm2000.c
 
 ANALOG DEVICES INC ADMV1013 DRIVER
 M:	Antoniu Miclaus <antoniu.miclaus@analog.com>
diff --git a/drivers/iio/frequency/Kconfig b/drivers/iio/frequency/Kconfig
index 9e85dfa585081c..c455be7d4a1c88 100644
--- a/drivers/iio/frequency/Kconfig
+++ b/drivers/iio/frequency/Kconfig
@@ -60,6 +60,16 @@ config ADF4377
 	  To compile this driver as a module, choose M here: the
 	  module will be called adf4377.
 
+config ADMFM2000
+	tristate "Analog Devices ADMFM2000 Dual Microwave Down Converter"
+	depends on GPIOLIB
+	help
+	  Say yes here to build support for Analog Devices ADMFM2000 Dual
+	  Microwave Down Converter.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called admfm2000.
+
 config ADMV1013
 	tristate "Analog Devices ADMV1013 Microwave Upconverter"
 	depends on SPI && COMMON_CLK
diff --git a/drivers/iio/frequency/Makefile b/drivers/iio/frequency/Makefile
index b616c29b4a0873..70d0e0b70e8021 100644
--- a/drivers/iio/frequency/Makefile
+++ b/drivers/iio/frequency/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_AD9523) += ad9523.o
 obj-$(CONFIG_ADF4350) += adf4350.o
 obj-$(CONFIG_ADF4371) += adf4371.o
 obj-$(CONFIG_ADF4377) += adf4377.o
+obj-$(CONFIG_ADMFM2000) += admfm2000.o
 obj-$(CONFIG_ADMV1013) += admv1013.o
 obj-$(CONFIG_ADMV1014) += admv1014.o
 obj-$(CONFIG_ADMV4420) += admv4420.o
diff --git a/drivers/iio/frequency/admfm2000.c b/drivers/iio/frequency/admfm2000.c
new file mode 100644
index 00000000000000..c34d79e55a7c58
--- /dev/null
+++ b/drivers/iio/frequency/admfm2000.c
@@ -0,0 +1,282 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ADMFM2000 Dual Microwave Down Converter
+ *
+ * Copyright 2024 Analog Devices Inc.
+ */
+
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/gpio/consumer.h>
+#include <linux/iio/iio.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mod_devicetable.h>
+#include <linux/platform_device.h>
+#include <linux/property.h>
+
+#define ADMFM2000_MIXER_MODE		0
+#define ADMFM2000_DIRECT_IF_MODE	1
+#define ADMFM2000_DSA_GPIOS		5
+#define ADMFM2000_MODE_GPIOS		2
+#define ADMFM2000_MAX_GAIN		0
+#define ADMFM2000_MIN_GAIN		-31000
+#define ADMFM2000_DEFAULT_GAIN		-0x20
+
+struct admfm2000_state {
+	struct mutex			lock; /* protect sensor state */
+	struct gpio_desc		*sw1_ch[2];
+	struct gpio_desc		*sw2_ch[2];
+	struct gpio_desc		*dsa1_gpios[5];
+	struct gpio_desc		*dsa2_gpios[5];
+	u32				gain[2];
+};
+
+static int admfm2000_mode(struct iio_dev *indio_dev, u32 chan, u32 mode)
+{
+	struct admfm2000_state *st = iio_priv(indio_dev);
+	int i;
+
+	switch (mode) {
+	case ADMFM2000_MIXER_MODE:
+		for (i = 0; i < ADMFM2000_MODE_GPIOS; i++) {
+			gpiod_set_value_cansleep(st->sw1_ch[i], (chan == 0) ? 1 : 0);
+			gpiod_set_value_cansleep(st->sw2_ch[i], (chan == 0) ? 0 : 1);
+		}
+		return 0;
+	case ADMFM2000_DIRECT_IF_MODE:
+		for (i = 0; i < ADMFM2000_MODE_GPIOS; i++) {
+			gpiod_set_value_cansleep(st->sw1_ch[i], (chan == 0) ? 0 : 1);
+			gpiod_set_value_cansleep(st->sw2_ch[i], (chan == 0) ? 1 : 0);
+		}
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+static int admfm2000_attenuation(struct iio_dev *indio_dev, u32 chan, u32 value)
+{
+	struct admfm2000_state *st = iio_priv(indio_dev);
+	int i;
+
+	switch (chan) {
+	case 0:
+		for (i = 0; i < ADMFM2000_DSA_GPIOS; i++)
+			gpiod_set_value_cansleep(st->dsa1_gpios[i], value & (1 << i));
+		return 0;
+	case 1:
+		for (i = 0; i < ADMFM2000_DSA_GPIOS; i++)
+			gpiod_set_value_cansleep(st->dsa2_gpios[i], value & (1 << i));
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+static int admfm2000_read_raw(struct iio_dev *indio_dev,
+			      struct iio_chan_spec const *chan, int *val,
+			      int *val2, long mask)
+{
+	struct admfm2000_state *st = iio_priv(indio_dev);
+	int gain;
+
+	switch (mask) {
+	case IIO_CHAN_INFO_HARDWAREGAIN:
+		mutex_lock(&st->lock);
+		gain = ~(st->gain[chan->channel]) * -1000;
+		*val = gain / 1000;
+		*val2 = (gain % 1000) * 1000;
+		mutex_unlock(&st->lock);
+
+		return IIO_VAL_INT_PLUS_MICRO_DB;
+	default:
+		return -EINVAL;
+	}
+}
+
+static int admfm2000_write_raw(struct iio_dev *indio_dev,
+			       struct iio_chan_spec const *chan, int val,
+			       int val2, long mask)
+{
+	struct admfm2000_state *st = iio_priv(indio_dev);
+	int gain, ret;
+
+	if (val < 0)
+		gain = (val * 1000) - (val2 / 1000);
+	else
+		gain = (val * 1000) + (val2 / 1000);
+
+	if (gain > ADMFM2000_MAX_GAIN || gain < ADMFM2000_MIN_GAIN)
+		return -EINVAL;
+
+	switch (mask) {
+	case IIO_CHAN_INFO_HARDWAREGAIN:
+		mutex_lock(&st->lock);
+		st->gain[chan->channel] = ~((abs(gain) / 1000) & 0x1F);
+
+		ret = admfm2000_attenuation(indio_dev, chan->channel,
+					    st->gain[chan->channel]);
+		mutex_unlock(&st->lock);
+		return ret;
+	default:
+		return -EINVAL;
+	}
+}
+
+static int admfm2000_write_raw_get_fmt(struct iio_dev *indio_dev,
+				       struct iio_chan_spec const *chan,
+				       long mask)
+{
+	switch (mask) {
+	case IIO_CHAN_INFO_HARDWAREGAIN:
+		return IIO_VAL_INT_PLUS_MICRO_DB;
+	default:
+		return -EINVAL;
+	}
+}
+
+static const struct iio_info admfm2000_info = {
+	.read_raw = &admfm2000_read_raw,
+	.write_raw = &admfm2000_write_raw,
+	.write_raw_get_fmt = &admfm2000_write_raw_get_fmt,
+};
+
+#define ADMFM2000_CHAN(_channel) {					\
+	.type = IIO_VOLTAGE,						\
+	.output = 1,							\
+	.indexed = 1,							\
+	.channel = _channel,						\
+	.info_mask_separate = BIT(IIO_CHAN_INFO_HARDWAREGAIN),		\
+}
+
+static const struct iio_chan_spec admfm2000_channels[] = {
+	ADMFM2000_CHAN(0),
+	ADMFM2000_CHAN(1),
+};
+
+static int admfm2000_channel_config(struct admfm2000_state *st,
+				    struct iio_dev *indio_dev)
+{
+	struct platform_device *pdev = to_platform_device(indio_dev->dev.parent);
+	struct device *dev = &pdev->dev;
+	struct fwnode_handle *child;
+	struct gpio_desc **dsa;
+	struct gpio_desc **sw;
+	int ret, i;
+	bool mode;
+	u32 reg;
+
+	device_for_each_child_node(dev, child) {
+		ret = fwnode_property_read_u32(child, "reg", &reg);
+		if (ret) {
+			fwnode_handle_put(child);
+			return dev_err_probe(dev, ret,
+					     "Failed to get reg property\n");
+		}
+
+		if (reg >= indio_dev->num_channels) {
+			fwnode_handle_put(child);
+			return dev_err_probe(dev, -EINVAL, "reg bigger than: %d\n",
+					     indio_dev->num_channels);
+		}
+
+		if (fwnode_property_present(child, "adi,mixer-mode"))
+			mode = ADMFM2000_MIXER_MODE;
+		else
+			mode = ADMFM2000_DIRECT_IF_MODE;
+
+		switch (reg) {
+		case 0:
+			sw = st->sw1_ch;
+			dsa = st->dsa1_gpios;
+			break;
+		case 1:
+			sw = st->sw2_ch;
+			dsa = st->dsa2_gpios;
+			break;
+		default:
+			fwnode_handle_put(child);
+			return -EINVAL;
+		}
+
+		for (i = 0; i < ADMFM2000_MODE_GPIOS; i++) {
+			sw[i] = devm_fwnode_gpiod_get_index(dev, child, "switch",
+							    i, GPIOD_OUT_LOW, NULL);
+			if (IS_ERR(sw[i])) {
+				fwnode_handle_put(child);
+				return dev_err_probe(dev, PTR_ERR(sw[i]),
+						     "Failed to get gpios\n");
+			}
+		}
+
+		for (i = 0; i < ADMFM2000_DSA_GPIOS; i++) {
+			dsa[i] = devm_fwnode_gpiod_get_index(dev, child,
+							     "attenuation", i,
+							     GPIOD_OUT_LOW, NULL);
+			if (IS_ERR(dsa[i])) {
+				fwnode_handle_put(child);
+				return dev_err_probe(dev, PTR_ERR(dsa[i]),
+						     "Failed to get gpios\n");
+			}
+		}
+
+		ret = admfm2000_mode(indio_dev, reg, mode);
+		if (ret) {
+			fwnode_handle_put(child);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+static int admfm2000_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct admfm2000_state *st;
+	struct iio_dev *indio_dev;
+	int ret;
+
+	indio_dev = devm_iio_device_alloc(dev, sizeof(*st));
+	if (!indio_dev)
+		return -ENOMEM;
+
+	st = iio_priv(indio_dev);
+
+	indio_dev->name = "admfm2000";
+	indio_dev->num_channels = ARRAY_SIZE(admfm2000_channels);
+	indio_dev->channels = admfm2000_channels;
+	indio_dev->info = &admfm2000_info;
+	indio_dev->modes = INDIO_DIRECT_MODE;
+
+	st->gain[0] = ADMFM2000_DEFAULT_GAIN;
+	st->gain[1] = ADMFM2000_DEFAULT_GAIN;
+
+	mutex_init(&st->lock);
+
+	ret = admfm2000_channel_config(st, indio_dev);
+	if (ret)
+		return ret;
+
+	return devm_iio_device_register(dev, indio_dev);
+}
+
+static const struct of_device_id admfm2000_of_match[] = {
+	{ .compatible = "adi,admfm2000" },
+	{ }
+};
+MODULE_DEVICE_TABLE(of, admfm2000_of_match);
+
+static struct platform_driver admfm2000_driver = {
+	.driver = {
+		.name = "admfm2000",
+		.of_match_table = admfm2000_of_match,
+	},
+	.probe = admfm2000_probe,
+};
+module_platform_driver(admfm2000_driver);
+
+MODULE_AUTHOR("Kim Seer Paller <kimseer.paller@analog.com>");
+MODULE_DESCRIPTION("ADMFM2000 Dual Microwave Down Converter");
+MODULE_LICENSE("GPL");

From 83253fc066b969c9c86acb44c8dd090f4f39e820 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao@kernel.org>
Date: Fri, 26 Jan 2024 23:19:16 +0800
Subject: [PATCH 316/707] f2fs: support printk_ratelimited() in f2fs_printk()

This patch supports using printk_ratelimited() in f2fs_printk(), and
wrap ratelimited f2fs_printk() into f2fs_{err,warn,info}_ratelimited(),
then, use these new helps to clean up codes.

Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/compress.c | 10 +++++-----
 fs/f2fs/dir.c      |  5 ++---
 fs/f2fs/f2fs.h     | 40 +++++++++++++++++++++++-----------------
 fs/f2fs/super.c    | 11 ++++++++---
 4 files changed, 38 insertions(+), 28 deletions(-)

diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index ff26b49c0d71ff..0fd839358c1576 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -512,8 +512,8 @@ static int lzorle_compress_pages(struct compress_ctx *cc)
 	ret = lzorle1x_1_compress(cc->rbuf, cc->rlen, cc->cbuf->cdata,
 					&cc->clen, cc->private);
 	if (ret != LZO_E_OK) {
-		printk_ratelimited("%sF2FS-fs (%s): lzo-rle compress failed, ret:%d\n",
-				KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, ret);
+		f2fs_err_ratelimited(F2FS_I_SB(cc->inode),
+				"lzo-rle compress failed, ret:%d", ret);
 		return -EIO;
 	}
 	return 0;
@@ -780,9 +780,9 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task)
 		if (provided != calculated) {
 			if (!is_inode_flag_set(dic->inode, FI_COMPRESS_CORRUPT)) {
 				set_inode_flag(dic->inode, FI_COMPRESS_CORRUPT);
-				printk_ratelimited(
-					"%sF2FS-fs (%s): checksum invalid, nid = %lu, %x vs %x",
-					KERN_INFO, sbi->sb->s_id, dic->inode->i_ino,
+				f2fs_info_ratelimited(sbi,
+					"checksum invalid, nid = %lu, %x vs %x",
+					dic->inode->i_ino,
 					provided, calculated);
 			}
 			set_sbi_flag(sbi, SBI_NEED_FSCK);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 042593aed1ec0a..3f20d94e12f90a 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -995,9 +995,8 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
 		de = &d->dentry[bit_pos];
 		if (de->name_len == 0) {
 			if (found_valid_dirent || !bit_pos) {
-				printk_ratelimited(
-					"%sF2FS-fs (%s): invalid namelen(0), ino:%u, run fsck to fix.",
-					KERN_WARNING, sbi->sb->s_id,
+				f2fs_warn_ratelimited(sbi,
+					"invalid namelen(0), ino:%u, run fsck to fix.",
 					le32_to_cpu(de->ino));
 				set_sbi_flag(sbi, SBI_NEED_FSCK);
 			}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 4481f68d64181c..b4b737e43a6b3a 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1812,6 +1812,27 @@ struct f2fs_sb_info {
 #endif
 };
 
+__printf(3, 4)
+void f2fs_printk(struct f2fs_sb_info *sbi, bool limit_rate, const char *fmt, ...);
+
+#define f2fs_err(sbi, fmt, ...)						\
+	f2fs_printk(sbi, false, KERN_ERR fmt, ##__VA_ARGS__)
+#define f2fs_warn(sbi, fmt, ...)					\
+	f2fs_printk(sbi, false, KERN_WARNING fmt, ##__VA_ARGS__)
+#define f2fs_notice(sbi, fmt, ...)					\
+	f2fs_printk(sbi, false, KERN_NOTICE fmt, ##__VA_ARGS__)
+#define f2fs_info(sbi, fmt, ...)					\
+	f2fs_printk(sbi, false, KERN_INFO fmt, ##__VA_ARGS__)
+#define f2fs_debug(sbi, fmt, ...)					\
+	f2fs_printk(sbi, false, KERN_DEBUG fmt, ##__VA_ARGS__)
+
+#define f2fs_err_ratelimited(sbi, fmt, ...)				\
+	f2fs_printk(sbi, true, KERN_ERR fmt, ##__VA_ARGS__)
+#define f2fs_warn_ratelimited(sbi, fmt, ...)				\
+	f2fs_printk(sbi, true, KERN_WARNING fmt, ##__VA_ARGS__)
+#define f2fs_info_ratelimited(sbi, fmt, ...)				\
+	f2fs_printk(sbi, true, KERN_INFO fmt, ##__VA_ARGS__)
+
 #ifdef CONFIG_F2FS_FAULT_INJECTION
 #define time_to_inject(sbi, type) __time_to_inject(sbi, type, __func__,	\
 									__builtin_return_address(0))
@@ -1829,9 +1850,8 @@ static inline bool __time_to_inject(struct f2fs_sb_info *sbi, int type,
 	atomic_inc(&ffi->inject_ops);
 	if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) {
 		atomic_set(&ffi->inject_ops, 0);
-		printk_ratelimited("%sF2FS-fs (%s) : inject %s in %s of %pS\n",
-			KERN_INFO, sbi->sb->s_id, f2fs_fault_name[type],
-			func, parent_func);
+		f2fs_info_ratelimited(sbi, "inject %s in %s of %pS",
+				f2fs_fault_name[type], func, parent_func);
 		return true;
 	}
 	return false;
@@ -2325,20 +2345,6 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi,
 	return -ENOSPC;
 }
 
-__printf(2, 3)
-void f2fs_printk(struct f2fs_sb_info *sbi, const char *fmt, ...);
-
-#define f2fs_err(sbi, fmt, ...)						\
-	f2fs_printk(sbi, KERN_ERR fmt, ##__VA_ARGS__)
-#define f2fs_warn(sbi, fmt, ...)					\
-	f2fs_printk(sbi, KERN_WARNING fmt, ##__VA_ARGS__)
-#define f2fs_notice(sbi, fmt, ...)					\
-	f2fs_printk(sbi, KERN_NOTICE fmt, ##__VA_ARGS__)
-#define f2fs_info(sbi, fmt, ...)					\
-	f2fs_printk(sbi, KERN_INFO fmt, ##__VA_ARGS__)
-#define f2fs_debug(sbi, fmt, ...)					\
-	f2fs_printk(sbi, KERN_DEBUG fmt, ##__VA_ARGS__)
-
 #define PAGE_PRIVATE_GET_FUNC(name, flagname) \
 static inline bool page_private_##name(struct page *page) \
 { \
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index e2c066fbc0fa11..3e2a5e3b3e9919 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -264,7 +264,8 @@ static match_table_t f2fs_tokens = {
 	{Opt_err, NULL},
 };
 
-void f2fs_printk(struct f2fs_sb_info *sbi, const char *fmt, ...)
+void f2fs_printk(struct f2fs_sb_info *sbi, bool limit_rate,
+						const char *fmt, ...)
 {
 	struct va_format vaf;
 	va_list args;
@@ -275,8 +276,12 @@ void f2fs_printk(struct f2fs_sb_info *sbi, const char *fmt, ...)
 	level = printk_get_level(fmt);
 	vaf.fmt = printk_skip_level(fmt);
 	vaf.va = &args;
-	printk("%c%cF2FS-fs (%s): %pV\n",
-	       KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf);
+	if (limit_rate)
+		printk_ratelimited("%c%cF2FS-fs (%s): %pV\n",
+			KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf);
+	else
+		printk("%c%cF2FS-fs (%s): %pV\n",
+			KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf);
 
 	va_end(args);
 }

From 9a63d6e6382a93b8c6fa7f53d1ec64d9b87e7293 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao@kernel.org>
Date: Fri, 26 Jan 2024 23:19:17 +0800
Subject: [PATCH 317/707] f2fs: use f2fs_err_ratelimited() to avoid redundant
 logs

Use f2fs_err_ratelimited() to instead f2fs_err() in
f2fs_record_stop_reason() and f2fs_record_errors() to
avoid redundant logs.

Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/super.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 3e2a5e3b3e9919..1b718bebfaa106 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -4096,7 +4096,9 @@ static void f2fs_record_stop_reason(struct f2fs_sb_info *sbi)
 
 	f2fs_up_write(&sbi->sb_lock);
 	if (err)
-		f2fs_err(sbi, "f2fs_commit_super fails to record err:%d", err);
+		f2fs_err_ratelimited(sbi,
+			"f2fs_commit_super fails to record stop_reason, err:%d",
+			err);
 }
 
 void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag)
@@ -4139,8 +4141,9 @@ static void f2fs_record_errors(struct f2fs_sb_info *sbi, unsigned char error)
 
 	err = f2fs_commit_super(sbi, false);
 	if (err)
-		f2fs_err(sbi, "f2fs_commit_super fails to record errors:%u, err:%d",
-								error, err);
+		f2fs_err_ratelimited(sbi,
+			"f2fs_commit_super fails to record errors:%u, err:%d",
+			error, err);
 out_unlock:
 	f2fs_up_write(&sbi->sb_lock);
 }

From 9f100ecdedc3f9a5d8a7aeb5b53bc12659825f9f Mon Sep 17 00:00:00 2001
From: Chao Yu <chao@kernel.org>
Date: Mon, 22 Jan 2024 10:23:13 +0800
Subject: [PATCH 318/707] f2fs: compress: fix to cover
 f2fs_disable_compressed_file() w/ i_sem

- f2fs_disable_compressed_file
  - check inode_has_data
					- f2fs_file_mmap
					- mkwrite
					 - f2fs_get_block_locked
					 : update metadata in compressed
					   inode's disk layout
  - fi->i_flags &= ~F2FS_COMPR_FL
  - clear_inode_flag(inode, FI_COMPRESSED_FILE);

we should use i_sem lock to prevent above race case.

Fixes: 4c8ff7095bef ("f2fs: support data compression")
Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/f2fs.h | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index b4b737e43a6b3a..40d428636532ab 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -4415,15 +4415,24 @@ static inline bool f2fs_disable_compressed_file(struct inode *inode)
 {
 	struct f2fs_inode_info *fi = F2FS_I(inode);
 
-	if (!f2fs_compressed_file(inode))
+	f2fs_down_write(&F2FS_I(inode)->i_sem);
+
+	if (!f2fs_compressed_file(inode)) {
+		f2fs_up_write(&F2FS_I(inode)->i_sem);
 		return true;
-	if (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode))
+	}
+	if (f2fs_is_mmap_file(inode) ||
+		(S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode))) {
+		f2fs_up_write(&F2FS_I(inode)->i_sem);
 		return false;
+	}
 
 	fi->i_flags &= ~F2FS_COMPR_FL;
 	stat_dec_compr_inode(inode);
 	clear_inode_flag(inode, FI_COMPRESSED_FILE);
 	f2fs_mark_inode_dirty_sync(inode, true);
+
+	f2fs_up_write(&F2FS_I(inode)->i_sem);
 	return true;
 }
 

From 16c326c7a519c6148766cc78c8a251bd7b62345d Mon Sep 17 00:00:00 2001
From: Zhiguo Niu <zhiguo.niu@unisoc.com>
Date: Thu, 18 Jan 2024 13:48:31 +0800
Subject: [PATCH 319/707] f2fs: compress: remove some redundant codes in
 f2fs_cache_compressed_page

Just remove some redundant codes, no logic change.

Signed-off-by: Zhiguo Niu <zhiguo.niu@unisoc.com>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/compress.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 0fd839358c1576..3dc488ce882be6 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -1889,12 +1889,8 @@ void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page,
 
 	set_page_private_data(cpage, ino);
 
-	if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ))
-		goto out;
-
 	memcpy(page_address(cpage), page_address(page), PAGE_SIZE);
 	SetPageUptodate(cpage);
-out:
 	f2fs_put_page(cpage, 1);
 }
 

From 5e9f083a7ae81058ceffd91e00c6fea3e7078dce Mon Sep 17 00:00:00 2001
From: Zhiguo Niu <zhiguo.niu@unisoc.com>
Date: Wed, 17 Jan 2024 15:59:58 +0800
Subject: [PATCH 320/707] f2fs: use IS_INODE replace IS_DNODE in
 f2fs_flush_inline_data

Now IS_DNODE is used in f2fs_flush_inline_data and it has some problems:
1. Just only inodes may include inline data,not all direct nodes
2. When system IO is busy, it is inefficient to lock a direct node page
but not an inode page. Besides, if this direct node page is being
locked by others for IO, f2fs_flush_inline_data will be blocked here,
which will affects the checkpoint process, this is unreasonable.

So IS_INODE should be used in f2fs_flush_inline_data.

Signed-off-by: Zhiguo Niu <zhiguo.niu@unisoc.com>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/node.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 9b546fd2101004..1d898a16f05a1d 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1919,7 +1919,7 @@ void f2fs_flush_inline_data(struct f2fs_sb_info *sbi)
 		for (i = 0; i < nr_folios; i++) {
 			struct page *page = &fbatch.folios[i]->page;
 
-			if (!IS_DNODE(page))
+			if (!IS_INODE(page))
 				continue;
 
 			lock_page(page);

From f31438c16879f0612fae83f02b11367c906a7d00 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao@kernel.org>
Date: Wed, 24 Jan 2024 22:49:15 +0800
Subject: [PATCH 321/707] f2fs: fix to avoid potential panic during recovery

During recovery, if FAULT_BLOCK is on, it is possible that
f2fs_reserve_new_block() will return -ENOSPC during recovery,
then it may trigger panic.

Also, if fault injection rate is 1 and only FAULT_BLOCK fault
type is on, it may encounter deadloop in loop of block reservation.

Let's change as below to fix these issues:
- remove bug_on() to avoid panic.
- limit the loop count of block reservation to avoid potential
deadloop.

Fixes: 956fa1ddc132 ("f2fs: fix to check return value of f2fs_reserve_new_block()")
Reported-by: Zhiguo Niu <zhiguo.niu@unisoc.com>
Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/f2fs.h     |  5 +++++
 fs/f2fs/recovery.c | 33 ++++++++++++++++-----------------
 2 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 40d428636532ab..543898482f8b0c 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -76,6 +76,11 @@ struct f2fs_fault_info {
 
 extern const char *f2fs_fault_name[FAULT_MAX];
 #define IS_FAULT_SET(fi, type) ((fi)->inject_type & BIT(type))
+
+/* maximum retry count for injected failure */
+#define DEFAULT_FAILURE_RETRY_COUNT		8
+#else
+#define DEFAULT_FAILURE_RETRY_COUNT		1
 #endif
 
 /*
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index d0f24ccbd1ac6e..aad1d1a9b3d647 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -611,6 +611,19 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
 	return 0;
 }
 
+static int f2fs_reserve_new_block_retry(struct dnode_of_data *dn)
+{
+	int i, err = 0;
+
+	for (i = DEFAULT_FAILURE_RETRY_COUNT; i > 0; i--) {
+		err = f2fs_reserve_new_block(dn);
+		if (!err)
+			break;
+	}
+
+	return err;
+}
+
 static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
 					struct page *page)
 {
@@ -712,14 +725,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
 		 */
 		if (dest == NEW_ADDR) {
 			f2fs_truncate_data_blocks_range(&dn, 1);
-			do {
-				err = f2fs_reserve_new_block(&dn);
-				if (err == -ENOSPC) {
-					f2fs_bug_on(sbi, 1);
-					break;
-				}
-			} while (err &&
-				IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION));
+
+			err = f2fs_reserve_new_block_retry(&dn);
 			if (err)
 				goto err;
 			continue;
@@ -727,16 +734,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
 
 		/* dest is valid block, try to recover from src to dest */
 		if (f2fs_is_valid_blkaddr(sbi, dest, META_POR)) {
-
 			if (src == NULL_ADDR) {
-				do {
-					err = f2fs_reserve_new_block(&dn);
-					if (err == -ENOSPC) {
-						f2fs_bug_on(sbi, 1);
-						break;
-					}
-				} while (err &&
-					IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION));
+				err = f2fs_reserve_new_block_retry(&dn);
 				if (err)
 					goto err;
 			}

From 55bc47ddbfbadc16e286891f2b3b6c92c2eeeeff Mon Sep 17 00:00:00 2001
From: Li Ming <ming4.li@intel.com>
Date: Mon, 29 Jan 2024 13:18:56 +0000
Subject: [PATCH 322/707] cxl/pci: Skip to handle RAS errors if CXL.mem device
 is detached

The PCI AER model is an awkward fit for CXL error handling. While the
expectation is that a PCI device can escalate to link reset to recover
from an AER event, the same reset on CXL amounts to a suprise memory
hotplug of massive amounts of memory.

At present, the CXL error handler attempts some optimisitic error
handling to unbind the device from the cxl_mem driver after reaping some
RAS register values. This results in a "hopeful" attempt to unplug the
memory, but there is no guarantee that will succeed.

A subsequent AER notification after the memdev unbind event can no
longer assume the registers are mapped. Check for memdev bind before
reaping status register values to avoid crashes of the form:

 BUG: unable to handle page fault for address: ffa00000195e9100
 #PF: supervisor read access in kernel mode
 #PF: error_code(0x0000) - not-present page
 [...]
 RIP: 0010:__cxl_handle_ras+0x30/0x110 [cxl_core]
 [...]
 Call Trace:
  <TASK>
  ? __die+0x24/0x70
  ? page_fault_oops+0x82/0x160
  ? kernelmode_fixup_or_oops+0x84/0x110
  ? exc_page_fault+0x113/0x170
  ? asm_exc_page_fault+0x26/0x30
  ? __pfx_dpc_reset_link+0x10/0x10
  ? __cxl_handle_ras+0x30/0x110 [cxl_core]
  ? find_cxl_port+0x59/0x80 [cxl_core]
  cxl_handle_rp_ras+0xbc/0xd0 [cxl_core]
  cxl_error_detected+0x6c/0xf0 [cxl_core]
  report_error_detected+0xc7/0x1c0
  pci_walk_bus+0x73/0x90
  pcie_do_recovery+0x23f/0x330

Longer term, the unbind and PCI_ERS_RESULT_DISCONNECT behavior might
need to be replaced with a new PCI_ERS_RESULT_PANIC.

Fixes: 6ac07883dbb5 ("cxl/pci: Add RCH downstream port error logging")
Cc: stable@vger.kernel.org
Suggested-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Li Ming <ming4.li@intel.com>
Link: https://lore.kernel.org/r/20240129131856.2458980-1-ming4.li@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/cxl/core/pci.c | 43 ++++++++++++++++++++++++++++++------------
 1 file changed, 31 insertions(+), 12 deletions(-)

diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 6c9c8d92f8f714..480489f5644e18 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -932,11 +932,21 @@ static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { }
 void cxl_cor_error_detected(struct pci_dev *pdev)
 {
 	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
+	struct device *dev = &cxlds->cxlmd->dev;
+
+	scoped_guard(device, dev) {
+		if (!dev->driver) {
+			dev_warn(&pdev->dev,
+				 "%s: memdev disabled, abort error handling\n",
+				 dev_name(dev));
+			return;
+		}
 
-	if (cxlds->rcd)
-		cxl_handle_rdport_errors(cxlds);
+		if (cxlds->rcd)
+			cxl_handle_rdport_errors(cxlds);
 
-	cxl_handle_endpoint_cor_ras(cxlds);
+		cxl_handle_endpoint_cor_ras(cxlds);
+	}
 }
 EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, CXL);
 
@@ -948,16 +958,25 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
 	struct device *dev = &cxlmd->dev;
 	bool ue;
 
-	if (cxlds->rcd)
-		cxl_handle_rdport_errors(cxlds);
+	scoped_guard(device, dev) {
+		if (!dev->driver) {
+			dev_warn(&pdev->dev,
+				 "%s: memdev disabled, abort error handling\n",
+				 dev_name(dev));
+			return PCI_ERS_RESULT_DISCONNECT;
+		}
+
+		if (cxlds->rcd)
+			cxl_handle_rdport_errors(cxlds);
+		/*
+		 * A frozen channel indicates an impending reset which is fatal to
+		 * CXL.mem operation, and will likely crash the system. On the off
+		 * chance the situation is recoverable dump the status of the RAS
+		 * capability registers and bounce the active state of the memdev.
+		 */
+		ue = cxl_handle_endpoint_ras(cxlds);
+	}
 
-	/*
-	 * A frozen channel indicates an impending reset which is fatal to
-	 * CXL.mem operation, and will likely crash the system. On the off
-	 * chance the situation is recoverable dump the status of the RAS
-	 * capability registers and bounce the active state of the memdev.
-	 */
-	ue = cxl_handle_endpoint_ras(cxlds);
 
 	switch (state) {
 	case pci_channel_io_normal:

From eeab239d6a2418fc5d2cd7ea76187085a97acde0 Mon Sep 17 00:00:00 2001
From: Fullway Wang <fullwaywang@outlook.com>
Date: Thu, 18 Jan 2024 15:52:49 +0800
Subject: [PATCH 323/707] ASoC: wcd934x: fix an incorrect use of kstrndup()

In wcd934x_codec_enable_dec(), kstrndup() is used to alloc memory.
However, kmemdup_nul() should be used instead with the size known.

This is similar to CVE-2019-12454 which was fixed in commit
a549881.

Signed-off-by: Fullway Wang <fullwaywang@outlook.com>
Link: https://msgid.link/r/PH7PR20MB59255EF9DFFB022CB1BBB574BF712@PH7PR20MB5925.namprd20.prod.outlook.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 sound/soc/codecs/wcd934x.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sound/soc/codecs/wcd934x.c b/sound/soc/codecs/wcd934x.c
index 6813268e6a19f3..eaed17760ddda9 100644
--- a/sound/soc/codecs/wcd934x.c
+++ b/sound/soc/codecs/wcd934x.c
@@ -4989,7 +4989,7 @@ static int wcd934x_codec_enable_dec(struct snd_soc_dapm_widget *w,
 	char *dec;
 	u8 hpf_coff_freq;
 
-	widget_name = kstrndup(w->name, 15, GFP_KERNEL);
+	widget_name = kmemdup_nul(w->name, 15, GFP_KERNEL);
 	if (!widget_name)
 		return -ENOMEM;
 

From 94a571344df867011b60e7c1399dea29410d7bd4 Mon Sep 17 00:00:00 2001
From: Alison Schofield <alison.schofield@intel.com>
Date: Fri, 12 Jan 2024 12:09:50 -0800
Subject: [PATCH 324/707] x86/numa: Fix the address overlap check in
 numa_fill_memblks()

numa_fill_memblks() fills in the gaps in numa_meminfo memblks over a
physical address range. To do so, it first creates a list of existing
memblks that overlap that address range. The issue is that it is off
by one when comparing to the end of the address range, so memblks
that do not overlap are selected.

The impact of selecting a memblk that does not actually overlap is
that an existing memblk may be filled when the expected action is to
do nothing and return NUMA_NO_MEMBLK to the caller. The caller can
then add a new NUMA node and memblk.

Replace the broken open-coded search for address overlap with the
memblock helper memblock_addrs_overlap(). Update the kernel doc
and in code comments.

Suggested by: "Huang, Ying" <ying.huang@intel.com>

Fixes: 8f012db27c95 ("x86/numa: Introduce numa_fill_memblks()")
Signed-off-by: Alison Schofield <alison.schofield@intel.com>
Acked-by: Mike Rapoport (IBM) <rppt@kernel.org>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Link: https://lore.kernel.org/r/10a3e6109c34c21a8dd4c513cf63df63481a2b07.1705085543.git.alison.schofield@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/x86/mm/numa.c       | 19 +++++++------------
 include/linux/memblock.h |  2 ++
 mm/memblock.c            |  5 +++--
 3 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index adc497b93f0374..8ada9bbfad583f 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -944,14 +944,12 @@ static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata;
  * @start: address to begin fill
  * @end: address to end fill
  *
- * Find and extend numa_meminfo memblks to cover the @start-@end
- * physical address range, such that the first memblk includes
- * @start, the last memblk includes @end, and any gaps in between
- * are filled.
+ * Find and extend numa_meminfo memblks to cover the physical
+ * address range @start-@end
  *
  * RETURNS:
  * 0		  : Success
- * NUMA_NO_MEMBLK : No memblk exists in @start-@end range
+ * NUMA_NO_MEMBLK : No memblks exist in address range @start-@end
  */
 
 int __init numa_fill_memblks(u64 start, u64 end)
@@ -963,17 +961,14 @@ int __init numa_fill_memblks(u64 start, u64 end)
 
 	/*
 	 * Create a list of pointers to numa_meminfo memblks that
-	 * overlap start, end. Exclude (start == bi->end) since
-	 * end addresses in both a CFMWS range and a memblk range
-	 * are exclusive.
-	 *
-	 * This list of pointers is used to make in-place changes
-	 * that fill out the numa_meminfo memblks.
+	 * overlap start, end. The list is used to make in-place
+	 * changes that fill out the numa_meminfo memblks.
 	 */
 	for (int i = 0; i < mi->nr_blks; i++) {
 		struct numa_memblk *bi = &mi->blk[i];
 
-		if (start < bi->end && end >= bi->start) {
+		if (memblock_addrs_overlap(start, end - start, bi->start,
+					   bi->end - bi->start)) {
 			blk[count] = &mi->blk[i];
 			count++;
 		}
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index b695f9e946dabb..e2082240586d00 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -121,6 +121,8 @@ int memblock_reserve(phys_addr_t base, phys_addr_t size);
 int memblock_physmem_add(phys_addr_t base, phys_addr_t size);
 #endif
 void memblock_trim_memory(phys_addr_t align);
+unsigned long memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1,
+				     phys_addr_t base2, phys_addr_t size2);
 bool memblock_overlaps_region(struct memblock_type *type,
 			      phys_addr_t base, phys_addr_t size);
 bool memblock_validate_numa_coverage(unsigned long threshold_bytes);
diff --git a/mm/memblock.c b/mm/memblock.c
index 4dcb2ee35eca85..964eb72db539e1 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -180,8 +180,9 @@ static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size)
 /*
  * Address comparison utilities
  */
-static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1,
-				       phys_addr_t base2, phys_addr_t size2)
+unsigned long __init_memblock
+memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1, phys_addr_t base2,
+		       phys_addr_t size2)
 {
 	return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
 }

From 6be99530c92c6b8ff7a01903edc42393575ad63b Mon Sep 17 00:00:00 2001
From: Alison Schofield <alison.schofield@intel.com>
Date: Fri, 12 Jan 2024 12:09:51 -0800
Subject: [PATCH 325/707] x86/numa: Fix the sort compare func used in
 numa_fill_memblks()

The compare function used to sort memblks into starting address
order fails when the result of its u64 address subtraction gets
truncated to an int upon return.

The impact of the bad sort is that memblks will be filled out
incorrectly. Depending on the set of memblks, a user may see no
errors at all but still have a bad fill, or see messages reporting
a node overlap that leads to numa init failure:

[] node 0 [mem: ] overlaps with node 1 [mem: ]
[] No NUMA configuration found

Replace with a comparison that can only result in: 1, 0, -1.

Fixes: 8f012db27c95 ("x86/numa: Introduce numa_fill_memblks()")
Signed-off-by: Alison Schofield <alison.schofield@intel.com>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Link: https://lore.kernel.org/r/99dcb3ae87e04995e9f293f6158dc8fa0749a487.1705085543.git.alison.schofield@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/x86/mm/numa.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 8ada9bbfad583f..65e9a6e391c046 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -934,7 +934,7 @@ static int __init cmp_memblk(const void *a, const void *b)
 	const struct numa_memblk *ma = *(const struct numa_memblk **)a;
 	const struct numa_memblk *mb = *(const struct numa_memblk **)b;
 
-	return ma->start - mb->start;
+	return (ma->start > mb->start) - (ma->start < mb->start);
 }
 
 static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata;

From 17525952fa834a75751f51726eb3cd683948b148 Mon Sep 17 00:00:00 2001
From: Shyam Prasad N <sprasad@microsoft.com>
Date: Mon, 29 Jan 2024 13:58:13 +0000
Subject: [PATCH 326/707] cifs: make sure that channel scaling is done only
 once

Following a successful cifs_tree_connect, we have the code
to scale up/down the number of channels in the session.
However, it is not protected by a lock today.

As a result, this code can be executed by several processes
that select the same channel. The core functions handle this
well, as they pick chan_lock. However, we've seen cases where
smb2_reconnect throws some warnings.

To fix that, this change introduces a flags bitmap inside the
cifs_ses structure. A new flag type is used to ensure that
only one process enters this section at any time.

Signed-off-by: Shyam Prasad N <sprasad@microsoft.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
---
 fs/smb/client/cifsglob.h |  3 +++
 fs/smb/client/smb2pdu.c  | 17 ++++++++++++++---
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 16befff4cbb47c..9093c507042fa1 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -1032,6 +1032,8 @@ struct cifs_chan {
 	__u8 signkey[SMB3_SIGN_KEY_SIZE];
 };
 
+#define CIFS_SES_FLAG_SCALE_CHANNELS (0x1)
+
 /*
  * Session structure.  One of these for each uid session with a particular host
  */
@@ -1064,6 +1066,7 @@ struct cifs_ses {
 	enum securityEnum sectype; /* what security flavor was specified? */
 	bool sign;		/* is signing required? */
 	bool domainAuto:1;
+	unsigned int flags;
 	__u16 session_flags;
 	__u8 smb3signingkey[SMB3_SIGN_KEY_SIZE];
 	__u8 smb3encryptionkey[SMB3_ENC_DEC_KEY_SIZE];
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 86f6f35b7f32e8..273e24f9da1316 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -399,6 +399,12 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
 		goto out;
 	}
 
+	spin_lock(&ses->ses_lock);
+	if (ses->flags & CIFS_SES_FLAG_SCALE_CHANNELS)
+		goto skip_add_channels;
+	ses->flags |= CIFS_SES_FLAG_SCALE_CHANNELS;
+	spin_unlock(&ses->ses_lock);
+
 	if (!rc &&
 	    (server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) {
 		mutex_unlock(&ses->session_mutex);
@@ -428,17 +434,22 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
 		if (ses->chan_max > ses->chan_count &&
 		    ses->iface_count &&
 		    !SERVER_IS_CHAN(server)) {
-			if (ses->chan_count == 1)
+			if (ses->chan_count == 1) {
 				cifs_server_dbg(VFS, "supports multichannel now\n");
+				queue_delayed_work(cifsiod_wq, &tcon->query_interfaces,
+						 (SMB_INTERFACE_POLL_INTERVAL * HZ));
+			}
 
 			cifs_try_adding_channels(ses);
-			queue_delayed_work(cifsiod_wq, &tcon->query_interfaces,
-					   (SMB_INTERFACE_POLL_INTERVAL * HZ));
 		}
 	} else {
 		mutex_unlock(&ses->session_mutex);
 	}
+
 skip_add_channels:
+	spin_lock(&ses->ses_lock);
+	ses->flags &= ~CIFS_SES_FLAG_SCALE_CHANNELS;
+	spin_unlock(&ses->ses_lock);
 
 	if (smb2_command != SMB2_INTERNAL_CMD)
 		mod_delayed_work(cifsiod_wq, &server->reconnect, 0);

From 4f6bb3af2346530c53a2ef727f2b682558c2ee8f Mon Sep 17 00:00:00 2001
From: Paulo Alcantara <pc@manguebit.com>
Date: Mon, 29 Jan 2024 21:04:44 -0300
Subject: [PATCH 327/707] smb: client: increase number of PDUs allowed in a
 compound request

With the introduction of SMB2_OP_QUERY_WSL_EA, the client may now send
5 commands in a single compound request in order to query xattrs from
potential WSL reparse points, which should be fine as we currently
allow up to 5 PDUs in a single compound request.  However, if
encryption is enabled (e.g. 'seal' mount option) or enforced by the
server, current MAX_COMPOUND(5) won't be enough as we require an extra
PDU for the transform header.

Fix this by increasing MAX_COMPOUND to 7 and, while we're at it, add
an WARN_ON_ONCE() and return -EIO instead of -ENOMEM in case we
attempt to send a compound request that couldn't include the extra
transform header.

Signed-off-by: Paulo Alcantara <pc@manguebit.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
---
 fs/smb/client/cifsglob.h  | 2 +-
 fs/smb/client/transport.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 9093c507042fa1..c86a72c9d9ecd4 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -87,7 +87,7 @@
 #define SMB_INTERFACE_POLL_INTERVAL	600
 
 /* maximum number of PDUs in one compound */
-#define MAX_COMPOUND 5
+#define MAX_COMPOUND 7
 
 /*
  * Default number of credits to keep available for SMB3.
diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c
index e00278fcfa4fa6..994d7019343297 100644
--- a/fs/smb/client/transport.c
+++ b/fs/smb/client/transport.c
@@ -435,8 +435,8 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
 	if (!(flags & CIFS_TRANSFORM_REQ))
 		return __smb_send_rqst(server, num_rqst, rqst);
 
-	if (num_rqst > MAX_COMPOUND - 1)
-		return -ENOMEM;
+	if (WARN_ON_ONCE(num_rqst > MAX_COMPOUND - 1))
+		return -EIO;
 
 	if (!server->ops->init_transform_rq) {
 		cifs_server_dbg(VFS, "Encryption requested but transform callback is missing\n");

From c7e65cd6e359372b857463041cfa3767c82c55f8 Mon Sep 17 00:00:00 2001
From: Paulo Alcantara <pc@manguebit.com>
Date: Sun, 21 Jan 2024 13:28:21 -0300
Subject: [PATCH 328/707] smb: client: introduce reparse mount option

Allow the user to create special files and symlinks by choosing
between WSL and NFS reparse points via 'reparse={nfs,wsl}' mount
options.  If unset or 'reparse=default', the client will default to
creating them via NFS reparse points.

Creating WSL reparse points isn't supported yet, so simply return
error when attempting to mount with 'reparse=wsl' for now.

Signed-off-by: Paulo Alcantara <pc@manguebit.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
---
 fs/smb/client/cifsglob.h   |  6 ++++++
 fs/smb/client/connect.c    |  2 ++
 fs/smb/client/fs_context.c | 35 +++++++++++++++++++++++++++++++++++
 fs/smb/client/fs_context.h |  9 +++++++++
 4 files changed, 52 insertions(+)

diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index c86a72c9d9ecd4..b633f08f40f548 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -153,6 +153,12 @@ enum securityEnum {
 	Kerberos,		/* Kerberos via SPNEGO */
 };
 
+enum cifs_reparse_type {
+	CIFS_REPARSE_TYPE_NFS,
+	CIFS_REPARSE_TYPE_WSL,
+	CIFS_REPARSE_TYPE_DEFAULT = CIFS_REPARSE_TYPE_NFS,
+};
+
 struct session_key {
 	unsigned int len;
 	char *response;
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index bfd568f8971056..c5cf88de32b732 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -2797,6 +2797,8 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data)
 		return 0;
 	if (old->ctx->closetimeo != new->ctx->closetimeo)
 		return 0;
+	if (old->ctx->reparse_type != new->ctx->reparse_type)
+		return 0;
 
 	return 1;
 }
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index 52cbef2eeb28f6..aee8be2cae4676 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -174,6 +174,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
 	fsparam_string("vers", Opt_vers),
 	fsparam_string("sec", Opt_sec),
 	fsparam_string("cache", Opt_cache),
+	fsparam_string("reparse", Opt_reparse),
 
 	/* Arguments that should be ignored */
 	fsparam_flag("guest", Opt_ignore),
@@ -296,6 +297,35 @@ cifs_parse_cache_flavor(struct fs_context *fc, char *value, struct smb3_fs_conte
 	return 0;
 }
 
+static const match_table_t reparse_flavor_tokens = {
+	{ Opt_reparse_default,	"default" },
+	{ Opt_reparse_nfs,	"nfs" },
+	{ Opt_reparse_wsl,	"wsl" },
+	{ Opt_reparse_err,	NULL },
+};
+
+static int parse_reparse_flavor(struct fs_context *fc, char *value,
+				struct smb3_fs_context *ctx)
+{
+	substring_t args[MAX_OPT_ARGS];
+
+	switch (match_token(value, reparse_flavor_tokens, args)) {
+	case Opt_reparse_default:
+		ctx->reparse_type = CIFS_REPARSE_TYPE_DEFAULT;
+		break;
+	case Opt_reparse_nfs:
+		ctx->reparse_type = CIFS_REPARSE_TYPE_NFS;
+		break;
+	case Opt_reparse_wsl:
+		cifs_errorf(fc, "unsupported reparse= option: %s\n", value);
+		return 1;
+	default:
+		cifs_errorf(fc, "bad reparse= option: %s\n", value);
+		return 1;
+	}
+	return 0;
+}
+
 #define DUP_CTX_STR(field)						\
 do {									\
 	if (ctx->field) {						\
@@ -1538,6 +1568,10 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
 	case Opt_rdma:
 		ctx->rdma = true;
 		break;
+	case Opt_reparse:
+		if (parse_reparse_flavor(fc, param->string, ctx))
+			goto cifs_parse_mount_err;
+		break;
 	}
 	/* case Opt_ignore: - is ignored as expected ... */
 
@@ -1624,6 +1658,7 @@ int smb3_init_fs_context(struct fs_context *fc)
 	ctx->backupgid_specified = false; /* no backup intent for a group */
 
 	ctx->retrans = 1;
+	ctx->reparse_type = CIFS_REPARSE_TYPE_DEFAULT;
 
 /*
  *	short int override_uid = -1;
diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h
index 182ce11cbe9362..1f09754977e7cc 100644
--- a/fs/smb/client/fs_context.h
+++ b/fs/smb/client/fs_context.h
@@ -41,6 +41,13 @@ enum {
 	Opt_cache_err
 };
 
+enum cifs_reparse_parm {
+	Opt_reparse_default,
+	Opt_reparse_nfs,
+	Opt_reparse_wsl,
+	Opt_reparse_err
+};
+
 enum cifs_sec_param {
 	Opt_sec_krb5,
 	Opt_sec_krb5i,
@@ -148,6 +155,7 @@ enum cifs_param {
 	Opt_vers,
 	Opt_sec,
 	Opt_cache,
+	Opt_reparse,
 
 	/* Mount options to be ignored */
 	Opt_ignore,
@@ -271,6 +279,7 @@ struct smb3_fs_context {
 	char *leaf_fullpath;
 	struct cifs_ses *dfs_root_ses;
 	bool dfs_automount:1; /* set for dfs automount only */
+	enum cifs_reparse_type reparse_type;
 };
 
 extern const struct fs_parameter_spec smb3_fs_parameters[];

From 50e429b8523a8b84f2563fa87dde52285bfacbc7 Mon Sep 17 00:00:00 2001
From: Paulo Alcantara <pc@manguebit.com>
Date: Sun, 21 Jan 2024 19:00:44 -0300
Subject: [PATCH 329/707] smb: client: move most of reparse point handling code
 to common file

In preparation to add support for creating special files also via WSL
reparse points in next commits.

Signed-off-by: Paulo Alcantara <pc@manguebit.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
---
 fs/smb/client/Makefile    |   2 +-
 fs/smb/client/cifsglob.h  |  13 --
 fs/smb/client/cifsproto.h |   4 -
 fs/smb/client/inode.c     |  79 +---------
 fs/smb/client/readdir.c   |  18 +--
 fs/smb/client/reparse.c   | 316 ++++++++++++++++++++++++++++++++++++++
 fs/smb/client/reparse.h   |  73 +++++++++
 fs/smb/client/smb2ops.c   | 250 +-----------------------------
 fs/smb/client/smb2proto.h |   6 +
 9 files changed, 401 insertions(+), 360 deletions(-)
 create mode 100644 fs/smb/client/reparse.c
 create mode 100644 fs/smb/client/reparse.h

diff --git a/fs/smb/client/Makefile b/fs/smb/client/Makefile
index 0b07eb94c93b38..e11985f2460b26 100644
--- a/fs/smb/client/Makefile
+++ b/fs/smb/client/Makefile
@@ -12,7 +12,7 @@ cifs-y := trace.o cifsfs.o cifs_debug.o connect.o dir.o file.o \
 	  smb2ops.o smb2maperror.o smb2transport.o \
 	  smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o fs_context.o \
 	  dns_resolve.o cifs_spnego_negtokeninit.asn1.o asn1.o \
-	  namespace.o
+	  namespace.o reparse.o
 
 $(obj)/asn1.o: $(obj)/cifs_spnego_negtokeninit.asn1.h
 
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index b633f08f40f548..a4aa95439e18c6 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -223,19 +223,6 @@ struct cifs_open_info_data {
 	};
 };
 
-static inline bool cifs_open_data_reparse(struct cifs_open_info_data *data)
-{
-	struct smb2_file_all_info *fi = &data->fi;
-	u32 attrs = le32_to_cpu(fi->Attributes);
-	bool ret;
-
-	ret = data->reparse_point || (attrs & ATTR_REPARSE);
-	if (ret)
-		attrs |= ATTR_REPARSE;
-	fi->Attributes = cpu_to_le32(attrs);
-	return ret;
-}
-
 /*
  *****************************************************************
  * Except the CIFS PDUs themselves all the
diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h
index a841bf4967fa4d..770db902685013 100644
--- a/fs/smb/client/cifsproto.h
+++ b/fs/smb/client/cifsproto.h
@@ -208,10 +208,6 @@ extern struct inode *cifs_iget(struct super_block *sb,
 int cifs_get_inode_info(struct inode **inode, const char *full_path,
 			struct cifs_open_info_data *data, struct super_block *sb, int xid,
 			const struct cifs_fid *fid);
-bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
-				 struct cifs_fattr *fattr,
-				 struct cifs_open_info_data *data);
-
 extern int smb311_posix_get_inode_info(struct inode **inode,
 				       const char *full_path,
 				       struct cifs_open_info_data *data,
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index d02f8ba29cb5bf..56d77ff8249d50 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -26,6 +26,7 @@
 #include "fs_context.h"
 #include "cifs_ioctl.h"
 #include "cached_dir.h"
+#include "reparse.h"
 
 static void cifs_set_ops(struct inode *inode)
 {
@@ -727,84 +728,6 @@ static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr,
 		fattr->cf_mode, fattr->cf_uniqueid, fattr->cf_nlink);
 }
 
-static inline dev_t nfs_mkdev(struct reparse_posix_data *buf)
-{
-	u64 v = le64_to_cpu(*(__le64 *)buf->DataBuffer);
-
-	return MKDEV(v >> 32, v & 0xffffffff);
-}
-
-bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
-				 struct cifs_fattr *fattr,
-				 struct cifs_open_info_data *data)
-{
-	struct reparse_posix_data *buf = data->reparse.posix;
-	u32 tag = data->reparse.tag;
-
-	if (tag == IO_REPARSE_TAG_NFS && buf) {
-		switch (le64_to_cpu(buf->InodeType)) {
-		case NFS_SPECFILE_CHR:
-			fattr->cf_mode |= S_IFCHR;
-			fattr->cf_dtype = DT_CHR;
-			fattr->cf_rdev = nfs_mkdev(buf);
-			break;
-		case NFS_SPECFILE_BLK:
-			fattr->cf_mode |= S_IFBLK;
-			fattr->cf_dtype = DT_BLK;
-			fattr->cf_rdev = nfs_mkdev(buf);
-			break;
-		case NFS_SPECFILE_FIFO:
-			fattr->cf_mode |= S_IFIFO;
-			fattr->cf_dtype = DT_FIFO;
-			break;
-		case NFS_SPECFILE_SOCK:
-			fattr->cf_mode |= S_IFSOCK;
-			fattr->cf_dtype = DT_SOCK;
-			break;
-		case NFS_SPECFILE_LNK:
-			fattr->cf_mode |= S_IFLNK;
-			fattr->cf_dtype = DT_LNK;
-			break;
-		default:
-			WARN_ON_ONCE(1);
-			return false;
-		}
-		return true;
-	}
-
-	switch (tag) {
-	case IO_REPARSE_TAG_LX_SYMLINK:
-		fattr->cf_mode |= S_IFLNK;
-		fattr->cf_dtype = DT_LNK;
-		break;
-	case IO_REPARSE_TAG_LX_FIFO:
-		fattr->cf_mode |= S_IFIFO;
-		fattr->cf_dtype = DT_FIFO;
-		break;
-	case IO_REPARSE_TAG_AF_UNIX:
-		fattr->cf_mode |= S_IFSOCK;
-		fattr->cf_dtype = DT_SOCK;
-		break;
-	case IO_REPARSE_TAG_LX_CHR:
-		fattr->cf_mode |= S_IFCHR;
-		fattr->cf_dtype = DT_CHR;
-		break;
-	case IO_REPARSE_TAG_LX_BLK:
-		fattr->cf_mode |= S_IFBLK;
-		fattr->cf_dtype = DT_BLK;
-		break;
-	case 0: /* SMB1 symlink */
-	case IO_REPARSE_TAG_SYMLINK:
-	case IO_REPARSE_TAG_NFS:
-		fattr->cf_mode |= S_IFLNK;
-		fattr->cf_dtype = DT_LNK;
-		break;
-	default:
-		return false;
-	}
-	return true;
-}
-
 static void cifs_open_info_to_fattr(struct cifs_fattr *fattr,
 				    struct cifs_open_info_data *data,
 				    struct super_block *sb)
diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c
index 3b1b01d10f7d7a..7ef5a4b37901db 100644
--- a/fs/smb/client/readdir.c
+++ b/fs/smb/client/readdir.c
@@ -22,6 +22,7 @@
 #include "smb2proto.h"
 #include "fs_context.h"
 #include "cached_dir.h"
+#include "reparse.h"
 
 /*
  * To be safe - for UCS to UTF-8 with strings loaded with the rare long
@@ -55,23 +56,6 @@ static inline void dump_cifs_file_struct(struct file *file, char *label)
 }
 #endif /* DEBUG2 */
 
-/*
- * Match a reparse point inode if reparse tag and ctime haven't changed.
- *
- * Windows Server updates ctime of reparse points when their data have changed.
- * The server doesn't allow changing reparse tags from existing reparse points,
- * though it's worth checking.
- */
-static inline bool reparse_inode_match(struct inode *inode,
-				       struct cifs_fattr *fattr)
-{
-	struct timespec64 ctime = inode_get_ctime(inode);
-
-	return (CIFS_I(inode)->cifsAttrs & ATTR_REPARSE) &&
-		CIFS_I(inode)->reparse_tag == fattr->cf_cifstag &&
-		timespec64_equal(&ctime, &fattr->cf_ctime);
-}
-
 /*
  * Attempt to preload the dcache with the results from the FIND_FIRST/NEXT
  *
diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c
new file mode 100644
index 00000000000000..5ce9d1ed5b8881
--- /dev/null
+++ b/fs/smb/client/reparse.c
@@ -0,0 +1,316 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024 Paulo Alcantara <pc@manguebit.com>
+ */
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+#include "cifsglob.h"
+#include "smb2proto.h"
+#include "cifsproto.h"
+#include "cifs_unicode.h"
+#include "cifs_debug.h"
+#include "reparse.h"
+
+int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode,
+				struct dentry *dentry, struct cifs_tcon *tcon,
+				const char *full_path, const char *symname)
+{
+	struct reparse_symlink_data_buffer *buf = NULL;
+	struct cifs_open_info_data data;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct inode *new;
+	struct kvec iov;
+	__le16 *path;
+	char *sym;
+	u16 len, plen;
+	int rc = 0;
+
+	sym = kstrdup(symname, GFP_KERNEL);
+	if (!sym)
+		return -ENOMEM;
+
+	data = (struct cifs_open_info_data) {
+		.reparse_point = true,
+		.reparse = { .tag = IO_REPARSE_TAG_SYMLINK, },
+		.symlink_target = sym,
+	};
+
+	path = cifs_convert_path_to_utf16(symname, cifs_sb);
+	if (!path) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	plen = 2 * UniStrnlen((wchar_t *)path, PATH_MAX);
+	len = sizeof(*buf) + plen * 2;
+	buf = kzalloc(len, GFP_KERNEL);
+	if (!buf) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	buf->ReparseTag = cpu_to_le32(IO_REPARSE_TAG_SYMLINK);
+	buf->ReparseDataLength = cpu_to_le16(len - sizeof(struct reparse_data_buffer));
+	buf->SubstituteNameOffset = cpu_to_le16(plen);
+	buf->SubstituteNameLength = cpu_to_le16(plen);
+	memcpy(&buf->PathBuffer[plen], path, plen);
+	buf->PrintNameOffset = 0;
+	buf->PrintNameLength = cpu_to_le16(plen);
+	memcpy(buf->PathBuffer, path, plen);
+	buf->Flags = cpu_to_le32(*symname != '/' ? SYMLINK_FLAG_RELATIVE : 0);
+
+	iov.iov_base = buf;
+	iov.iov_len = len;
+	new = smb2_get_reparse_inode(&data, inode->i_sb, xid,
+				     tcon, full_path, &iov);
+	if (!IS_ERR(new))
+		d_instantiate(dentry, new);
+	else
+		rc = PTR_ERR(new);
+out:
+	kfree(path);
+	cifs_free_open_info(&data);
+	kfree(buf);
+	return rc;
+}
+
+static int nfs_set_reparse_buf(struct reparse_posix_data *buf,
+			       mode_t mode, dev_t dev,
+			       struct kvec *iov)
+{
+	u64 type;
+	u16 len, dlen;
+
+	len = sizeof(*buf);
+
+	switch ((type = reparse_mode_nfs_type(mode))) {
+	case NFS_SPECFILE_BLK:
+	case NFS_SPECFILE_CHR:
+		dlen = sizeof(__le64);
+		break;
+	case NFS_SPECFILE_FIFO:
+	case NFS_SPECFILE_SOCK:
+		dlen = 0;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	buf->ReparseTag = cpu_to_le32(IO_REPARSE_TAG_NFS);
+	buf->Reserved = 0;
+	buf->InodeType = cpu_to_le64(type);
+	buf->ReparseDataLength = cpu_to_le16(len + dlen -
+					     sizeof(struct reparse_data_buffer));
+	*(__le64 *)buf->DataBuffer = cpu_to_le64(((u64)MAJOR(dev) << 32) |
+						 MINOR(dev));
+	iov->iov_base = buf;
+	iov->iov_len = len + dlen;
+	return 0;
+}
+
+int smb2_make_nfs_node(unsigned int xid, struct inode *inode,
+		       struct dentry *dentry, struct cifs_tcon *tcon,
+		       const char *full_path, umode_t mode, dev_t dev)
+{
+	struct cifs_open_info_data data;
+	struct reparse_posix_data *p;
+	struct inode *new;
+	struct kvec iov;
+	__u8 buf[sizeof(*p) + sizeof(__le64)];
+	int rc;
+
+	p = (struct reparse_posix_data *)buf;
+	rc = nfs_set_reparse_buf(p, mode, dev, &iov);
+	if (rc)
+		return rc;
+
+	data = (struct cifs_open_info_data) {
+		.reparse_point = true,
+		.reparse = { .tag = IO_REPARSE_TAG_NFS, .posix = p, },
+	};
+
+	new = smb2_get_reparse_inode(&data, inode->i_sb, xid,
+				     tcon, full_path, &iov);
+	if (!IS_ERR(new))
+		d_instantiate(dentry, new);
+	else
+		rc = PTR_ERR(new);
+	cifs_free_open_info(&data);
+	return rc;
+}
+
+/* See MS-FSCC 2.1.2.6 for the 'NFS' style reparse tags */
+static int parse_reparse_posix(struct reparse_posix_data *buf,
+			       struct cifs_sb_info *cifs_sb,
+			       struct cifs_open_info_data *data)
+{
+	unsigned int len;
+	u64 type;
+
+	switch ((type = le64_to_cpu(buf->InodeType))) {
+	case NFS_SPECFILE_LNK:
+		len = le16_to_cpu(buf->ReparseDataLength);
+		data->symlink_target = cifs_strndup_from_utf16(buf->DataBuffer,
+							       len, true,
+							       cifs_sb->local_nls);
+		if (!data->symlink_target)
+			return -ENOMEM;
+		convert_delimiter(data->symlink_target, '/');
+		cifs_dbg(FYI, "%s: target path: %s\n",
+			 __func__, data->symlink_target);
+		break;
+	case NFS_SPECFILE_CHR:
+	case NFS_SPECFILE_BLK:
+	case NFS_SPECFILE_FIFO:
+	case NFS_SPECFILE_SOCK:
+		break;
+	default:
+		cifs_dbg(VFS, "%s: unhandled inode type: 0x%llx\n",
+			 __func__, type);
+		return -EOPNOTSUPP;
+	}
+	return 0;
+}
+
+static int parse_reparse_symlink(struct reparse_symlink_data_buffer *sym,
+				 u32 plen, bool unicode,
+				 struct cifs_sb_info *cifs_sb,
+				 struct cifs_open_info_data *data)
+{
+	unsigned int len;
+	unsigned int offs;
+
+	/* We handle Symbolic Link reparse tag here. See: MS-FSCC 2.1.2.4 */
+
+	offs = le16_to_cpu(sym->SubstituteNameOffset);
+	len = le16_to_cpu(sym->SubstituteNameLength);
+	if (offs + 20 > plen || offs + len + 20 > plen) {
+		cifs_dbg(VFS, "srv returned malformed symlink buffer\n");
+		return -EIO;
+	}
+
+	data->symlink_target = cifs_strndup_from_utf16(sym->PathBuffer + offs,
+						       len, unicode,
+						       cifs_sb->local_nls);
+	if (!data->symlink_target)
+		return -ENOMEM;
+
+	convert_delimiter(data->symlink_target, '/');
+	cifs_dbg(FYI, "%s: target path: %s\n", __func__, data->symlink_target);
+
+	return 0;
+}
+
+int parse_reparse_point(struct reparse_data_buffer *buf,
+			u32 plen, struct cifs_sb_info *cifs_sb,
+			bool unicode, struct cifs_open_info_data *data)
+{
+	data->reparse.buf = buf;
+
+	/* See MS-FSCC 2.1.2 */
+	switch (le32_to_cpu(buf->ReparseTag)) {
+	case IO_REPARSE_TAG_NFS:
+		return parse_reparse_posix((struct reparse_posix_data *)buf,
+					   cifs_sb, data);
+	case IO_REPARSE_TAG_SYMLINK:
+		return parse_reparse_symlink(
+			(struct reparse_symlink_data_buffer *)buf,
+			plen, unicode, cifs_sb, data);
+	case IO_REPARSE_TAG_LX_SYMLINK:
+	case IO_REPARSE_TAG_AF_UNIX:
+	case IO_REPARSE_TAG_LX_FIFO:
+	case IO_REPARSE_TAG_LX_CHR:
+	case IO_REPARSE_TAG_LX_BLK:
+		return 0;
+	default:
+		cifs_dbg(VFS, "%s: unhandled reparse tag: 0x%08x\n",
+			 __func__, le32_to_cpu(buf->ReparseTag));
+		return -EOPNOTSUPP;
+	}
+}
+
+int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb,
+			     struct kvec *rsp_iov,
+			     struct cifs_open_info_data *data)
+{
+	struct reparse_data_buffer *buf;
+	struct smb2_ioctl_rsp *io = rsp_iov->iov_base;
+	u32 plen = le32_to_cpu(io->OutputCount);
+
+	buf = (struct reparse_data_buffer *)((u8 *)io +
+					     le32_to_cpu(io->OutputOffset));
+	return parse_reparse_point(buf, plen, cifs_sb, true, data);
+}
+
+bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
+				 struct cifs_fattr *fattr,
+				 struct cifs_open_info_data *data)
+{
+	struct reparse_posix_data *buf = data->reparse.posix;
+	u32 tag = data->reparse.tag;
+
+	if (tag == IO_REPARSE_TAG_NFS && buf) {
+		switch (le64_to_cpu(buf->InodeType)) {
+		case NFS_SPECFILE_CHR:
+			fattr->cf_mode |= S_IFCHR;
+			fattr->cf_dtype = DT_CHR;
+			fattr->cf_rdev = reparse_nfs_mkdev(buf);
+			break;
+		case NFS_SPECFILE_BLK:
+			fattr->cf_mode |= S_IFBLK;
+			fattr->cf_dtype = DT_BLK;
+			fattr->cf_rdev = reparse_nfs_mkdev(buf);
+			break;
+		case NFS_SPECFILE_FIFO:
+			fattr->cf_mode |= S_IFIFO;
+			fattr->cf_dtype = DT_FIFO;
+			break;
+		case NFS_SPECFILE_SOCK:
+			fattr->cf_mode |= S_IFSOCK;
+			fattr->cf_dtype = DT_SOCK;
+			break;
+		case NFS_SPECFILE_LNK:
+			fattr->cf_mode |= S_IFLNK;
+			fattr->cf_dtype = DT_LNK;
+			break;
+		default:
+			WARN_ON_ONCE(1);
+			return false;
+		}
+		return true;
+	}
+
+	switch (tag) {
+	case IO_REPARSE_TAG_LX_SYMLINK:
+		fattr->cf_mode |= S_IFLNK;
+		fattr->cf_dtype = DT_LNK;
+		break;
+	case IO_REPARSE_TAG_LX_FIFO:
+		fattr->cf_mode |= S_IFIFO;
+		fattr->cf_dtype = DT_FIFO;
+		break;
+	case IO_REPARSE_TAG_AF_UNIX:
+		fattr->cf_mode |= S_IFSOCK;
+		fattr->cf_dtype = DT_SOCK;
+		break;
+	case IO_REPARSE_TAG_LX_CHR:
+		fattr->cf_mode |= S_IFCHR;
+		fattr->cf_dtype = DT_CHR;
+		break;
+	case IO_REPARSE_TAG_LX_BLK:
+		fattr->cf_mode |= S_IFBLK;
+		fattr->cf_dtype = DT_BLK;
+		break;
+	case 0: /* SMB1 symlink */
+	case IO_REPARSE_TAG_SYMLINK:
+	case IO_REPARSE_TAG_NFS:
+		fattr->cf_mode |= S_IFLNK;
+		fattr->cf_dtype = DT_LNK;
+		break;
+	default:
+		return false;
+	}
+	return true;
+}
diff --git a/fs/smb/client/reparse.h b/fs/smb/client/reparse.h
new file mode 100644
index 00000000000000..3ceb90da0df901
--- /dev/null
+++ b/fs/smb/client/reparse.h
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024 Paulo Alcantara <pc@manguebit.com>
+ */
+
+#ifndef _CIFS_REPARSE_H
+#define _CIFS_REPARSE_H
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include "cifsglob.h"
+
+static inline dev_t reparse_nfs_mkdev(struct reparse_posix_data *buf)
+{
+	u64 v = le64_to_cpu(*(__le64 *)buf->DataBuffer);
+
+	return MKDEV(v >> 32, v & 0xffffffff);
+}
+
+static inline u64 reparse_mode_nfs_type(mode_t mode)
+{
+	switch (mode & S_IFMT) {
+	case S_IFBLK: return NFS_SPECFILE_BLK;
+	case S_IFCHR: return NFS_SPECFILE_CHR;
+	case S_IFIFO: return NFS_SPECFILE_FIFO;
+	case S_IFSOCK: return NFS_SPECFILE_SOCK;
+	}
+	return 0;
+}
+
+/*
+ * Match a reparse point inode if reparse tag and ctime haven't changed.
+ *
+ * Windows Server updates ctime of reparse points when their data have changed.
+ * The server doesn't allow changing reparse tags from existing reparse points,
+ * though it's worth checking.
+ */
+static inline bool reparse_inode_match(struct inode *inode,
+				       struct cifs_fattr *fattr)
+{
+	struct timespec64 ctime = inode_get_ctime(inode);
+
+	return (CIFS_I(inode)->cifsAttrs & ATTR_REPARSE) &&
+		CIFS_I(inode)->reparse_tag == fattr->cf_cifstag &&
+		timespec64_equal(&ctime, &fattr->cf_ctime);
+}
+
+static inline bool cifs_open_data_reparse(struct cifs_open_info_data *data)
+{
+	struct smb2_file_all_info *fi = &data->fi;
+	u32 attrs = le32_to_cpu(fi->Attributes);
+	bool ret;
+
+	ret = data->reparse_point || (attrs & ATTR_REPARSE);
+	if (ret)
+		attrs |= ATTR_REPARSE;
+	fi->Attributes = cpu_to_le32(attrs);
+	return ret;
+}
+
+bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
+				 struct cifs_fattr *fattr,
+				 struct cifs_open_info_data *data);
+int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode,
+				struct dentry *dentry, struct cifs_tcon *tcon,
+				const char *full_path, const char *symname);
+int smb2_make_nfs_node(unsigned int xid, struct inode *inode,
+		       struct dentry *dentry, struct cifs_tcon *tcon,
+		       const char *full_path, umode_t mode, dev_t dev);
+int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb, struct kvec *rsp_iov,
+			     struct cifs_open_info_data *data);
+
+#endif /* _CIFS_REPARSE_H */
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index 83c898afc8354b..aee0cbf01e6cdc 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -28,6 +28,7 @@
 #include "fscache.h"
 #include "fs_context.h"
 #include "cached_dir.h"
+#include "reparse.h"
 
 /* Change credits for different ops and return the total number of credits */
 static int
@@ -2982,109 +2983,6 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses,
 	return rc;
 }
 
-/* See MS-FSCC 2.1.2.6 for the 'NFS' style reparse tags */
-static int parse_reparse_posix(struct reparse_posix_data *buf,
-			       struct cifs_sb_info *cifs_sb,
-			       struct cifs_open_info_data *data)
-{
-	unsigned int len;
-	u64 type;
-
-	switch ((type = le64_to_cpu(buf->InodeType))) {
-	case NFS_SPECFILE_LNK:
-		len = le16_to_cpu(buf->ReparseDataLength);
-		data->symlink_target = cifs_strndup_from_utf16(buf->DataBuffer,
-							       len, true,
-							       cifs_sb->local_nls);
-		if (!data->symlink_target)
-			return -ENOMEM;
-		convert_delimiter(data->symlink_target, '/');
-		cifs_dbg(FYI, "%s: target path: %s\n",
-			 __func__, data->symlink_target);
-		break;
-	case NFS_SPECFILE_CHR:
-	case NFS_SPECFILE_BLK:
-	case NFS_SPECFILE_FIFO:
-	case NFS_SPECFILE_SOCK:
-		break;
-	default:
-		cifs_dbg(VFS, "%s: unhandled inode type: 0x%llx\n",
-			 __func__, type);
-		return -EOPNOTSUPP;
-	}
-	return 0;
-}
-
-static int parse_reparse_symlink(struct reparse_symlink_data_buffer *sym,
-				 u32 plen, bool unicode,
-				 struct cifs_sb_info *cifs_sb,
-				 struct cifs_open_info_data *data)
-{
-	unsigned int len;
-	unsigned int offs;
-
-	/* We handle Symbolic Link reparse tag here. See: MS-FSCC 2.1.2.4 */
-
-	offs = le16_to_cpu(sym->SubstituteNameOffset);
-	len = le16_to_cpu(sym->SubstituteNameLength);
-	if (offs + 20 > plen || offs + len + 20 > plen) {
-		cifs_dbg(VFS, "srv returned malformed symlink buffer\n");
-		return -EIO;
-	}
-
-	data->symlink_target = cifs_strndup_from_utf16(sym->PathBuffer + offs,
-						       len, unicode,
-						       cifs_sb->local_nls);
-	if (!data->symlink_target)
-		return -ENOMEM;
-
-	convert_delimiter(data->symlink_target, '/');
-	cifs_dbg(FYI, "%s: target path: %s\n", __func__, data->symlink_target);
-
-	return 0;
-}
-
-int parse_reparse_point(struct reparse_data_buffer *buf,
-			u32 plen, struct cifs_sb_info *cifs_sb,
-			bool unicode, struct cifs_open_info_data *data)
-{
-	data->reparse.buf = buf;
-
-	/* See MS-FSCC 2.1.2 */
-	switch (le32_to_cpu(buf->ReparseTag)) {
-	case IO_REPARSE_TAG_NFS:
-		return parse_reparse_posix((struct reparse_posix_data *)buf,
-					   cifs_sb, data);
-	case IO_REPARSE_TAG_SYMLINK:
-		return parse_reparse_symlink(
-			(struct reparse_symlink_data_buffer *)buf,
-			plen, unicode, cifs_sb, data);
-	case IO_REPARSE_TAG_LX_SYMLINK:
-	case IO_REPARSE_TAG_AF_UNIX:
-	case IO_REPARSE_TAG_LX_FIFO:
-	case IO_REPARSE_TAG_LX_CHR:
-	case IO_REPARSE_TAG_LX_BLK:
-		return 0;
-	default:
-		cifs_dbg(VFS, "%s: unhandled reparse tag: 0x%08x\n",
-			 __func__, le32_to_cpu(buf->ReparseTag));
-		return -EOPNOTSUPP;
-	}
-}
-
-static int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb,
-				    struct kvec *rsp_iov,
-				    struct cifs_open_info_data *data)
-{
-	struct reparse_data_buffer *buf;
-	struct smb2_ioctl_rsp *io = rsp_iov->iov_base;
-	u32 plen = le32_to_cpu(io->OutputCount);
-
-	buf = (struct reparse_data_buffer *)((u8 *)io +
-					     le32_to_cpu(io->OutputOffset));
-	return parse_reparse_point(buf, plen, cifs_sb, true, data);
-}
-
 static struct cifs_ntsd *
 get_smb2_acl_by_fid(struct cifs_sb_info *cifs_sb,
 		    const struct cifs_fid *cifsfid, u32 *pacllen, u32 info)
@@ -5124,148 +5022,6 @@ int cifs_sfu_make_node(unsigned int xid, struct inode *inode,
 	return rc;
 }
 
-static inline u64 mode_nfs_type(mode_t mode)
-{
-	switch (mode & S_IFMT) {
-	case S_IFBLK: return NFS_SPECFILE_BLK;
-	case S_IFCHR: return NFS_SPECFILE_CHR;
-	case S_IFIFO: return NFS_SPECFILE_FIFO;
-	case S_IFSOCK: return NFS_SPECFILE_SOCK;
-	}
-	return 0;
-}
-
-static int nfs_set_reparse_buf(struct reparse_posix_data *buf,
-			       mode_t mode, dev_t dev,
-			       struct kvec *iov)
-{
-	u64 type;
-	u16 len, dlen;
-
-	len = sizeof(*buf);
-
-	switch ((type = mode_nfs_type(mode))) {
-	case NFS_SPECFILE_BLK:
-	case NFS_SPECFILE_CHR:
-		dlen = sizeof(__le64);
-		break;
-	case NFS_SPECFILE_FIFO:
-	case NFS_SPECFILE_SOCK:
-		dlen = 0;
-		break;
-	default:
-		return -EOPNOTSUPP;
-	}
-
-	buf->ReparseTag = cpu_to_le32(IO_REPARSE_TAG_NFS);
-	buf->Reserved = 0;
-	buf->InodeType = cpu_to_le64(type);
-	buf->ReparseDataLength = cpu_to_le16(len + dlen -
-					     sizeof(struct reparse_data_buffer));
-	*(__le64 *)buf->DataBuffer = cpu_to_le64(((u64)MAJOR(dev) << 32) |
-						 MINOR(dev));
-	iov->iov_base = buf;
-	iov->iov_len = len + dlen;
-	return 0;
-}
-
-static int nfs_make_node(unsigned int xid, struct inode *inode,
-			 struct dentry *dentry, struct cifs_tcon *tcon,
-			 const char *full_path, umode_t mode, dev_t dev)
-{
-	struct cifs_open_info_data data;
-	struct reparse_posix_data *p;
-	struct inode *new;
-	struct kvec iov;
-	__u8 buf[sizeof(*p) + sizeof(__le64)];
-	int rc;
-
-	p = (struct reparse_posix_data *)buf;
-	rc = nfs_set_reparse_buf(p, mode, dev, &iov);
-	if (rc)
-		return rc;
-
-	data = (struct cifs_open_info_data) {
-		.reparse_point = true,
-		.reparse = { .tag = IO_REPARSE_TAG_NFS, .posix = p, },
-	};
-
-	new = smb2_get_reparse_inode(&data, inode->i_sb, xid,
-				     tcon, full_path, &iov);
-	if (!IS_ERR(new))
-		d_instantiate(dentry, new);
-	else
-		rc = PTR_ERR(new);
-	cifs_free_open_info(&data);
-	return rc;
-}
-
-static int smb2_create_reparse_symlink(const unsigned int xid,
-				       struct inode *inode,
-				       struct dentry *dentry,
-				       struct cifs_tcon *tcon,
-				       const char *full_path,
-				       const char *symname)
-{
-	struct reparse_symlink_data_buffer *buf = NULL;
-	struct cifs_open_info_data data;
-	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct inode *new;
-	struct kvec iov;
-	__le16 *path;
-	char *sym;
-	u16 len, plen;
-	int rc = 0;
-
-	sym = kstrdup(symname, GFP_KERNEL);
-	if (!sym)
-		return -ENOMEM;
-
-	data = (struct cifs_open_info_data) {
-		.reparse_point = true,
-		.reparse = { .tag = IO_REPARSE_TAG_SYMLINK, },
-		.symlink_target = sym,
-	};
-
-	path = cifs_convert_path_to_utf16(symname, cifs_sb);
-	if (!path) {
-		rc = -ENOMEM;
-		goto out;
-	}
-
-	plen = 2 * UniStrnlen((wchar_t *)path, PATH_MAX);
-	len = sizeof(*buf) + plen * 2;
-	buf = kzalloc(len, GFP_KERNEL);
-	if (!buf) {
-		rc = -ENOMEM;
-		goto out;
-	}
-
-	buf->ReparseTag = cpu_to_le32(IO_REPARSE_TAG_SYMLINK);
-	buf->ReparseDataLength = cpu_to_le16(len - sizeof(struct reparse_data_buffer));
-	buf->SubstituteNameOffset = cpu_to_le16(plen);
-	buf->SubstituteNameLength = cpu_to_le16(plen);
-	memcpy(&buf->PathBuffer[plen], path, plen);
-	buf->PrintNameOffset = 0;
-	buf->PrintNameLength = cpu_to_le16(plen);
-	memcpy(buf->PathBuffer, path, plen);
-	buf->Flags = cpu_to_le32(*symname != '/' ? SYMLINK_FLAG_RELATIVE : 0);
-
-	iov.iov_base = buf;
-	iov.iov_len = len;
-	new = smb2_get_reparse_inode(&data, inode->i_sb, xid,
-				     tcon, full_path, &iov);
-	if (!IS_ERR(new))
-		d_instantiate(dentry, new);
-	else
-		rc = PTR_ERR(new);
-out:
-	kfree(path);
-	cifs_free_open_info(&data);
-	kfree(buf);
-	return rc;
-}
-
 static int smb2_make_node(unsigned int xid, struct inode *inode,
 			  struct dentry *dentry, struct cifs_tcon *tcon,
 			  const char *full_path, umode_t mode, dev_t dev)
@@ -5283,8 +5039,8 @@ static int smb2_make_node(unsigned int xid, struct inode *inode,
 		rc = cifs_sfu_make_node(xid, inode, dentry, tcon,
 					full_path, mode, dev);
 	} else {
-		rc = nfs_make_node(xid, inode, dentry, tcon,
-				   full_path, mode, dev);
+		rc = smb2_make_nfs_node(xid, inode, dentry, tcon,
+					full_path, mode, dev);
 	}
 	return rc;
 }
diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h
index b3069911e9dd8f..5322b769bca21c 100644
--- a/fs/smb/client/smb2proto.h
+++ b/fs/smb/client/smb2proto.h
@@ -308,5 +308,11 @@ int smb311_posix_query_path_info(const unsigned int xid,
 int posix_info_parse(const void *beg, const void *end,
 		     struct smb2_posix_info_parsed *out);
 int posix_info_sid_size(const void *beg, const void *end);
+int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode,
+				struct dentry *dentry, struct cifs_tcon *tcon,
+				const char *full_path, const char *symname);
+int smb2_make_nfs_node(unsigned int xid, struct inode *inode,
+		       struct dentry *dentry, struct cifs_tcon *tcon,
+		       const char *full_path, umode_t mode, dev_t dev);
 
 #endif			/* _SMB2PROTO_H */

From e0954d76ef62711d17b51c25c73ed8f1d3def2f5 Mon Sep 17 00:00:00 2001
From: Paulo Alcantara <pc@manguebit.com>
Date: Thu, 25 Jan 2024 17:04:05 -0300
Subject: [PATCH 330/707] smb: client: fix potential broken compound request

Now that smb2_compound_op() can accept up to 5 commands in a single
compound request, set the appropriate NextCommand and related flags to
all subsequent commands as well as handling the case where a valid
@cfile is passed and therefore skipping create and close requests in
the compound chain.

This fix a potential broken compound request that could be sent from
smb2_get_reparse_inode() if the client found a valid open
file (@cfile) prior to calling smb2_compound_op().

Signed-off-by: Paulo Alcantara <pc@manguebit.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
---
 fs/smb/client/smb2inode.c | 106 ++++++++++++++++++++++----------------
 1 file changed, 63 insertions(+), 43 deletions(-)

diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c
index 05818cd6d932e9..11de1318d8e0f5 100644
--- a/fs/smb/client/smb2inode.c
+++ b/fs/smb/client/smb2inode.c
@@ -202,14 +202,13 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
 							  SMB2_O_INFO_FILE, 0,
 							  sizeof(struct smb2_file_all_info) +
 							  PATH_MAX * 2, 0, NULL);
-				if (!rc) {
-					smb2_set_next_command(tcon, &rqst[num_rqst]);
-					smb2_set_related(&rqst[num_rqst]);
-				}
 			}
-
-			if (rc)
+			if (!rc && (!cfile || num_rqst > 1)) {
+				smb2_set_next_command(tcon, &rqst[num_rqst]);
+				smb2_set_related(&rqst[num_rqst]);
+			} else if (rc) {
 				goto finished;
+			}
 			num_rqst++;
 			trace_smb3_query_info_compound_enter(xid, ses->Suid,
 							     tcon->tid, full_path);
@@ -239,14 +238,13 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
 							  sizeof(struct smb311_posix_qinfo *) +
 							  (PATH_MAX * 2) +
 							  (sizeof(struct cifs_sid) * 2), 0, NULL);
-				if (!rc) {
-					smb2_set_next_command(tcon, &rqst[num_rqst]);
-					smb2_set_related(&rqst[num_rqst]);
-				}
 			}
-
-			if (rc)
+			if (!rc && (!cfile || num_rqst > 1)) {
+				smb2_set_next_command(tcon, &rqst[num_rqst]);
+				smb2_set_related(&rqst[num_rqst]);
+			} else if (rc) {
 				goto finished;
+			}
 			num_rqst++;
 			trace_smb3_posix_query_info_compound_enter(xid, ses->Suid,
 								   tcon->tid, full_path);
@@ -304,13 +302,13 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
 							FILE_END_OF_FILE_INFORMATION,
 							SMB2_O_INFO_FILE, 0,
 							data, size);
-				if (!rc) {
-					smb2_set_next_command(tcon, &rqst[num_rqst]);
-					smb2_set_related(&rqst[num_rqst]);
-				}
 			}
-			if (rc)
+			if (!rc && (!cfile || num_rqst > 1)) {
+				smb2_set_next_command(tcon, &rqst[num_rqst]);
+				smb2_set_related(&rqst[num_rqst]);
+			} else if (rc) {
 				goto finished;
+			}
 			num_rqst++;
 			trace_smb3_set_eof_enter(xid, ses->Suid, tcon->tid, full_path);
 			break;
@@ -335,14 +333,13 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
 							COMPOUND_FID, current->tgid,
 							FILE_BASIC_INFORMATION,
 							SMB2_O_INFO_FILE, 0, data, size);
-				if (!rc) {
-					smb2_set_next_command(tcon, &rqst[num_rqst]);
-					smb2_set_related(&rqst[num_rqst]);
-				}
 			}
-
-			if (rc)
+			if (!rc && (!cfile || num_rqst > 1)) {
+				smb2_set_next_command(tcon, &rqst[num_rqst]);
+				smb2_set_related(&rqst[num_rqst]);
+			} else if (rc) {
 				goto finished;
+			}
 			num_rqst++;
 			trace_smb3_set_info_compound_enter(xid, ses->Suid,
 							   tcon->tid, full_path);
@@ -376,13 +373,13 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
 							COMPOUND_FID, COMPOUND_FID,
 							current->tgid, FILE_RENAME_INFORMATION,
 							SMB2_O_INFO_FILE, 0, data, size);
-				if (!rc) {
-					smb2_set_next_command(tcon, &rqst[num_rqst]);
-					smb2_set_related(&rqst[num_rqst]);
-				}
 			}
-			if (rc)
+			if (!rc && (!cfile || num_rqst > 1)) {
+				smb2_set_next_command(tcon, &rqst[num_rqst]);
+				smb2_set_related(&rqst[num_rqst]);
+			} else if (rc) {
 				goto finished;
+			}
 			num_rqst++;
 			trace_smb3_rename_enter(xid, ses->Suid, tcon->tid, full_path);
 			break;
@@ -417,15 +414,27 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
 			rqst[num_rqst].rq_iov = vars->io_iov;
 			rqst[num_rqst].rq_nvec = ARRAY_SIZE(vars->io_iov);
 
-			rc = SMB2_ioctl_init(tcon, server, &rqst[num_rqst],
-					     COMPOUND_FID, COMPOUND_FID,
-					     FSCTL_SET_REPARSE_POINT,
-					     in_iov[i].iov_base,
-					     in_iov[i].iov_len, 0);
-			if (rc)
+			if (cfile) {
+				rc = SMB2_ioctl_init(tcon, server, &rqst[num_rqst],
+						     cfile->fid.persistent_fid,
+						     cfile->fid.volatile_fid,
+						     FSCTL_SET_REPARSE_POINT,
+						     in_iov[i].iov_base,
+						     in_iov[i].iov_len, 0);
+			} else {
+				rc = SMB2_ioctl_init(tcon, server, &rqst[num_rqst],
+						     COMPOUND_FID, COMPOUND_FID,
+						     FSCTL_SET_REPARSE_POINT,
+						     in_iov[i].iov_base,
+						     in_iov[i].iov_len, 0);
+			}
+			if (!rc && (!cfile || num_rqst > 1)) {
+				smb2_set_next_command(tcon, &rqst[num_rqst]);
+				smb2_set_related(&rqst[num_rqst]);
+			} else if (rc) {
 				goto finished;
-			smb2_set_next_command(tcon, &rqst[num_rqst]);
-			smb2_set_related(&rqst[num_rqst++]);
+			}
+			num_rqst++;
 			trace_smb3_set_reparse_compound_enter(xid, ses->Suid,
 							      tcon->tid, full_path);
 			break;
@@ -433,14 +442,25 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
 			rqst[num_rqst].rq_iov = vars->io_iov;
 			rqst[num_rqst].rq_nvec = ARRAY_SIZE(vars->io_iov);
 
-			rc = SMB2_ioctl_init(tcon, server, &rqst[num_rqst],
-					     COMPOUND_FID, COMPOUND_FID,
-					     FSCTL_GET_REPARSE_POINT,
-					     NULL, 0, CIFSMaxBufSize);
-			if (rc)
+			if (cfile) {
+				rc = SMB2_ioctl_init(tcon, server, &rqst[num_rqst],
+						     cfile->fid.persistent_fid,
+						     cfile->fid.volatile_fid,
+						     FSCTL_GET_REPARSE_POINT,
+						     NULL, 0, CIFSMaxBufSize);
+			} else {
+				rc = SMB2_ioctl_init(tcon, server, &rqst[num_rqst],
+						     COMPOUND_FID, COMPOUND_FID,
+						     FSCTL_GET_REPARSE_POINT,
+						     NULL, 0, CIFSMaxBufSize);
+			}
+			if (!rc && (!cfile || num_rqst > 1)) {
+				smb2_set_next_command(tcon, &rqst[num_rqst]);
+				smb2_set_related(&rqst[num_rqst]);
+			} else if (rc) {
 				goto finished;
-			smb2_set_next_command(tcon, &rqst[num_rqst]);
-			smb2_set_related(&rqst[num_rqst++]);
+			}
+			num_rqst++;
 			trace_smb3_get_reparse_compound_enter(xid, ses->Suid,
 							      tcon->tid, full_path);
 			break;

From 88b95f351a721d9c585eb7ece6bc4a38c5fdcdfd Mon Sep 17 00:00:00 2001
From: Paulo Alcantara <pc@manguebit.com>
Date: Thu, 25 Jan 2024 19:21:48 -0300
Subject: [PATCH 331/707] smb: client: reduce number of parameters in
 smb2_compound_op()

Replace @desired_access, @create_disposition, @create_options and
@mode parameters with a single @oparms.

No functional changes.

Signed-off-by: Paulo Alcantara <pc@manguebit.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
---
 fs/smb/client/cifsglob.h  |  11 ++++
 fs/smb/client/smb2inode.c | 125 +++++++++++++++++++++-----------------
 2 files changed, 81 insertions(+), 55 deletions(-)

diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index a4aa95439e18c6..23db1686f9104f 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -2269,6 +2269,17 @@ static inline void cifs_sg_set_buf(struct sg_table *sgtable,
 	}
 }
 
+#define CIFS_OPARMS(_cifs_sb, _tcon, _path, _da, _cd, _co, _mode) \
+	((struct cifs_open_parms) { \
+		.tcon = _tcon, \
+		.path = _path, \
+		.desired_access = (_da), \
+		.disposition = (_cd), \
+		.create_options = cifs_create_options(_cifs_sb, (_co)), \
+		.mode = (_mode), \
+		.cifs_sb = _cifs_sb, \
+	})
+
 struct smb2_compound_vars {
 	struct cifs_open_parms oparms;
 	struct kvec rsp_iov[MAX_COMPOUND];
diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c
index 11de1318d8e0f5..93ca39b4be4d29 100644
--- a/fs/smb/client/smb2inode.c
+++ b/fs/smb/client/smb2inode.c
@@ -95,8 +95,7 @@ static int parse_posix_sids(struct cifs_open_info_data *data,
  */
 static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
 			    struct cifs_sb_info *cifs_sb, const char *full_path,
-			    __u32 desired_access, __u32 create_disposition,
-			    __u32 create_options, umode_t mode, struct kvec *in_iov,
+			    struct cifs_open_parms *oparms, struct kvec *in_iov,
 			    int *cmds, int num_cmds, struct cifsFileInfo *cfile,
 			    struct kvec *out_iov, int *out_buftype)
 {
@@ -152,16 +151,8 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
 		goto finished;
 	}
 
-	vars->oparms = (struct cifs_open_parms) {
-		.tcon = tcon,
-		.path = full_path,
-		.desired_access = desired_access,
-		.disposition = create_disposition,
-		.create_options = cifs_create_options(cifs_sb, create_options),
-		.fid = &fid,
-		.mode = mode,
-		.cifs_sb = cifs_sb,
-	};
+	vars->oparms = *oparms;
+	vars->oparms.fid = &fid;
 
 	rqst[num_rqst].rq_iov = &vars->open_iov[0];
 	rqst[num_rqst].rq_nvec = SMB2_CREATE_IOV_SIZE;
@@ -713,6 +704,7 @@ int smb2_query_path_info(const unsigned int xid,
 			 const char *full_path,
 			 struct cifs_open_info_data *data)
 {
+	struct cifs_open_parms oparms;
 	__u32 create_options = 0;
 	struct cifsFileInfo *cfile;
 	struct cached_fid *cfid = NULL;
@@ -764,10 +756,11 @@ int smb2_query_path_info(const unsigned int xid,
 	in_iov[1] = in_iov[0];
 
 	cifs_get_readable_path(tcon, full_path, &cfile);
+	oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, FILE_READ_ATTRIBUTES,
+			     FILE_OPEN, create_options, ACL_NO_MODE);
 	rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
-			      FILE_READ_ATTRIBUTES, FILE_OPEN,
-			      create_options, ACL_NO_MODE, in_iov,
-			      cmds, 1, cfile, out_iov, out_buftype);
+			      &oparms, in_iov, cmds, 1, cfile,
+			      out_iov, out_buftype);
 	hdr = out_iov[0].iov_base;
 	/*
 	 * If first iov is unset, then SMB session was dropped or we've got a
@@ -794,12 +787,14 @@ int smb2_query_path_info(const unsigned int xid,
 			cmds[1] = SMB2_OP_GET_REPARSE;
 			num_cmds = 2;
 		}
-		create_options |= OPEN_REPARSE_POINT;
+		oparms = CIFS_OPARMS(cifs_sb, tcon, full_path,
+				     FILE_READ_ATTRIBUTES, FILE_OPEN,
+				     create_options | OPEN_REPARSE_POINT,
+				     ACL_NO_MODE);
 		cifs_get_readable_path(tcon, full_path, &cfile);
 		rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
-				      FILE_READ_ATTRIBUTES, FILE_OPEN,
-				      create_options, ACL_NO_MODE, in_iov,
-				      cmds, num_cmds, cfile, NULL, NULL);
+				      &oparms, in_iov, cmds, num_cmds,
+				      cfile, NULL, NULL);
 		break;
 	case -EREMOTE:
 		break;
@@ -827,10 +822,13 @@ smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode,
 	   struct cifs_tcon *tcon, const char *name,
 	   struct cifs_sb_info *cifs_sb)
 {
-	return smb2_compound_op(xid, tcon, cifs_sb, name,
-				FILE_WRITE_ATTRIBUTES, FILE_CREATE,
-				CREATE_NOT_FILE, mode,
-				NULL, &(int){SMB2_OP_MKDIR}, 1,
+	struct cifs_open_parms oparms;
+
+	oparms = CIFS_OPARMS(cifs_sb, tcon, name, FILE_WRITE_ATTRIBUTES,
+			     FILE_CREATE, CREATE_NOT_FILE, mode);
+	return smb2_compound_op(xid, tcon, cifs_sb,
+				name, &oparms, NULL,
+				&(int){SMB2_OP_MKDIR}, 1,
 				NULL, NULL, NULL);
 }
 
@@ -839,6 +837,7 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name,
 		   struct cifs_sb_info *cifs_sb, struct cifs_tcon *tcon,
 		   const unsigned int xid)
 {
+	struct cifs_open_parms oparms;
 	FILE_BASIC_INFO data = {};
 	struct cifsInodeInfo *cifs_i;
 	struct cifsFileInfo *cfile;
@@ -852,9 +851,10 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name,
 	dosattrs = cifs_i->cifsAttrs | ATTR_READONLY;
 	data.Attributes = cpu_to_le32(dosattrs);
 	cifs_get_writable_path(tcon, name, FIND_WR_ANY, &cfile);
+	oparms = CIFS_OPARMS(cifs_sb, tcon, name, FILE_WRITE_ATTRIBUTES,
+			     FILE_CREATE, CREATE_NOT_FILE, ACL_NO_MODE);
 	tmprc = smb2_compound_op(xid, tcon, cifs_sb, name,
-				 FILE_WRITE_ATTRIBUTES, FILE_CREATE,
-				 CREATE_NOT_FILE, ACL_NO_MODE, &in_iov,
+				 &oparms, &in_iov,
 				 &(int){SMB2_OP_SET_INFO}, 1,
 				 cfile, NULL, NULL);
 	if (tmprc == 0)
@@ -865,10 +865,13 @@ int
 smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
 	   struct cifs_sb_info *cifs_sb)
 {
+	struct cifs_open_parms oparms;
+
 	drop_cached_dir_by_name(xid, tcon, name, cifs_sb);
-	return smb2_compound_op(xid, tcon, cifs_sb, name,
-				DELETE, FILE_OPEN, CREATE_NOT_FILE,
-				ACL_NO_MODE, NULL,
+	oparms = CIFS_OPARMS(cifs_sb, tcon, name, DELETE,
+			     FILE_OPEN, CREATE_NOT_FILE, ACL_NO_MODE);
+	return smb2_compound_op(xid, tcon, cifs_sb,
+				name, &oparms, NULL,
 				&(int){SMB2_OP_RMDIR}, 1,
 				NULL, NULL, NULL);
 }
@@ -877,10 +880,14 @@ int
 smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
 	    struct cifs_sb_info *cifs_sb)
 {
-	return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
-				CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT,
-				ACL_NO_MODE, NULL,
-				&(int){SMB2_OP_DELETE}, 1,
+	struct cifs_open_parms oparms;
+
+	oparms = CIFS_OPARMS(cifs_sb, tcon, name,
+			     DELETE, FILE_OPEN,
+			     CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT,
+			     ACL_NO_MODE);
+	return smb2_compound_op(xid, tcon, cifs_sb, name, &oparms,
+				NULL, &(int){SMB2_OP_DELETE}, 1,
 				NULL, NULL, NULL);
 }
 
@@ -890,6 +897,7 @@ static int smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon,
 			      __u32 create_options, __u32 access,
 			      int command, struct cifsFileInfo *cfile)
 {
+	struct cifs_open_parms oparms;
 	struct kvec in_iov;
 	__le16 *smb2_to_name = NULL;
 	int rc;
@@ -901,8 +909,9 @@ static int smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon,
 	}
 	in_iov.iov_base = smb2_to_name;
 	in_iov.iov_len = 2 * UniStrnlen((wchar_t *)smb2_to_name, PATH_MAX);
-	rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access,
-			      FILE_OPEN, create_options, ACL_NO_MODE,
+	oparms = CIFS_OPARMS(cifs_sb, tcon, from_name, access, FILE_OPEN,
+			     create_options, ACL_NO_MODE);
+	rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, &oparms,
 			      &in_iov, &command, 1, cfile, NULL, NULL);
 smb2_rename_path:
 	kfree(smb2_to_name);
@@ -943,6 +952,7 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon,
 		   const char *full_path, __u64 size,
 		   struct cifs_sb_info *cifs_sb, bool set_alloc)
 {
+	struct cifs_open_parms oparms;
 	struct cifsFileInfo *cfile;
 	struct kvec in_iov;
 	__le64 eof = cpu_to_le64(size);
@@ -950,9 +960,10 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon,
 	in_iov.iov_base = &eof;
 	in_iov.iov_len = sizeof(eof);
 	cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile);
-	return smb2_compound_op(xid, tcon, cifs_sb, full_path,
-				FILE_WRITE_DATA, FILE_OPEN,
-				0, ACL_NO_MODE, &in_iov,
+	oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, FILE_WRITE_DATA,
+			     FILE_OPEN, 0, ACL_NO_MODE);
+	return smb2_compound_op(xid, tcon, cifs_sb,
+				full_path, &oparms, &in_iov,
 				&(int){SMB2_OP_SET_EOF}, 1,
 				cfile, NULL, NULL);
 }
@@ -961,6 +972,7 @@ int
 smb2_set_file_info(struct inode *inode, const char *full_path,
 		   FILE_BASIC_INFO *buf, const unsigned int xid)
 {
+	struct cifs_open_parms oparms;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 	struct tcon_link *tlink;
 	struct cifs_tcon *tcon;
@@ -979,9 +991,10 @@ smb2_set_file_info(struct inode *inode, const char *full_path,
 	tcon = tlink_tcon(tlink);
 
 	cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile);
-	rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
-			      FILE_WRITE_ATTRIBUTES, FILE_OPEN,
-			      0, ACL_NO_MODE, &in_iov,
+	oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, FILE_WRITE_ATTRIBUTES,
+			     FILE_OPEN, 0, ACL_NO_MODE);
+	rc = smb2_compound_op(xid, tcon, cifs_sb,
+			      full_path, &oparms, &in_iov,
 			      &(int){SMB2_OP_SET_INFO}, 1,
 			      cfile, NULL, NULL);
 	cifs_put_tlink(tlink);
@@ -995,19 +1008,21 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data,
 				     const char *full_path,
 				     struct kvec *iov)
 {
+	struct cifs_open_parms oparms;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 	struct cifsFileInfo *cfile;
 	struct inode *new = NULL;
 	struct kvec in_iov[2];
 	int cmds[2];
-	int da, co, cd;
 	int rc;
 
-	da = SYNCHRONIZE | DELETE |
-		FILE_READ_ATTRIBUTES |
-		FILE_WRITE_ATTRIBUTES;
-	co = CREATE_NOT_DIR | OPEN_REPARSE_POINT;
-	cd = FILE_CREATE;
+	oparms = CIFS_OPARMS(cifs_sb, tcon, full_path,
+			     SYNCHRONIZE | DELETE |
+			     FILE_READ_ATTRIBUTES |
+			     FILE_WRITE_ATTRIBUTES,
+			     FILE_CREATE,
+			     CREATE_NOT_DIR | OPEN_REPARSE_POINT,
+			     ACL_NO_MODE);
 	cmds[0] = SMB2_OP_SET_REPARSE;
 	in_iov[0] = *iov;
 	in_iov[1].iov_base = data;
@@ -1016,9 +1031,8 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data,
 	if (tcon->posix_extensions) {
 		cmds[1] = SMB2_OP_POSIX_QUERY_INFO;
 		cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile);
-		rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
-				      da, cd, co, ACL_NO_MODE, in_iov,
-				      cmds, 2, cfile, NULL, NULL);
+		rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, &oparms,
+				      in_iov, cmds, 2, cfile, NULL, NULL);
 		if (!rc) {
 			rc = smb311_posix_get_inode_info(&new, full_path,
 							 data, sb, xid);
@@ -1026,9 +1040,8 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data,
 	} else {
 		cmds[1] = SMB2_OP_QUERY_INFO;
 		cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile);
-		rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
-				      da, cd, co, ACL_NO_MODE, in_iov,
-				      cmds, 2, cfile, NULL, NULL);
+		rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, &oparms,
+				      in_iov, cmds, 2, cfile, NULL, NULL);
 		if (!rc) {
 			rc = cifs_get_inode_info(&new, full_path,
 						 data, sb, xid, NULL);
@@ -1044,6 +1057,7 @@ int smb2_query_reparse_point(const unsigned int xid,
 			     u32 *tag, struct kvec *rsp,
 			     int *rsp_buftype)
 {
+	struct cifs_open_parms oparms;
 	struct cifs_open_info_data data = {};
 	struct cifsFileInfo *cfile;
 	struct kvec in_iov = { .iov_base = &data, .iov_len = sizeof(data), };
@@ -1052,9 +1066,10 @@ int smb2_query_reparse_point(const unsigned int xid,
 	cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
 
 	cifs_get_readable_path(tcon, full_path, &cfile);
-	rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
-			      FILE_READ_ATTRIBUTES, FILE_OPEN,
-			      OPEN_REPARSE_POINT, ACL_NO_MODE, &in_iov,
+	oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, FILE_READ_ATTRIBUTES,
+			     FILE_OPEN, OPEN_REPARSE_POINT, ACL_NO_MODE);
+	rc = smb2_compound_op(xid, tcon, cifs_sb,
+			      full_path, &oparms, &in_iov,
 			      &(int){SMB2_OP_GET_REPARSE}, 1,
 			      cfile, NULL, NULL);
 	if (rc)

From 3815959fe70011dea1589df4f90623ade113c5ad Mon Sep 17 00:00:00 2001
From: Paulo Alcantara <pc@manguebit.com>
Date: Fri, 26 Jan 2024 19:26:06 -0300
Subject: [PATCH 332/707] smb: client: add support for WSL reparse points

Add support for creating special files via WSL reparse points when
using 'reparse=wsl' mount option.  They're faster than NFS reparse
points because they don't require extra roundtrips to figure out what
->d_type a specific dirent is as such information is already stored in
query dir responses and then making getdents() calls faster.

Signed-off-by: Paulo Alcantara <pc@manguebit.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
---
 fs/smb/client/cifsglob.h   |   1 +
 fs/smb/client/fs_context.c |   4 +-
 fs/smb/client/reparse.c    | 170 +++++++++++++++++++++++++++++++++++--
 fs/smb/client/reparse.h    |  13 ++-
 fs/smb/client/smb2inode.c  |   8 +-
 fs/smb/client/smb2ops.c    |   2 +-
 fs/smb/client/smb2pdu.c    |  12 +++
 fs/smb/client/smb2pdu.h    |  11 ++-
 fs/smb/client/smb2proto.h  |   3 +-
 fs/smb/common/smbfsctl.h   |   6 --
 10 files changed, 210 insertions(+), 20 deletions(-)

diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 23db1686f9104f..90bc90ed97ef66 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -1371,6 +1371,7 @@ struct cifs_open_parms {
 	struct cifs_fid *fid;
 	umode_t mode;
 	bool reconnect:1;
+	struct kvec *ea_cctx;
 };
 
 struct cifs_fid {
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index aee8be2cae4676..82eafe0815dc52 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -317,8 +317,8 @@ static int parse_reparse_flavor(struct fs_context *fc, char *value,
 		ctx->reparse_type = CIFS_REPARSE_TYPE_NFS;
 		break;
 	case Opt_reparse_wsl:
-		cifs_errorf(fc, "unsupported reparse= option: %s\n", value);
-		return 1;
+		ctx->reparse_type = CIFS_REPARSE_TYPE_WSL;
+		break;
 	default:
 		cifs_errorf(fc, "bad reparse= option: %s\n", value);
 		return 1;
diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c
index 5ce9d1ed5b8881..23e0cb62552ec7 100644
--- a/fs/smb/client/reparse.c
+++ b/fs/smb/client/reparse.c
@@ -11,6 +11,7 @@
 #include "cifsproto.h"
 #include "cifs_unicode.h"
 #include "cifs_debug.h"
+#include "fs_context.h"
 #include "reparse.h"
 
 int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode,
@@ -64,7 +65,7 @@ int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode,
 	iov.iov_base = buf;
 	iov.iov_len = len;
 	new = smb2_get_reparse_inode(&data, inode->i_sb, xid,
-				     tcon, full_path, &iov);
+				     tcon, full_path, &iov, NULL);
 	if (!IS_ERR(new))
 		d_instantiate(dentry, new);
 	else
@@ -110,9 +111,9 @@ static int nfs_set_reparse_buf(struct reparse_posix_data *buf,
 	return 0;
 }
 
-int smb2_make_nfs_node(unsigned int xid, struct inode *inode,
-		       struct dentry *dentry, struct cifs_tcon *tcon,
-		       const char *full_path, umode_t mode, dev_t dev)
+static int mknod_nfs(unsigned int xid, struct inode *inode,
+		     struct dentry *dentry, struct cifs_tcon *tcon,
+		     const char *full_path, umode_t mode, dev_t dev)
 {
 	struct cifs_open_info_data data;
 	struct reparse_posix_data *p;
@@ -132,12 +133,171 @@ int smb2_make_nfs_node(unsigned int xid, struct inode *inode,
 	};
 
 	new = smb2_get_reparse_inode(&data, inode->i_sb, xid,
-				     tcon, full_path, &iov);
+				     tcon, full_path, &iov, NULL);
+	if (!IS_ERR(new))
+		d_instantiate(dentry, new);
+	else
+		rc = PTR_ERR(new);
+	cifs_free_open_info(&data);
+	return rc;
+}
+
+static int wsl_set_reparse_buf(struct reparse_data_buffer *buf,
+			       mode_t mode, struct kvec *iov)
+{
+	u32 tag;
+
+	switch ((tag = reparse_mode_wsl_tag(mode))) {
+	case IO_REPARSE_TAG_LX_BLK:
+	case IO_REPARSE_TAG_LX_CHR:
+	case IO_REPARSE_TAG_LX_FIFO:
+	case IO_REPARSE_TAG_AF_UNIX:
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	buf->ReparseTag = cpu_to_le32(tag);
+	buf->Reserved = 0;
+	buf->ReparseDataLength = 0;
+	iov->iov_base = buf;
+	iov->iov_len = sizeof(*buf);
+	return 0;
+}
+
+static struct smb2_create_ea_ctx *ea_create_context(u32 dlen, size_t *cc_len)
+{
+	struct smb2_create_ea_ctx *cc;
+
+	*cc_len = round_up(sizeof(*cc) + dlen, 8);
+	cc = kzalloc(*cc_len, GFP_KERNEL);
+	if (!cc)
+		return ERR_PTR(-ENOMEM);
+
+	cc->ctx.NameOffset = cpu_to_le16(offsetof(struct smb2_create_ea_ctx,
+						  name));
+	cc->ctx.NameLength = cpu_to_le16(4);
+	memcpy(cc->name, SMB2_CREATE_EA_BUFFER, strlen(SMB2_CREATE_EA_BUFFER));
+	cc->ctx.DataOffset = cpu_to_le16(offsetof(struct smb2_create_ea_ctx, ea));
+	cc->ctx.DataLength = cpu_to_le32(dlen);
+	return cc;
+}
+
+struct wsl_xattr {
+	const char	*name;
+	__le64		value;
+	u16		size;
+	u32		next;
+};
+
+static int wsl_set_xattrs(struct inode *inode, umode_t _mode,
+			  dev_t _dev, struct kvec *iov)
+{
+	struct smb2_file_full_ea_info *ea;
+	struct smb2_create_ea_ctx *cc;
+	struct smb3_fs_context *ctx = CIFS_SB(inode->i_sb)->ctx;
+	__le64 uid = cpu_to_le64(from_kuid(current_user_ns(), ctx->linux_uid));
+	__le64 gid = cpu_to_le64(from_kgid(current_user_ns(), ctx->linux_gid));
+	__le64 dev = cpu_to_le64(((u64)MINOR(_dev) << 32) | MAJOR(_dev));
+	__le64 mode = cpu_to_le64(_mode);
+	struct wsl_xattr xattrs[] = {
+		{ .name = "$LXUID", .value = uid, .size = 4, },
+		{ .name = "$LXGID", .value = gid, .size = 4, },
+		{ .name = "$LXMOD", .value = mode, .size = 4, },
+		{ .name = "$LXDEV", .value = dev, .size = 8, },
+	};
+	size_t cc_len;
+	u32 dlen = 0, next = 0;
+	int i, num_xattrs;
+	u8 name_size = strlen(xattrs[0].name) + 1;
+
+	memset(iov, 0, sizeof(*iov));
+
+	/* Exclude $LXDEV xattr for sockets and fifos */
+	if (S_ISSOCK(_mode) || S_ISFIFO(_mode))
+		num_xattrs = ARRAY_SIZE(xattrs) - 1;
+	else
+		num_xattrs = ARRAY_SIZE(xattrs);
+
+	for (i = 0; i < num_xattrs; i++) {
+		xattrs[i].next = ALIGN(sizeof(*ea) + name_size +
+				       xattrs[i].size, 4);
+		dlen += xattrs[i].next;
+	}
+
+	cc = ea_create_context(dlen, &cc_len);
+	if (!cc)
+		return PTR_ERR(cc);
+
+	ea = &cc->ea;
+	for (i = 0; i < num_xattrs; i++) {
+		ea = (void *)((u8 *)ea + next);
+		next = xattrs[i].next;
+		ea->next_entry_offset = cpu_to_le32(next);
+
+		ea->ea_name_length = name_size - 1;
+		ea->ea_value_length = cpu_to_le16(xattrs[i].size);
+		memcpy(ea->ea_data, xattrs[i].name, name_size);
+		memcpy(&ea->ea_data[name_size],
+		       &xattrs[i].value, xattrs[i].size);
+	}
+	ea->next_entry_offset = 0;
+
+	iov->iov_base = cc;
+	iov->iov_len = cc_len;
+	return 0;
+}
+
+static int mknod_wsl(unsigned int xid, struct inode *inode,
+		     struct dentry *dentry, struct cifs_tcon *tcon,
+		     const char *full_path, umode_t mode, dev_t dev)
+{
+	struct cifs_open_info_data data;
+	struct reparse_data_buffer buf;
+	struct inode *new;
+	struct kvec reparse_iov, xattr_iov;
+	int rc;
+
+	rc = wsl_set_reparse_buf(&buf, mode, &reparse_iov);
+	if (rc)
+		return rc;
+
+	rc = wsl_set_xattrs(inode, mode, dev, &xattr_iov);
+	if (rc)
+		return rc;
+
+	data = (struct cifs_open_info_data) {
+		.reparse_point = true,
+		.reparse = { .tag = le32_to_cpu(buf.ReparseTag), .buf = &buf, },
+	};
+
+	new = smb2_get_reparse_inode(&data, inode->i_sb,
+				     xid, tcon, full_path,
+				     &reparse_iov, &xattr_iov);
 	if (!IS_ERR(new))
 		d_instantiate(dentry, new);
 	else
 		rc = PTR_ERR(new);
 	cifs_free_open_info(&data);
+	kfree(xattr_iov.iov_base);
+	return rc;
+}
+
+int smb2_mknod_reparse(unsigned int xid, struct inode *inode,
+		       struct dentry *dentry, struct cifs_tcon *tcon,
+		       const char *full_path, umode_t mode, dev_t dev)
+{
+	struct smb3_fs_context *ctx = CIFS_SB(inode->i_sb)->ctx;
+	int rc = -EOPNOTSUPP;
+
+	switch (ctx->reparse_type) {
+	case CIFS_REPARSE_TYPE_NFS:
+		rc = mknod_nfs(xid, inode, dentry, tcon, full_path, mode, dev);
+		break;
+	case CIFS_REPARSE_TYPE_WSL:
+		rc = mknod_wsl(xid, inode, dentry, tcon, full_path, mode, dev);
+		break;
+	}
 	return rc;
 }
 
diff --git a/fs/smb/client/reparse.h b/fs/smb/client/reparse.h
index 3ceb90da0df901..9816bac9855257 100644
--- a/fs/smb/client/reparse.h
+++ b/fs/smb/client/reparse.h
@@ -28,6 +28,17 @@ static inline u64 reparse_mode_nfs_type(mode_t mode)
 	return 0;
 }
 
+static inline u32 reparse_mode_wsl_tag(mode_t mode)
+{
+	switch (mode & S_IFMT) {
+	case S_IFBLK: return IO_REPARSE_TAG_LX_BLK;
+	case S_IFCHR: return IO_REPARSE_TAG_LX_CHR;
+	case S_IFIFO: return IO_REPARSE_TAG_LX_FIFO;
+	case S_IFSOCK: return IO_REPARSE_TAG_AF_UNIX;
+	}
+	return 0;
+}
+
 /*
  * Match a reparse point inode if reparse tag and ctime haven't changed.
  *
@@ -64,7 +75,7 @@ bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
 int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode,
 				struct dentry *dentry, struct cifs_tcon *tcon,
 				const char *full_path, const char *symname);
-int smb2_make_nfs_node(unsigned int xid, struct inode *inode,
+int smb2_mknod_reparse(unsigned int xid, struct inode *inode,
 		       struct dentry *dentry, struct cifs_tcon *tcon,
 		       const char *full_path, umode_t mode, dev_t dev);
 int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb, struct kvec *rsp_iov,
diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c
index 93ca39b4be4d29..e9955b964ea470 100644
--- a/fs/smb/client/smb2inode.c
+++ b/fs/smb/client/smb2inode.c
@@ -1006,7 +1006,8 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data,
 				     const unsigned int xid,
 				     struct cifs_tcon *tcon,
 				     const char *full_path,
-				     struct kvec *iov)
+				     struct kvec *reparse_iov,
+				     struct kvec *xattr_iov)
 {
 	struct cifs_open_parms oparms;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
@@ -1023,8 +1024,11 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data,
 			     FILE_CREATE,
 			     CREATE_NOT_DIR | OPEN_REPARSE_POINT,
 			     ACL_NO_MODE);
+	if (xattr_iov)
+		oparms.ea_cctx = xattr_iov;
+
 	cmds[0] = SMB2_OP_SET_REPARSE;
-	in_iov[0] = *iov;
+	in_iov[0] = *reparse_iov;
 	in_iov[1].iov_base = data;
 	in_iov[1].iov_len = sizeof(*data);
 
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index aee0cbf01e6cdc..708973ec750b8f 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -5039,7 +5039,7 @@ static int smb2_make_node(unsigned int xid, struct inode *inode,
 		rc = cifs_sfu_make_node(xid, inode, dentry, tcon,
 					full_path, mode, dev);
 	} else {
-		rc = smb2_make_nfs_node(xid, inode, dentry, tcon,
+		rc = smb2_mknod_reparse(xid, inode, dentry, tcon,
 					full_path, mode, dev);
 	}
 	return rc;
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 273e24f9da1316..2837fc4465a7ff 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -2707,6 +2707,17 @@ add_query_id_context(struct kvec *iov, unsigned int *num_iovec)
 	return 0;
 }
 
+static void add_ea_context(struct cifs_open_parms *oparms,
+			   struct kvec *rq_iov, unsigned int *num_iovs)
+{
+	struct kvec *iov = oparms->ea_cctx;
+
+	if (iov && iov->iov_base && iov->iov_len) {
+		rq_iov[(*num_iovs)++] = *iov;
+		memset(iov, 0, sizeof(*iov));
+	}
+}
+
 static int
 alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len,
 			    const char *treename, const __le16 *path)
@@ -3073,6 +3084,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
 	}
 
 	add_query_id_context(iov, &n_iov);
+	add_ea_context(oparms, iov, &n_iov);
 
 	if (n_iov > 2) {
 		/*
diff --git a/fs/smb/client/smb2pdu.h b/fs/smb/client/smb2pdu.h
index db08194484e06c..ea63d33e455322 100644
--- a/fs/smb/client/smb2pdu.h
+++ b/fs/smb/client/smb2pdu.h
@@ -117,9 +117,10 @@ struct share_redirect_error_context_rsp {
  * [4] : posix context
  * [5] : time warp context
  * [6] : query id context
- * [7] : compound padding
+ * [7] : create ea context
+ * [8] : compound padding
  */
-#define SMB2_CREATE_IOV_SIZE 8
+#define SMB2_CREATE_IOV_SIZE 9
 
 /*
  * Maximum size of a SMB2_CREATE response is 64 (smb2 header) +
@@ -413,4 +414,10 @@ struct smb2_posix_info_parsed {
 	const u8 *name;
 };
 
+struct smb2_create_ea_ctx {
+	struct create_context ctx;
+	__u8 name[8];
+	struct smb2_file_full_ea_info ea;
+} __packed;
+
 #endif				/* _SMB2PDU_H */
diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h
index 5322b769bca21c..d910d38671ee86 100644
--- a/fs/smb/client/smb2proto.h
+++ b/fs/smb/client/smb2proto.h
@@ -61,7 +61,8 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data,
 				     const unsigned int xid,
 				     struct cifs_tcon *tcon,
 				     const char *full_path,
-				     struct kvec *iov);
+				     struct kvec *reparse_iov,
+				     struct kvec *xattr_iov);
 int smb2_query_reparse_point(const unsigned int xid,
 			     struct cifs_tcon *tcon,
 			     struct cifs_sb_info *cifs_sb,
diff --git a/fs/smb/common/smbfsctl.h b/fs/smb/common/smbfsctl.h
index edd7fc2a7921b8..a94d658b88e86b 100644
--- a/fs/smb/common/smbfsctl.h
+++ b/fs/smb/common/smbfsctl.h
@@ -158,12 +158,6 @@
 #define IO_REPARSE_TAG_LX_CHR	     0x80000025
 #define IO_REPARSE_TAG_LX_BLK	     0x80000026
 
-#define IO_REPARSE_TAG_LX_SYMLINK_LE	cpu_to_le32(0xA000001D)
-#define IO_REPARSE_TAG_AF_UNIX_LE	cpu_to_le32(0x80000023)
-#define IO_REPARSE_TAG_LX_FIFO_LE	cpu_to_le32(0x80000024)
-#define IO_REPARSE_TAG_LX_CHR_LE	cpu_to_le32(0x80000025)
-#define IO_REPARSE_TAG_LX_BLK_LE	cpu_to_le32(0x80000026)
-
 /* fsctl flags */
 /* If Flags is set to this value, the request is an FSCTL not ioctl request */
 #define SMB2_0_IOCTL_IS_FSCTL		0x00000001

From 27931312fd50069dcad71e9660a84417b36224b3 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Mon, 10 Apr 2023 21:04:50 +0900
Subject: [PATCH 333/707] sysv: don't call sb_bread() with pointers_lock held

syzbot is reporting sleep in atomic context in SysV filesystem [1], for
sb_bread() is called with rw_spinlock held.

A "write_lock(&pointers_lock) => read_lock(&pointers_lock) deadlock" bug
and a "sb_bread() with write_lock(&pointers_lock)" bug were introduced by
"Replace BKL for chain locking with sysvfs-private rwlock" in Linux 2.5.12.

Then, "[PATCH] err1-40: sysvfs locking fix" in Linux 2.6.8 fixed the
former bug by moving pointers_lock lock to the callers, but instead
introduced a "sb_bread() with read_lock(&pointers_lock)" bug (which made
this problem easier to hit).

Al Viro suggested that why not to do like get_branch()/get_block()/
find_shared() in Minix filesystem does. And doing like that is almost a
revert of "[PATCH] err1-40: sysvfs locking fix" except that get_branch()
 from with find_shared() is called without write_lock(&pointers_lock).

Reported-by: syzbot <syzbot+69b40dc5fd40f32c199f@syzkaller.appspotmail.com>
Link: https://syzkaller.appspot.com/bug?extid=69b40dc5fd40f32c199f
Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Link: https://lore.kernel.org/r/0d195f93-a22a-49a2-0020-103534d6f7f6@I-love.SAKURA.ne.jp
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/sysv/itree.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 410ab2a44d2f60..19bcb51a220366 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -83,9 +83,6 @@ static inline sysv_zone_t *block_end(struct buffer_head *bh)
 	return (sysv_zone_t*)((char*)bh->b_data + bh->b_size);
 }
 
-/*
- * Requires read_lock(&pointers_lock) or write_lock(&pointers_lock)
- */
 static Indirect *get_branch(struct inode *inode,
 			    int depth,
 			    int offsets[],
@@ -105,15 +102,18 @@ static Indirect *get_branch(struct inode *inode,
 		bh = sb_bread(sb, block);
 		if (!bh)
 			goto failure;
+		read_lock(&pointers_lock);
 		if (!verify_chain(chain, p))
 			goto changed;
 		add_chain(++p, bh, (sysv_zone_t*)bh->b_data + *++offsets);
+		read_unlock(&pointers_lock);
 		if (!p->key)
 			goto no_block;
 	}
 	return NULL;
 
 changed:
+	read_unlock(&pointers_lock);
 	brelse(bh);
 	*err = -EAGAIN;
 	goto no_block;
@@ -219,9 +219,7 @@ static int get_block(struct inode *inode, sector_t iblock, struct buffer_head *b
 		goto out;
 
 reread:
-	read_lock(&pointers_lock);
 	partial = get_branch(inode, depth, offsets, chain, &err);
-	read_unlock(&pointers_lock);
 
 	/* Simplest case - block found, no allocation needed */
 	if (!partial) {
@@ -291,9 +289,9 @@ static Indirect *find_shared(struct inode *inode,
 	*top = 0;
 	for (k = depth; k > 1 && !offsets[k-1]; k--)
 		;
+	partial = get_branch(inode, k, offsets, chain, &err);
 
 	write_lock(&pointers_lock);
-	partial = get_branch(inode, k, offsets, chain, &err);
 	if (!partial)
 		partial = chain + k-1;
 	/*

From 994fe04908b762031eaadd6bcaf343e85f3cf163 Mon Sep 17 00:00:00 2001
From: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Date: Mon, 29 Jan 2024 19:00:23 +0100
Subject: [PATCH 334/707] ntfs3: use file_mnt_idmap helper

Let's use file_mnt_idmap() as we do that across the tree.

No functional impact.

Cc: Christian Brauner <brauner@kernel.org>
Cc: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
Cc:  <ntfs3@lists.linux.dev>
Cc:  <linux-fsdevel@vger.kernel.org>
Cc:  <linux-kernel@vger.kernel.org>
Signed-off-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Link: https://lore.kernel.org/r/20240129180024.219766-1-aleksandr.mikhalitsyn@canonical.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/ntfs3/namei.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c
index ee3093be51701e..144aa80cca433e 100644
--- a/fs/ntfs3/namei.c
+++ b/fs/ntfs3/namei.c
@@ -419,7 +419,7 @@ static int ntfs_atomic_open(struct inode *dir, struct dentry *dentry,
 	 * fnd contains tree's path to insert to.
 	 * If fnd is not NULL then dir is locked.
 	 */
-	inode = ntfs_create_inode(mnt_idmap(file->f_path.mnt), dir, dentry, uni,
+	inode = ntfs_create_inode(file_mnt_idmap(file), dir, dentry, uni,
 				  mode, 0, NULL, 0, fnd);
 	err = IS_ERR(inode) ? PTR_ERR(inode) :
 			      finish_open(file, dentry, ntfs_file_open);

From 563bd99dc1910e62b4d99eb1906bc8858acf483b Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 29 Jan 2024 10:37:29 -0800
Subject: [PATCH 335/707] iov_iter: Avoid wrap-around instrumentation in
 copy_compat_iovec_from_user()

The loop counter "i" in copy_compat_iovec_from_user() is an int, but
because the nr_segs argument is unsigned long, the signed overflow
sanitizer got worried "i" could wrap around. Instead of making "i" an
unsigned long (which may enlarge the type size), switch both nr_segs
and i to u32. There is no truncation with nr_segs since it is never
larger than UIO_MAXIOV anyway. This keeps sanitizer instrumentation[1]
out of a UACCESS path:

vmlinux.o: warning: objtool: copy_compat_iovec_from_user+0xa9: call to __ubsan_handle_add_overflow() with UACCESS enabled

Link: https://github.com/KSPP/linux/issues/26 [1]
Cc: Christian Brauner <brauner@kernel.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20240129183729.work.991-kees@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 lib/iov_iter.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 15f5040709c36e..73715d10c812bf 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1167,11 +1167,12 @@ const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
 EXPORT_SYMBOL(dup_iter);
 
 static __noclone int copy_compat_iovec_from_user(struct iovec *iov,
-		const struct iovec __user *uvec, unsigned long nr_segs)
+		const struct iovec __user *uvec, u32 nr_segs)
 {
 	const struct compat_iovec __user *uiov =
 		(const struct compat_iovec __user *)uvec;
-	int ret = -EFAULT, i;
+	int ret = -EFAULT;
+	u32 i;
 
 	if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
 		return -EFAULT;

From ee393cff29bd46cbf469abbfb795857ffc96b98b Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 29 Jan 2024 10:40:15 -0800
Subject: [PATCH 336/707] select: Avoid wrap-around instrumentation in
 do_sys_poll()

The mix of int, unsigned int, and unsigned long used by struct
poll_list::len, todo, len, and j meant that the signed overflow
sanitizer got worried it needed to instrument several places where
arithmetic happens between these variables. Since all of the variables
are always positive and bounded by unsigned int, use a single type in
all places. Additionally expand the zero-test into an explicit range
check before updating "todo".

This keeps sanitizer instrumentation[1] out of a UACCESS path:

vmlinux.o: warning: objtool: do_sys_poll+0x285: call to __ubsan_handle_sub_overflow() with UACCESS enabled

Link: https://github.com/KSPP/linux/issues/26 [1]
Cc: Christian Brauner <brauner@kernel.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Jan Kara <jack@suse.cz>
Cc:  <linux-fsdevel@vger.kernel.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20240129184014.work.593-kees@kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/select.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/fs/select.c b/fs/select.c
index 0ee55af1a55c29..11a3b1312abeff 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -839,7 +839,7 @@ SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)
 
 struct poll_list {
 	struct poll_list *next;
-	int len;
+	unsigned int len;
 	struct pollfd entries[];
 };
 
@@ -975,14 +975,15 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
 		struct timespec64 *end_time)
 {
 	struct poll_wqueues table;
-	int err = -EFAULT, fdcount, len;
+	int err = -EFAULT, fdcount;
 	/* Allocate small arguments on the stack to save memory and be
 	   faster - use long to make sure the buffer is aligned properly
 	   on 64 bit archs to avoid unaligned access */
 	long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
 	struct poll_list *const head = (struct poll_list *)stack_pps;
  	struct poll_list *walk = head;
- 	unsigned long todo = nfds;
+	unsigned int todo = nfds;
+	unsigned int len;
 
 	if (nfds > rlimit(RLIMIT_NOFILE))
 		return -EINVAL;
@@ -998,9 +999,9 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
 					sizeof(struct pollfd) * walk->len))
 			goto out_fds;
 
-		todo -= walk->len;
-		if (!todo)
+		if (walk->len >= todo)
 			break;
+		todo -= walk->len;
 
 		len = min(todo, POLLFD_PER_PAGE);
 		walk = walk->next = kmalloc(struct_size(walk, entries, len),
@@ -1020,7 +1021,7 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
 
 	for (walk = head; walk; walk = walk->next) {
 		struct pollfd *fds = walk->entries;
-		int j;
+		unsigned int j;
 
 		for (j = walk->len; j; fds++, ufds++, j--)
 			unsafe_put_user(fds->revents, &ufds->revents, Efault);

From 21a3b988053a05e0c397ce407fa5fb01550c6e0d Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sun, 14 Jan 2024 16:01:57 +0100
Subject: [PATCH 337/707] mmc: core: Remove usage of the deprecated
 ida_simple_xx() API

ida_alloc() and ida_free() should be preferred to the deprecated
ida_simple_get() and ida_simple_remove().

Note that the upper limit of ida_simple_get() is exclusive, but the one of
ida_alloc_range()/ida_alloc_max() is inclusive. So a -1 has been added when
needed.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Link: https://lore.kernel.org/r/583c57d0ae09f9d3a1e1a7b80c1e39ada17954b7.1705244502.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/block.c | 12 ++++++------
 drivers/mmc/core/host.c  |  5 +++--
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index 32d49100dff516..a9b60b91e32f6d 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -206,7 +206,7 @@ static void mmc_blk_kref_release(struct kref *ref)
 	int devidx;
 
 	devidx = mmc_get_devidx(md->disk);
-	ida_simple_remove(&mmc_blk_ida, devidx);
+	ida_free(&mmc_blk_ida, devidx);
 
 	mutex_lock(&open_lock);
 	md->disk->private_data = NULL;
@@ -2467,7 +2467,7 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card,
 	bool cache_enabled = false;
 	bool fua_enabled = false;
 
-	devidx = ida_simple_get(&mmc_blk_ida, 0, max_devices, GFP_KERNEL);
+	devidx = ida_alloc_max(&mmc_blk_ida, max_devices - 1, GFP_KERNEL);
 	if (devidx < 0) {
 		/*
 		 * We get -ENOSPC because there are no more any available
@@ -2577,7 +2577,7 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card,
  err_kfree:
 	kfree(md);
  out:
-	ida_simple_remove(&mmc_blk_ida, devidx);
+	ida_free(&mmc_blk_ida, devidx);
 	return ERR_PTR(ret);
 }
 
@@ -2703,7 +2703,7 @@ static void mmc_blk_rpmb_device_release(struct device *dev)
 {
 	struct mmc_rpmb_data *rpmb = dev_get_drvdata(dev);
 
-	ida_simple_remove(&mmc_rpmb_ida, rpmb->id);
+	ida_free(&mmc_rpmb_ida, rpmb->id);
 	kfree(rpmb);
 }
 
@@ -2719,13 +2719,13 @@ static int mmc_blk_alloc_rpmb_part(struct mmc_card *card,
 	struct mmc_rpmb_data *rpmb;
 
 	/* This creates the minor number for the RPMB char device */
-	devidx = ida_simple_get(&mmc_rpmb_ida, 0, max_devices, GFP_KERNEL);
+	devidx = ida_alloc_max(&mmc_rpmb_ida, max_devices - 1, GFP_KERNEL);
 	if (devidx < 0)
 		return devidx;
 
 	rpmb = kzalloc(sizeof(*rpmb), GFP_KERNEL);
 	if (!rpmb) {
-		ida_simple_remove(&mmc_rpmb_ida, devidx);
+		ida_free(&mmc_rpmb_ida, devidx);
 		return -ENOMEM;
 	}
 
diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c
index cf396e8f34e986..7cc9a33d28cacb 100644
--- a/drivers/mmc/core/host.c
+++ b/drivers/mmc/core/host.c
@@ -76,7 +76,7 @@ static void mmc_host_classdev_release(struct device *dev)
 	struct mmc_host *host = cls_dev_to_mmc_host(dev);
 	wakeup_source_unregister(host->ws);
 	if (of_alias_get_id(host->parent->of_node, "mmc") < 0)
-		ida_simple_remove(&mmc_host_ida, host->index);
+		ida_free(&mmc_host_ida, host->index);
 	kfree(host);
 }
 
@@ -538,7 +538,8 @@ struct mmc_host *mmc_alloc_host(int extra, struct device *dev)
 		min_idx = mmc_first_nonreserved_index();
 		max_idx = 0;
 
-		index = ida_simple_get(&mmc_host_ida, min_idx, max_idx, GFP_KERNEL);
+		index = ida_alloc_range(&mmc_host_ida, min_idx, max_idx - 1,
+					GFP_KERNEL);
 		if (index < 0) {
 			kfree(host);
 			return NULL;

From 05c7b901a03f5ac745ebd8205b1400ce26606588 Mon Sep 17 00:00:00 2001
From: Peng Fan <peng.fan@nxp.com>
Date: Mon, 22 Jan 2024 17:16:23 +0800
Subject: [PATCH 338/707] dt-bindings: mmc: fsl-imx-esdhc: add i.MX95
 compatible string

Same as i.MX93, add i.MX95 SDHC which is compatible with i.MX8MM USDHC.

Signed-off-by: Peng Fan <peng.fan@nxp.com>
Link: https://lore.kernel.org/r/20240122091623.2078089-1-peng.fan@oss.nxp.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 Documentation/devicetree/bindings/mmc/fsl-imx-esdhc.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Documentation/devicetree/bindings/mmc/fsl-imx-esdhc.yaml b/Documentation/devicetree/bindings/mmc/fsl-imx-esdhc.yaml
index 82eb7a24c85782..f3c5aa64affcb4 100644
--- a/Documentation/devicetree/bindings/mmc/fsl-imx-esdhc.yaml
+++ b/Documentation/devicetree/bindings/mmc/fsl-imx-esdhc.yaml
@@ -55,8 +55,9 @@ properties:
           - enum:
               - fsl,imx8mn-usdhc
               - fsl,imx8mp-usdhc
-              - fsl,imx93-usdhc
               - fsl,imx8ulp-usdhc
+              - fsl,imx93-usdhc
+              - fsl,imx95-usdhc
           - const: fsl,imx8mm-usdhc
       - items:
           - enum:

From 4e99ffb173faaf38f010acb369bff57a20e9e531 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Thu, 25 Jan 2024 09:50:23 +0100
Subject: [PATCH 339/707] mmc: core Drop BLK_BOUNCE_HIGH

The MMC core sets BLK_BOUNCE_HIGH for devices where dma_mask
is unassigned.

For the majority of MMC hosts this path is never taken: the
OF core will unconditionally assign a 32-bit mask to any
OF device, and most MMC hosts are probed from device tree,
see drivers/of/platform.c:

of_platform_device_create_pdata()
        dev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
        if (!dev->dev.dma_mask)
                dev->dev.dma_mask = &dev->dev.coherent_dma_mask;

of_amba_device_create()
        dev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
        dev->dev.dma_mask = &dev->dev.coherent_dma_mask;

MMC devices that are probed from ACPI or PCI will likewise
have a proper dma_mask assigned.

The only remaining devices that could have a blank dma_mask
are platform devices instantiated from board files.

These are mostly used on systems without CONFIG_HIGHMEM
enabled which means the block layer will not bounce, and in
the few cases where it is enabled it is not used anyway:
for example some OMAP2 systems such as Nokia n800/n810 will
create a platform_device and not assign a dma_mask, however
they do not have any highmem, so no bouncing will happen
anyway: the block core checks if max_low_pfn >= max_pfn
and this will always be false.

Should it turn out there is a platform_device with blank
DMA mask actually using CONFIG_HIGHMEM somewhere out there
we should set dma_mask for it, not do this trickery.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20240125-mmc-no-blk-bounce-high-v1-1-d0f92a30e085@linaro.org
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/queue.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c
index a0a2412f62a730..316415588a77ce 100644
--- a/drivers/mmc/core/queue.c
+++ b/drivers/mmc/core/queue.c
@@ -351,8 +351,6 @@ static void mmc_setup_queue(struct mmc_queue *mq, struct mmc_card *card)
 	if (mmc_can_erase(card))
 		mmc_queue_setup_discard(mq->queue, card);
 
-	if (!mmc_dev(host)->dma_mask || !*mmc_dev(host)->dma_mask)
-		blk_queue_bounce_limit(mq->queue, BLK_BOUNCE_HIGH);
 	blk_queue_max_hw_sectors(mq->queue,
 		min(host->max_blk_count, host->max_req_size / 512));
 	if (host->can_dma_map_merge)

From f73c3a862847b49e0643bf554962d980526abfd6 Mon Sep 17 00:00:00 2001
From: Yuezhang Mo <Yuezhang.Mo@sony.com>
Date: Tue, 23 Jan 2024 17:12:47 +0800
Subject: [PATCH 340/707] exfat: fix file not locking when writing zeros in
 exfat_file_mmap()

inode->i_rwsem should be locked when writing file. But the lock
is missing when writing zeros to the file in exfat_file_mmap().

Fixes: 11a347fb6cef ("exfat: change to get file size from DataLength")
Signed-off-by: Yuezhang Mo <Yuezhang.Mo@sony.com>
Signed-off-by: Namjae Jeon <linkinjeon@kernel.org>
---
 fs/exfat/file.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index d25a96a148af4c..473c1641d50d50 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -613,7 +613,11 @@ static int exfat_file_mmap(struct file *file, struct vm_area_struct *vma)
 			start + vma->vm_end - vma->vm_start);
 
 	if ((vma->vm_flags & VM_WRITE) && ei->valid_size < end) {
+		if (!inode_trylock(inode))
+			return -EAGAIN;
+
 		ret = exfat_file_zeroed_range(file, ei->valid_size, end);
+		inode_unlock(inode);
 		if (ret < 0) {
 			exfat_err(inode->i_sb,
 				  "mmap: fail to zero from %llu to %llu(%d)",

From 8b29fa18400ccb7fb681f105d74b2cabb59e5d62 Mon Sep 17 00:00:00 2001
From: Yuezhang Mo <Yuezhang.Mo@sony.com>
Date: Tue, 30 Jan 2024 12:46:21 +0800
Subject: [PATCH 341/707] exfat: ratelimit error msg in exfat_file_mmap()

Ratelimit the error message of zeroing out data between the valid
size and the file size in exfat_file_mmap() to not flood dmesg.

Signed-off-by: Yuezhang Mo <Yuezhang.Mo@sony.com>
Signed-off-by: Namjae Jeon <linkinjeon@kernel.org>
---
 fs/exfat/exfat_fs.h | 5 +++++
 fs/exfat/file.c     | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index 9474cd50da6d4f..46f2760d984644 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -542,6 +542,11 @@ void __exfat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
 /* expand to pr_*() with prefix */
 #define exfat_err(sb, fmt, ...)						\
 	pr_err("exFAT-fs (%s): " fmt "\n", (sb)->s_id, ##__VA_ARGS__)
+#define exfat_err_ratelimit(sb, fmt, args...) \
+	do { \
+		if (__ratelimit(&EXFAT_SB(sb)->ratelimit)) \
+			exfat_err(sb, fmt, ## args); \
+	} while (0)
 #define exfat_warn(sb, fmt, ...)					\
 	pr_warn("exFAT-fs (%s): " fmt "\n", (sb)->s_id, ##__VA_ARGS__)
 #define exfat_info(sb, fmt, ...)					\
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index 473c1641d50d50..68405ae06772d4 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -619,7 +619,7 @@ static int exfat_file_mmap(struct file *file, struct vm_area_struct *vma)
 		ret = exfat_file_zeroed_range(file, ei->valid_size, end);
 		inode_unlock(inode);
 		if (ret < 0) {
-			exfat_err(inode->i_sb,
+			exfat_err_ratelimit(inode->i_sb,
 				  "mmap: fail to zero from %llu to %llu(%d)",
 				  start, end, ret);
 			return ret;

From 90a7463fae9eb9f68a6e4ff3e8868beb8fbfc649 Mon Sep 17 00:00:00 2001
From: Duy Nguyen <duy.nguyen.rh@renesas.com>
Date: Thu, 25 Jan 2024 16:34:37 +0100
Subject: [PATCH 342/707] pmdomain: renesas: r8a779h0-sysc: Add r8a779h0
 support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support for R-Car V4M (R8A779H0) SoC power areas to the R-Car SYSC
driver.

Signed-off-by: Duy Nguyen <duy.nguyen.rh@renesas.com>
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Link: https://lore.kernel.org/r/eed6faa02c628d32676ab8ea0eee636b4ffd6c47.1706194617.git.geert+renesas@glider.be
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/pmdomain/renesas/Kconfig          |  4 ++
 drivers/pmdomain/renesas/Makefile         |  1 +
 drivers/pmdomain/renesas/r8a779h0-sysc.c  | 54 +++++++++++++++++++++++
 drivers/pmdomain/renesas/rcar-gen4-sysc.c |  3 ++
 drivers/pmdomain/renesas/rcar-gen4-sysc.h |  1 +
 5 files changed, 63 insertions(+)
 create mode 100644 drivers/pmdomain/renesas/r8a779h0-sysc.c

diff --git a/drivers/pmdomain/renesas/Kconfig b/drivers/pmdomain/renesas/Kconfig
index 80bf2cf8b60e6f..54acb4b1ec7c48 100644
--- a/drivers/pmdomain/renesas/Kconfig
+++ b/drivers/pmdomain/renesas/Kconfig
@@ -71,6 +71,10 @@ config SYSC_R8A779G0
 	bool "System Controller support for R-Car V4H" if COMPILE_TEST
 	select SYSC_RCAR_GEN4
 
+config SYSC_R8A779H0
+	bool "System Controller support for R-Car V4M" if COMPILE_TEST
+	select SYSC_RCAR_GEN4
+
 config SYSC_RMOBILE
 	bool "System Controller support for R-Mobile" if COMPILE_TEST
 
diff --git a/drivers/pmdomain/renesas/Makefile b/drivers/pmdomain/renesas/Makefile
index e306e396fc8c10..89180f19c23be7 100644
--- a/drivers/pmdomain/renesas/Makefile
+++ b/drivers/pmdomain/renesas/Makefile
@@ -24,6 +24,7 @@ obj-$(CONFIG_SYSC_R8A77995)	+= r8a77995-sysc.o
 obj-$(CONFIG_SYSC_R8A779A0)	+= r8a779a0-sysc.o
 obj-$(CONFIG_SYSC_R8A779F0)	+= r8a779f0-sysc.o
 obj-$(CONFIG_SYSC_R8A779G0)	+= r8a779g0-sysc.o
+obj-$(CONFIG_SYSC_R8A779H0)     += r8a779h0-sysc.o
 # Family
 obj-$(CONFIG_SYSC_RCAR)		+= rcar-sysc.o
 obj-$(CONFIG_SYSC_RCAR_GEN4)	+= rcar-gen4-sysc.o
diff --git a/drivers/pmdomain/renesas/r8a779h0-sysc.c b/drivers/pmdomain/renesas/r8a779h0-sysc.c
new file mode 100644
index 00000000000000..e13372cb80eea8
--- /dev/null
+++ b/drivers/pmdomain/renesas/r8a779h0-sysc.c
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Renesas R-Car V4M System Controller
+ *
+ * Copyright (C) 2023 Renesas Electronics Corp
+ */
+
+#include <linux/kernel.h>
+
+#include <dt-bindings/power/renesas,r8a779h0-sysc.h>
+
+#include "rcar-gen4-sysc.h"
+
+static struct rcar_gen4_sysc_area r8a779h0_areas[] __initdata = {
+	{ "always-on",	R8A779H0_PD_ALWAYS_ON, -1, PD_ALWAYS_ON },
+	{ "c4",		R8A779H0_PD_C4, R8A779H0_PD_ALWAYS_ON },
+	{ "a2e0d0",	R8A779H0_PD_A2E0D0, R8A779H0_PD_C4, PD_SCU },
+	{ "a1e0d0c0",	R8A779H0_PD_A1E0D0C0, R8A779H0_PD_A2E0D0, PD_CPU_NOCR },
+	{ "a1e0d0c1",	R8A779H0_PD_A1E0D0C1, R8A779H0_PD_A2E0D0, PD_CPU_NOCR },
+	{ "a1e0d0c2",	R8A779H0_PD_A1E0D0C2, R8A779H0_PD_A2E0D0, PD_CPU_NOCR },
+	{ "a1e0d0c3",	R8A779H0_PD_A1E0D0C3, R8A779H0_PD_A2E0D0, PD_CPU_NOCR },
+	{ "a3cr0",	R8A779H0_PD_A3CR0, R8A779H0_PD_ALWAYS_ON, PD_CPU_NOCR },
+	{ "a3cr1",	R8A779H0_PD_A3CR1, R8A779H0_PD_ALWAYS_ON, PD_CPU_NOCR },
+	{ "a3cr2",	R8A779H0_PD_A3CR2, R8A779H0_PD_ALWAYS_ON, PD_CPU_NOCR },
+	{ "a33dga",	R8A779H0_PD_A33DGA, R8A779H0_PD_C4 },
+	{ "a23dgb",	R8A779H0_PD_A23DGB, R8A779H0_PD_A33DGA },
+	{ "a3vip0",	R8A779H0_PD_A3VIP0, R8A779H0_PD_C4 },
+	{ "a3vip2",	R8A779H0_PD_A3VIP2, R8A779H0_PD_C4 },
+	{ "a3dul",	R8A779H0_PD_A3DUL, R8A779H0_PD_C4 },
+	{ "a3isp0",	R8A779H0_PD_A3ISP0, R8A779H0_PD_C4 },
+	{ "a2cn0",	R8A779H0_PD_A2CN0, R8A779H0_PD_C4 },
+	{ "a1cn0",	R8A779H0_PD_A1CN0, R8A779H0_PD_A2CN0 },
+	{ "a1dsp0",	R8A779H0_PD_A1DSP0, R8A779H0_PD_A2CN0 },
+	{ "a1dsp1",	R8A779H0_PD_A1DSP1, R8A779H0_PD_A2CN0 },
+	{ "a2imp01",	R8A779H0_PD_A2IMP01, R8A779H0_PD_C4 },
+	{ "a2psc",	R8A779H0_PD_A2PSC, R8A779H0_PD_C4 },
+	{ "a2dma",	R8A779H0_PD_A2DMA, R8A779H0_PD_C4 },
+	{ "a2cv0",	R8A779H0_PD_A2CV0, R8A779H0_PD_C4 },
+	{ "a2cv1",	R8A779H0_PD_A2CV1, R8A779H0_PD_C4 },
+	{ "a2cv2",	R8A779H0_PD_A2CV2, R8A779H0_PD_C4 },
+	{ "a2cv3",	R8A779H0_PD_A2CV3, R8A779H0_PD_C4 },
+	{ "a3imr0",	R8A779H0_PD_A3IMR0, R8A779H0_PD_C4 },
+	{ "a3imr1",	R8A779H0_PD_A3IMR1, R8A779H0_PD_C4 },
+	{ "a3imr2",	R8A779H0_PD_A3IMR2, R8A779H0_PD_C4 },
+	{ "a3imr3",	R8A779H0_PD_A3IMR3, R8A779H0_PD_C4 },
+	{ "a3vc",	R8A779H0_PD_A3VC, R8A779H0_PD_C4 },
+	{ "a3pci",	R8A779H0_PD_A3PCI, R8A779H0_PD_C4 },
+	{ "a2pciphy",	R8A779H0_PD_A2PCIPHY, R8A779H0_PD_A3PCI },
+};
+
+const struct rcar_gen4_sysc_info r8a779h0_sysc_info __initconst = {
+	.areas = r8a779h0_areas,
+	.num_areas = ARRAY_SIZE(r8a779h0_areas),
+};
diff --git a/drivers/pmdomain/renesas/rcar-gen4-sysc.c b/drivers/pmdomain/renesas/rcar-gen4-sysc.c
index 9e5e6e077abc08..728248659a97e8 100644
--- a/drivers/pmdomain/renesas/rcar-gen4-sysc.c
+++ b/drivers/pmdomain/renesas/rcar-gen4-sysc.c
@@ -284,6 +284,9 @@ static const struct of_device_id rcar_gen4_sysc_matches[] __initconst = {
 #endif
 #ifdef CONFIG_SYSC_R8A779G0
 	{ .compatible = "renesas,r8a779g0-sysc", .data = &r8a779g0_sysc_info },
+#endif
+#ifdef CONFIG_SYSC_R8A779H0
+	{ .compatible = "renesas,r8a779h0-sysc", .data = &r8a779h0_sysc_info },
 #endif
 	{ /* sentinel */ }
 };
diff --git a/drivers/pmdomain/renesas/rcar-gen4-sysc.h b/drivers/pmdomain/renesas/rcar-gen4-sysc.h
index 388cfa8f8f9fd6..fdf843aa51134f 100644
--- a/drivers/pmdomain/renesas/rcar-gen4-sysc.h
+++ b/drivers/pmdomain/renesas/rcar-gen4-sysc.h
@@ -40,5 +40,6 @@ struct rcar_gen4_sysc_info {
 extern const struct rcar_gen4_sysc_info r8a779a0_sysc_info;
 extern const struct rcar_gen4_sysc_info r8a779f0_sysc_info;
 extern const struct rcar_gen4_sysc_info r8a779g0_sysc_info;
+extern const struct rcar_gen4_sysc_info r8a779h0_sysc_info;
 
 #endif /* __SOC_RENESAS_RCAR_GEN4_SYSC_H__ */

From 5a774e241fdd2e90fd3562a6a9aaa822c848ec71 Mon Sep 17 00:00:00 2001
From: Patrick Rudolph <patrick.rudolph@9elements.com>
Date: Tue, 30 Jan 2024 20:59:03 +0530
Subject: [PATCH 343/707] hwmon: (pmbus_core) Allow to hook PMBUS_SMBALERT_MASK

Use _pmbus_write_word_data to allow intercepting writes to
PMBUS_SMBALERT_MASK in the custom chip specific code.

This is required for MP2971/MP2973 which doesn't follow the
PMBUS specification for PMBUS_SMBALERT_MASK.

Signed-off-by: Patrick Rudolph <patrick.rudolph@9elements.com>
Signed-off-by: Naresh Solanki <naresh.solanki@9elements.com>
Link: https://lore.kernel.org/r/20240130152903.3651341-1-naresh.solanki@9elements.com
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 drivers/hwmon/pmbus/pmbus_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/hwmon/pmbus/pmbus_core.c b/drivers/hwmon/pmbus/pmbus_core.c
index 1363d9f89181d2..cb4c65a7f288c1 100644
--- a/drivers/hwmon/pmbus/pmbus_core.c
+++ b/drivers/hwmon/pmbus/pmbus_core.c
@@ -3188,7 +3188,7 @@ static int pmbus_regulator_notify(struct pmbus_data *data, int page, int event)
 
 static int pmbus_write_smbalert_mask(struct i2c_client *client, u8 page, u8 reg, u8 val)
 {
-	return pmbus_write_word_data(client, page, PMBUS_SMBALERT_MASK, reg | (val << 8));
+	return _pmbus_write_word_data(client, page, PMBUS_SMBALERT_MASK, reg | (val << 8));
 }
 
 static irqreturn_t pmbus_fault_handler(int irq, void *pdata)

From 567f7205dd7a0e168314b480e4bd80c77cbe71cb Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin (Intel)" <hpa@zytor.com>
Date: Tue, 5 Dec 2023 02:49:58 -0800
Subject: [PATCH 344/707] x86/opcode: Add ERET[US] instructions to the x86
 opcode map

ERETU returns from an event handler while making a transition to ring 3,
and ERETS returns from an event handler while staying in ring 0.

Add instruction opcodes used by ERET[US] to the x86 opcode map; opcode
numbers are per FRED spec v5.0.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Shan Kang <shan.kang@intel.com>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Link: https://lore.kernel.org/r/20231205105030.8698-10-xin3.li@intel.com
---
 arch/x86/lib/x86-opcode-map.txt       | 2 +-
 tools/arch/x86/lib/x86-opcode-map.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index 1efe1d9bf5ce4b..12af572201a29f 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -1052,7 +1052,7 @@ EndTable
 
 GrpTable: Grp7
 0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B) | WRMSRNS (110),(11B)
-1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B)
+1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B) | ERETU (F3),(010),(11B) | ERETS (F2),(010),(11B)
 2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) | ENCLU (111),(11B)
 3: LIDT Ms
 4: SMSW Mw/Rv
diff --git a/tools/arch/x86/lib/x86-opcode-map.txt b/tools/arch/x86/lib/x86-opcode-map.txt
index 1efe1d9bf5ce4b..12af572201a29f 100644
--- a/tools/arch/x86/lib/x86-opcode-map.txt
+++ b/tools/arch/x86/lib/x86-opcode-map.txt
@@ -1052,7 +1052,7 @@ EndTable
 
 GrpTable: Grp7
 0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B) | WRMSRNS (110),(11B)
-1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B)
+1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B) | ERETU (F3),(010),(11B) | ERETS (F2),(010),(11B)
 2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) | ENCLU (111),(11B)
 3: LIDT Ms
 4: SMSW Mw/Rv

From 379ae086a73c804df39866d28eb4ce693e7af486 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin (Intel)" <hpa@zytor.com>
Date: Tue, 5 Dec 2023 02:49:59 -0800
Subject: [PATCH 345/707] x86/objtool: Teach objtool about ERET[US]

Update the objtool decoder to know about the ERET[US] instructions
(type INSN_CONTEXT_SWITCH).

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Shan Kang <shan.kang@intel.com>
Link: https://lore.kernel.org/r/20231205105030.8698-11-xin3.li@intel.com
---
 tools/objtool/arch/x86/decode.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c
index e327cd82713522..3a1d80a7878d33 100644
--- a/tools/objtool/arch/x86/decode.c
+++ b/tools/objtool/arch/x86/decode.c
@@ -509,11 +509,20 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec
 
 		if (op2 == 0x01) {
 
-			if (modrm == 0xca)
-				insn->type = INSN_CLAC;
-			else if (modrm == 0xcb)
-				insn->type = INSN_STAC;
-
+			switch (insn_last_prefix_id(&ins)) {
+			case INAT_PFX_REPE:
+			case INAT_PFX_REPNE:
+				if (modrm == 0xca)
+					/* eretu/erets */
+					insn->type = INSN_CONTEXT_SWITCH;
+				break;
+			default:
+				if (modrm == 0xca)
+					insn->type = INSN_CLAC;
+				else if (modrm == 0xcb)
+					insn->type = INSN_STAC;
+				break;
+			}
 		} else if (op2 >= 0x80 && op2 <= 0x8f) {
 
 			insn->type = INSN_JUMP_CONDITIONAL;

From 95d34efac1a0aff6da00ad177168d99c60a3b7cd Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin (Intel)" <hpa@zytor.com>
Date: Tue, 5 Dec 2023 02:50:00 -0800
Subject: [PATCH 346/707] x86/cpu: Add X86_CR4_FRED macro

Add X86_CR4_FRED macro for the FRED bit in %cr4. This bit must not be
changed after initialization, so add it to the pinned CR4 bits.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Shan Kang <shan.kang@intel.com>
Link: https://lore.kernel.org/r/20231205105030.8698-12-xin3.li@intel.com
---
 arch/x86/include/uapi/asm/processor-flags.h | 7 +++++++
 arch/x86/kernel/cpu/common.c                | 5 ++---
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index d898432947ff35..f1a4adc782720b 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -139,6 +139,13 @@
 #define X86_CR4_LAM_SUP_BIT	28 /* LAM for supervisor pointers */
 #define X86_CR4_LAM_SUP		_BITUL(X86_CR4_LAM_SUP_BIT)
 
+#ifdef __x86_64__
+#define X86_CR4_FRED_BIT	32 /* enable FRED kernel entry */
+#define X86_CR4_FRED		_BITUL(X86_CR4_FRED_BIT)
+#else
+#define X86_CR4_FRED		(0)
+#endif
+
 /*
  * x86-64 Task Priority Register, CR8
  */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 0b97bcde70c610..c3a175770df032 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -382,9 +382,8 @@ static __always_inline void setup_umip(struct cpuinfo_x86 *c)
 }
 
 /* These bits should not change their value after CPU init is finished. */
-static const unsigned long cr4_pinned_mask =
-	X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP |
-	X86_CR4_FSGSBASE | X86_CR4_CET;
+static const unsigned long cr4_pinned_mask = X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP |
+					     X86_CR4_FSGSBASE | X86_CR4_CET | X86_CR4_FRED;
 static DEFINE_STATIC_KEY_FALSE_RO(cr_pinning);
 static unsigned long cr4_pinned_bits __ro_after_init;
 

From 0b2e6c1c724fc7b72c86a72d49b16c7617d6f5f9 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin (Intel)" <hpa@zytor.com>
Date: Tue, 5 Dec 2023 02:50:01 -0800
Subject: [PATCH 347/707] x86/cpu: Add MSR numbers for FRED configuration

Add MSR numbers for the FRED configuration registers per FRED spec 5.0.

Originally-by: Megha Dey <megha.dey@intel.com>
Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Shan Kang <shan.kang@intel.com>
Link: https://lore.kernel.org/r/20231205105030.8698-13-xin3.li@intel.com
---
 arch/x86/include/asm/msr-index.h       | 13 ++++++++++++-
 tools/arch/x86/include/asm/msr-index.h | 13 ++++++++++++-
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index f1bd7b91b3c637..1f9dc9bd13eb7e 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -36,8 +36,19 @@
 #define EFER_FFXSR		(1<<_EFER_FFXSR)
 #define EFER_AUTOIBRS		(1<<_EFER_AUTOIBRS)
 
-/* Intel MSRs. Some also available on other CPUs */
+/* FRED MSRs */
+#define MSR_IA32_FRED_RSP0	0x1cc			/* Level 0 stack pointer */
+#define MSR_IA32_FRED_RSP1	0x1cd			/* Level 1 stack pointer */
+#define MSR_IA32_FRED_RSP2	0x1ce			/* Level 2 stack pointer */
+#define MSR_IA32_FRED_RSP3	0x1cf			/* Level 3 stack pointer */
+#define MSR_IA32_FRED_STKLVLS	0x1d0			/* Exception stack levels */
+#define MSR_IA32_FRED_SSP0	MSR_IA32_PL0_SSP	/* Level 0 shadow stack pointer */
+#define MSR_IA32_FRED_SSP1	0x1d1			/* Level 1 shadow stack pointer */
+#define MSR_IA32_FRED_SSP2	0x1d2			/* Level 2 shadow stack pointer */
+#define MSR_IA32_FRED_SSP3	0x1d3			/* Level 3 shadow stack pointer */
+#define MSR_IA32_FRED_CONFIG	0x1d4			/* Entrypoint and interrupt stack level */
 
+/* Intel MSRs. Some also available on other CPUs */
 #define MSR_TEST_CTRL				0x00000033
 #define MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT	29
 #define MSR_TEST_CTRL_SPLIT_LOCK_DETECT		BIT(MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT)
diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h
index 1d51e1850ed03d..74f2c63ce71728 100644
--- a/tools/arch/x86/include/asm/msr-index.h
+++ b/tools/arch/x86/include/asm/msr-index.h
@@ -36,8 +36,19 @@
 #define EFER_FFXSR		(1<<_EFER_FFXSR)
 #define EFER_AUTOIBRS		(1<<_EFER_AUTOIBRS)
 
-/* Intel MSRs. Some also available on other CPUs */
+/* FRED MSRs */
+#define MSR_IA32_FRED_RSP0	0x1cc			/* Level 0 stack pointer */
+#define MSR_IA32_FRED_RSP1	0x1cd			/* Level 1 stack pointer */
+#define MSR_IA32_FRED_RSP2	0x1ce			/* Level 2 stack pointer */
+#define MSR_IA32_FRED_RSP3	0x1cf			/* Level 3 stack pointer */
+#define MSR_IA32_FRED_STKLVLS	0x1d0			/* Exception stack levels */
+#define MSR_IA32_FRED_SSP0	MSR_IA32_PL0_SSP	/* Level 0 shadow stack pointer */
+#define MSR_IA32_FRED_SSP1	0x1d1			/* Level 1 shadow stack pointer */
+#define MSR_IA32_FRED_SSP2	0x1d2			/* Level 2 shadow stack pointer */
+#define MSR_IA32_FRED_SSP3	0x1d3			/* Level 3 shadow stack pointer */
+#define MSR_IA32_FRED_CONFIG	0x1d4			/* Entrypoint and interrupt stack level */
 
+/* Intel MSRs. Some also available on other CPUs */
 #define MSR_TEST_CTRL				0x00000033
 #define MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT	29
 #define MSR_TEST_CTRL_SPLIT_LOCK_DETECT		BIT(MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT)

From ed262541af195f452c43cd4f28310a09065039ec Mon Sep 17 00:00:00 2001
From: Xin Li <xin3.li@intel.com>
Date: Tue, 5 Dec 2023 02:50:02 -0800
Subject: [PATCH 348/707] x86/ptrace: Cleanup the definition of the pt_regs
 structure

struct pt_regs is hard to read because the member or section related
comments are not aligned with the members.

The 'cs' and 'ss' members of pt_regs are type of 'unsigned long' while
in reality they are only 16-bit wide. This works so far as the
remaining space is unused, but FRED will use the remaining bits for
other purposes.

To prepare for FRED:

  - Cleanup the formatting
  - Convert 'cs' and 'ss' to u16 and embed them into an union
    with a u64
  - Fixup the related printk() format strings

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Originally-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Shan Kang <shan.kang@intel.com>
Link: https://lore.kernel.org/r/20231205105030.8698-14-xin3.li@intel.com
---
 arch/x86/entry/vsyscall/vsyscall_64.c |  2 +-
 arch/x86/include/asm/ptrace.h         | 48 +++++++++++++++++++--------
 arch/x86/kernel/process_64.c          |  2 +-
 3 files changed, 37 insertions(+), 15 deletions(-)

diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index e0ca8120aea876..a3c0df11d0e6d8 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -76,7 +76,7 @@ static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
 	if (!show_unhandled_signals)
 		return;
 
-	printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
+	printk_ratelimited("%s%s[%d] %s ip:%lx cs:%x sp:%lx ax:%lx si:%lx di:%lx\n",
 			   level, current->comm, task_pid_nr(current),
 			   message, regs->ip, regs->cs,
 			   regs->sp, regs->ax, regs->si, regs->di);
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index f4db78b09c8f0b..b268cd2a2d01c4 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -57,17 +57,19 @@ struct pt_regs {
 #else /* __i386__ */
 
 struct pt_regs {
-/*
- * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
- * unless syscall needs a complete, fully filled "struct pt_regs".
- */
+	/*
+	 * C ABI says these regs are callee-preserved. They aren't saved on
+	 * kernel entry unless syscall needs a complete, fully filled
+	 * "struct pt_regs".
+	 */
 	unsigned long r15;
 	unsigned long r14;
 	unsigned long r13;
 	unsigned long r12;
 	unsigned long bp;
 	unsigned long bx;
-/* These regs are callee-clobbered. Always saved on kernel entry. */
+
+	/* These regs are callee-clobbered. Always saved on kernel entry. */
 	unsigned long r11;
 	unsigned long r10;
 	unsigned long r9;
@@ -77,18 +79,38 @@ struct pt_regs {
 	unsigned long dx;
 	unsigned long si;
 	unsigned long di;
-/*
- * On syscall entry, this is syscall#. On CPU exception, this is error code.
- * On hw interrupt, it's IRQ number:
- */
+
+	/*
+	 * orig_ax is used on entry for:
+	 * - the syscall number (syscall, sysenter, int80)
+	 * - error_code stored by the CPU on traps and exceptions
+	 * - the interrupt number for device interrupts
+	 */
 	unsigned long orig_ax;
-/* Return frame for iretq */
+
+	/* The IRETQ return frame starts here */
 	unsigned long ip;
-	unsigned long cs;
+
+	union {
+		/* The full 64-bit data slot containing CS */
+		u64		csx;
+		/* CS selector */
+		u16		cs;
+	};
+
 	unsigned long flags;
 	unsigned long sp;
-	unsigned long ss;
-/* top of stack page */
+
+	union {
+		/* The full 64-bit data slot containing SS */
+		u64		ssx;
+		/* SS selector */
+		u16		ss;
+	};
+
+	/*
+	 * Top of stack on IDT systems.
+	 */
 };
 
 #endif /* !__i386__ */
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 33b268747bb7bb..0f78b58021bb2a 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -117,7 +117,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
 
 	printk("%sFS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
 	       log_lvl, fs, fsindex, gs, gsindex, shadowgs);
-	printk("%sCS:  %04lx DS: %04x ES: %04x CR0: %016lx\n",
+	printk("%sCS:  %04x DS: %04x ES: %04x CR0: %016lx\n",
 		log_lvl, regs->cs, ds, es, cr0);
 	printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n",
 		log_lvl, cr2, cr3, cr4);

From c125443456e97f7bcc87cc7ba1346c2b92c4db94 Mon Sep 17 00:00:00 2001
From: Xin Li <xin3.li@intel.com>
Date: Tue, 5 Dec 2023 02:50:03 -0800
Subject: [PATCH 349/707] x86/ptrace: Add FRED additional information to the
 pt_regs structure

FRED defines additional information in the upper 48 bits of cs/ss
fields. Therefore add the information definitions into the pt_regs
structure.

Specifically introduce a new structure fred_ss to denote the FRED flags
above SS selector, which avoids FRED_SSX_ macros and makes the code
simpler and easier to read.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Originally-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Shan Kang <shan.kang@intel.com>
Link: https://lore.kernel.org/r/20231205105030.8698-15-xin3.li@intel.com
---
 arch/x86/include/asm/ptrace.h | 66 ++++++++++++++++++++++++++++++++---
 1 file changed, 61 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index b268cd2a2d01c4..5a83fbd9bc0b44 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -56,6 +56,50 @@ struct pt_regs {
 
 #else /* __i386__ */
 
+struct fred_cs {
+		/* CS selector */
+	u64	cs	: 16,
+		/* Stack level at event time */
+		sl	:  2,
+		/* IBT in WAIT_FOR_ENDBRANCH state */
+		wfe	:  1,
+			: 45;
+};
+
+struct fred_ss {
+		/* SS selector */
+	u64	ss	: 16,
+		/* STI state */
+		sti	:  1,
+		/* Set if syscall, sysenter or INT n */
+		swevent	:  1,
+		/* Event is NMI type */
+		nmi	:  1,
+			: 13,
+		/* Event vector */
+		vector	:  8,
+			:  8,
+		/* Event type */
+		type	:  4,
+			:  4,
+		/* Event was incident to enclave execution */
+		enclave	:  1,
+		/* CPU was in long mode */
+		lm	:  1,
+		/*
+		 * Nested exception during FRED delivery, not set
+		 * for #DF.
+		 */
+		nested	:  1,
+			:  1,
+		/*
+		 * The length of the instruction causing the event.
+		 * Only set for INTO, INT1, INT3, INT n, SYSCALL
+		 * and SYSENTER.  0 otherwise.
+		 */
+		insnlen	:  4;
+};
+
 struct pt_regs {
 	/*
 	 * C ABI says these regs are callee-preserved. They aren't saved on
@@ -85,6 +129,12 @@ struct pt_regs {
 	 * - the syscall number (syscall, sysenter, int80)
 	 * - error_code stored by the CPU on traps and exceptions
 	 * - the interrupt number for device interrupts
+	 *
+	 * A FRED stack frame starts here:
+	 *   1) It _always_ includes an error code;
+	 *
+	 *   2) The return frame for ERET[US] starts here, but
+	 *      the content of orig_ax is ignored.
 	 */
 	unsigned long orig_ax;
 
@@ -92,24 +142,30 @@ struct pt_regs {
 	unsigned long ip;
 
 	union {
-		/* The full 64-bit data slot containing CS */
-		u64		csx;
 		/* CS selector */
 		u16		cs;
+		/* The extended 64-bit data slot containing CS */
+		u64		csx;
+		/* The FRED CS extension */
+		struct fred_cs	fred_cs;
 	};
 
 	unsigned long flags;
 	unsigned long sp;
 
 	union {
-		/* The full 64-bit data slot containing SS */
-		u64		ssx;
 		/* SS selector */
 		u16		ss;
+		/* The extended 64-bit data slot containing SS */
+		u64		ssx;
+		/* The FRED SS extension */
+		struct fred_ss	fred_ss;
 	};
 
 	/*
-	 * Top of stack on IDT systems.
+	 * Top of stack on IDT systems, while FRED systems have extra fields
+	 * defined above for storing exception related information, e.g. CR2 or
+	 * DR6.
 	 */
 };
 

From c413db75cb7db9740330f3375d1854994bd0c8cb Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin (Intel)" <hpa@zytor.com>
Date: Tue, 5 Dec 2023 02:50:04 -0800
Subject: [PATCH 350/707] x86/fred: Add a new header file for FRED definitions

Add a header file for FRED prototypes and definitions.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Shan Kang <shan.kang@intel.com>
Link: https://lore.kernel.org/r/20231205105030.8698-16-xin3.li@intel.com
---
 arch/x86/include/asm/fred.h | 68 +++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)
 create mode 100644 arch/x86/include/asm/fred.h

diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h
new file mode 100644
index 00000000000000..f514fdb5a39f73
--- /dev/null
+++ b/arch/x86/include/asm/fred.h
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Macros for Flexible Return and Event Delivery (FRED)
+ */
+
+#ifndef ASM_X86_FRED_H
+#define ASM_X86_FRED_H
+
+#include <linux/const.h>
+
+#include <asm/asm.h>
+
+/*
+ * FRED event return instruction opcodes for ERET{S,U}; supported in
+ * binutils >= 2.41.
+ */
+#define ERETS			_ASM_BYTES(0xf2,0x0f,0x01,0xca)
+#define ERETU			_ASM_BYTES(0xf3,0x0f,0x01,0xca)
+
+/*
+ * RSP is aligned to a 64-byte boundary before used to push a new stack frame
+ */
+#define FRED_STACK_FRAME_RSP_MASK	_AT(unsigned long, (~0x3f))
+
+/*
+ * Used for the return address for call emulation during code patching,
+ * and measured in 64-byte cache lines.
+ */
+#define FRED_CONFIG_REDZONE_AMOUNT	1
+#define FRED_CONFIG_REDZONE		(_AT(unsigned long, FRED_CONFIG_REDZONE_AMOUNT) << 6)
+#define FRED_CONFIG_INT_STKLVL(l)	(_AT(unsigned long, l) << 9)
+#define FRED_CONFIG_ENTRYPOINT(p)	_AT(unsigned long, (p))
+
+#ifndef __ASSEMBLY__
+
+#ifdef CONFIG_X86_FRED
+#include <linux/kernel.h>
+
+#include <asm/ptrace.h>
+
+struct fred_info {
+	/* Event data: CR2, DR6, ... */
+	unsigned long edata;
+	unsigned long resv;
+};
+
+/* Full format of the FRED stack frame */
+struct fred_frame {
+	struct pt_regs   regs;
+	struct fred_info info;
+};
+
+static __always_inline struct fred_info *fred_info(struct pt_regs *regs)
+{
+	return &container_of(regs, struct fred_frame, regs)->info;
+}
+
+static __always_inline unsigned long fred_event_data(struct pt_regs *regs)
+{
+	return fred_info(regs)->edata;
+}
+
+#else /* CONFIG_X86_FRED */
+static __always_inline unsigned long fred_event_data(struct pt_regs *regs) { return 0; }
+#endif /* CONFIG_X86_FRED */
+#endif /* !__ASSEMBLY__ */
+
+#endif /* ASM_X86_FRED_H */

From fcd06abf6de2b81724a1e39c121d288f66b1d392 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin (Intel)" <hpa@zytor.com>
Date: Tue, 5 Dec 2023 02:50:05 -0800
Subject: [PATCH 351/707] x86/fred: Reserve space for the FRED stack frame

When using FRED, reserve space at the top of the stack frame, just
like i386 does.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Shan Kang <shan.kang@intel.com>
Link: https://lore.kernel.org/r/20231205105030.8698-17-xin3.li@intel.com
---
 arch/x86/include/asm/thread_info.h | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index d63b02940747fa..12da7dfd5ef13b 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -31,7 +31,9 @@
  * In vm86 mode, the hardware frame is much longer still, so add 16
  * bytes to make room for the real-mode segments.
  *
- * x86_64 has a fixed-length stack frame.
+ * x86-64 has a fixed-length stack frame, but it depends on whether
+ * or not FRED is enabled. Future versions of FRED might make this
+ * dynamic, but for now it is always 2 words longer.
  */
 #ifdef CONFIG_X86_32
 # ifdef CONFIG_VM86
@@ -39,8 +41,12 @@
 # else
 #  define TOP_OF_KERNEL_STACK_PADDING 8
 # endif
-#else
-# define TOP_OF_KERNEL_STACK_PADDING 0
+#else /* x86-64 */
+# ifdef CONFIG_X86_FRED
+#  define TOP_OF_KERNEL_STACK_PADDING (2 * 8)
+# else
+#  define TOP_OF_KERNEL_STACK_PADDING 0
+# endif
 #endif
 
 /*

From f393835cbab6184f3ee6ed90499a88e9930a8512 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin (Intel)" <hpa@zytor.com>
Date: Tue, 5 Dec 2023 02:50:06 -0800
Subject: [PATCH 352/707] x86/fred: Update MSR_IA32_FRED_RSP0 during task
 switch

MSR_IA32_FRED_RSP0 is used during ring 3 event delivery, and needs to
be updated to point to the top of next task stack during task switch.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Shan Kang <shan.kang@intel.com>
Link: https://lore.kernel.org/r/20231205105030.8698-18-xin3.li@intel.com
---
 arch/x86/include/asm/switch_to.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index f42dbf17f52b0e..c3bd0c0758c9a4 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -70,9 +70,13 @@ static inline void update_task_stack(struct task_struct *task)
 #ifdef CONFIG_X86_32
 	this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
 #else
-	/* Xen PV enters the kernel on the thread stack. */
-	if (cpu_feature_enabled(X86_FEATURE_XENPV))
+	if (cpu_feature_enabled(X86_FEATURE_FRED)) {
+		/* WRMSRNS is a baseline feature for FRED. */
+		wrmsrns(MSR_IA32_FRED_RSP0, (unsigned long)task_stack_page(task) + THREAD_SIZE);
+	} else if (cpu_feature_enabled(X86_FEATURE_XENPV)) {
+		/* Xen PV enters the kernel on the thread stack. */
 		load_sp0(task_top_of_stack(task));
+	}
 #endif
 }
 

From 5710910a6c94bcb08d1081ca94119220066331ad Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin (Intel)" <hpa@zytor.com>
Date: Tue, 5 Dec 2023 02:50:07 -0800
Subject: [PATCH 353/707] x86/fred: Disallow the swapgs instruction when FRED
 is enabled

SWAPGS is no longer needed thus NOT allowed with FRED because FRED
transitions ensure that an operating system can _always_ operate
with its own GS base address:

  - For events that occur in ring 3, FRED event delivery swaps the GS
    base address with the IA32_KERNEL_GS_BASE MSR.

  - ERETU (the FRED transition that returns to ring 3) also swaps the
    GS base address with the IA32_KERNEL_GS_BASE MSR.

And the operating system can still setup the GS segment for a user
thread without the need of loading a user thread GS with:

  - Using LKGS, available with FRED, to modify other attributes of the
    GS segment without compromising its ability always to operate with
    its own GS base address.

  - Accessing the GS segment base address for a user thread as before
    using RDMSR or WRMSR on the IA32_KERNEL_GS_BASE MSR.

Note, LKGS loads the GS base address into the IA32_KERNEL_GS_BASE MSR
instead of the GS segment's descriptor cache. As such, the operating
system never changes its runtime GS base address.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Shan Kang <shan.kang@intel.com>
Link: https://lore.kernel.org/r/20231205105030.8698-19-xin3.li@intel.com
---
 arch/x86/kernel/process_64.c | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 0f78b58021bb2a..4f87f5987ae8c1 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -166,7 +166,29 @@ static noinstr unsigned long __rdgsbase_inactive(void)
 
 	lockdep_assert_irqs_disabled();
 
-	if (!cpu_feature_enabled(X86_FEATURE_XENPV)) {
+	/*
+	 * SWAPGS is no longer needed thus NOT allowed with FRED because
+	 * FRED transitions ensure that an operating system can _always_
+	 * operate with its own GS base address:
+	 * - For events that occur in ring 3, FRED event delivery swaps
+	 *   the GS base address with the IA32_KERNEL_GS_BASE MSR.
+	 * - ERETU (the FRED transition that returns to ring 3) also swaps
+	 *   the GS base address with the IA32_KERNEL_GS_BASE MSR.
+	 *
+	 * And the operating system can still setup the GS segment for a
+	 * user thread without the need of loading a user thread GS with:
+	 * - Using LKGS, available with FRED, to modify other attributes
+	 *   of the GS segment without compromising its ability always to
+	 *   operate with its own GS base address.
+	 * - Accessing the GS segment base address for a user thread as
+	 *   before using RDMSR or WRMSR on the IA32_KERNEL_GS_BASE MSR.
+	 *
+	 * Note, LKGS loads the GS base address into the IA32_KERNEL_GS_BASE
+	 * MSR instead of the GS segment’s descriptor cache. As such, the
+	 * operating system never changes its runtime GS base address.
+	 */
+	if (!cpu_feature_enabled(X86_FEATURE_FRED) &&
+	    !cpu_feature_enabled(X86_FEATURE_XENPV)) {
 		native_swapgs();
 		gsbase = rdgsbase();
 		native_swapgs();
@@ -191,7 +213,8 @@ static noinstr void __wrgsbase_inactive(unsigned long gsbase)
 {
 	lockdep_assert_irqs_disabled();
 
-	if (!cpu_feature_enabled(X86_FEATURE_XENPV)) {
+	if (!cpu_feature_enabled(X86_FEATURE_FRED) &&
+	    !cpu_feature_enabled(X86_FEATURE_XENPV)) {
 		native_swapgs();
 		wrgsbase(gsbase);
 		native_swapgs();

From d0fb796dc3475cf71d788ec960d8ed5de4d7a429 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin (Intel)" <hpa@zytor.com>
Date: Tue, 5 Dec 2023 02:50:08 -0800
Subject: [PATCH 354/707] x86/fred: No ESPFIX needed when FRED is enabled

Because FRED always restores the full value of %rsp, ESPFIX is
no longer needed when it's enabled.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Shan Kang <shan.kang@intel.com>
Link: https://lore.kernel.org/r/20231205105030.8698-20-xin3.li@intel.com
---
 arch/x86/kernel/espfix_64.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 16f9814c9be02c..6726e0473d0b40 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -106,6 +106,10 @@ void __init init_espfix_bsp(void)
 	pgd_t *pgd;
 	p4d_t *p4d;
 
+	/* FRED systems always restore the full value of %rsp */
+	if (cpu_feature_enabled(X86_FEATURE_FRED))
+		return;
+
 	/* Install the espfix pud into the kernel page directory */
 	pgd = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)];
 	p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR);
@@ -129,6 +133,10 @@ void init_espfix_ap(int cpu)
 	void *stack_page;
 	pteval_t ptemask;
 
+	/* FRED systems always restore the full value of %rsp */
+	if (cpu_feature_enabled(X86_FEATURE_FRED))
+		return;
+
 	/* We only have to do this once... */
 	if (likely(per_cpu(espfix_stack, cpu)))
 		return;		/* Already initialized */

From f102fe126d2811eded63d700fbe27527d936af74 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin (Intel)" <hpa@zytor.com>
Date: Tue, 5 Dec 2023 02:50:09 -0800
Subject: [PATCH 355/707] x86/fred: Allow single-step trap and NMI when
 starting a new task

Entering a new task is logically speaking a return from a system call
(exec, fork, clone, etc.). As such, if ptrace enables single stepping
a single step exception should be allowed to trigger immediately upon
entering user space. This is not optional.

NMI should *never* be disabled in user space. As such, this is an
optional, opportunistic way to catch errors.

Allow single-step trap and NMI when starting a new task, thus once
the new task enters user space, single-step trap and NMI are both
enabled immediately.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Shan Kang <shan.kang@intel.com>
Link: https://lore.kernel.org/r/20231205105030.8698-21-xin3.li@intel.com
---
 arch/x86/kernel/process_64.c | 38 ++++++++++++++++++++++++++++++------
 1 file changed, 32 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 4f87f5987ae8c1..c075591b7b46a6 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -56,6 +56,7 @@
 #include <asm/resctrl.h>
 #include <asm/unistd.h>
 #include <asm/fsgsbase.h>
+#include <asm/fred.h>
 #ifdef CONFIG_IA32_EMULATION
 /* Not included via unistd.h */
 #include <asm/unistd_32_ia32.h>
@@ -528,7 +529,7 @@ void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
 static void
 start_thread_common(struct pt_regs *regs, unsigned long new_ip,
 		    unsigned long new_sp,
-		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
+		    u16 _cs, u16 _ss, u16 _ds)
 {
 	WARN_ON_ONCE(regs != current_pt_regs());
 
@@ -545,11 +546,36 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
 	loadsegment(ds, _ds);
 	load_gs_index(0);
 
-	regs->ip		= new_ip;
-	regs->sp		= new_sp;
-	regs->cs		= _cs;
-	regs->ss		= _ss;
-	regs->flags		= X86_EFLAGS_IF;
+	regs->ip	= new_ip;
+	regs->sp	= new_sp;
+	regs->csx	= _cs;
+	regs->ssx	= _ss;
+	/*
+	 * Allow single-step trap and NMI when starting a new task, thus
+	 * once the new task enters user space, single-step trap and NMI
+	 * are both enabled immediately.
+	 *
+	 * Entering a new task is logically speaking a return from a
+	 * system call (exec, fork, clone, etc.). As such, if ptrace
+	 * enables single stepping a single step exception should be
+	 * allowed to trigger immediately upon entering user space.
+	 * This is not optional.
+	 *
+	 * NMI should *never* be disabled in user space. As such, this
+	 * is an optional, opportunistic way to catch errors.
+	 *
+	 * Paranoia: High-order 48 bits above the lowest 16 bit SS are
+	 * discarded by the legacy IRET instruction on all Intel, AMD,
+	 * and Cyrix/Centaur/VIA CPUs, thus can be set unconditionally,
+	 * even when FRED is not enabled. But we choose the safer side
+	 * to use these bits only when FRED is enabled.
+	 */
+	if (cpu_feature_enabled(X86_FEATURE_FRED)) {
+		regs->fred_ss.swevent	= true;
+		regs->fred_ss.nmi	= true;
+	}
+
+	regs->flags	= X86_EFLAGS_IF | X86_EFLAGS_FIXED;
 }
 
 void

From 9f6870bafc183644d20cba702168e37b48e291a7 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin (Intel)" <hpa@zytor.com>
Date: Tue, 5 Dec 2023 02:50:10 -0800
Subject: [PATCH 356/707] x86/fred: Make exc_page_fault() work for FRED

On a FRED system, the faulting address (CR2) is passed on the stack,
to avoid the problem of transient state.  Thus the page fault address
is read from the FRED stack frame instead of CR2 when FRED is enabled.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Shan Kang <shan.kang@intel.com>
Link: https://lore.kernel.org/r/20231205105030.8698-22-xin3.li@intel.com
---
 arch/x86/mm/fault.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 679b09cfe241c7..fa2d69951f25c1 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -34,6 +34,7 @@
 #include <asm/kvm_para.h>		/* kvm_handle_async_pf		*/
 #include <asm/vdso.h>			/* fixup_vdso_exception()	*/
 #include <asm/irq_stack.h>
+#include <asm/fred.h>
 
 #define CREATE_TRACE_POINTS
 #include <asm/trace/exceptions.h>
@@ -1518,8 +1519,10 @@ handle_page_fault(struct pt_regs *regs, unsigned long error_code,
 
 DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
 {
-	unsigned long address = read_cr2();
 	irqentry_state_t state;
+	unsigned long address;
+
+	address = cpu_feature_enabled(X86_FEATURE_FRED) ? fred_event_data(regs) : read_cr2();
 
 	prefetchw(&current->mm->mmap_lock);
 

From 2ad2917c6f50c707fc9872f6885807e4133bd882 Mon Sep 17 00:00:00 2001
From: Xin Li <xin3.li@intel.com>
Date: Tue, 5 Dec 2023 02:50:11 -0800
Subject: [PATCH 357/707] x86/idtentry: Incorporate definitions/declarations of
 the FRED entries

FRED and IDT can share most of the definitions and declarations so
that in the majority of cases the actual handler implementation is the
same.

The differences are the exceptions where FRED stores exception related
information on the stack and the sysvec implementations as FRED can
handle irqentry/exit() in the dispatcher instead of having it in each
handler.

Also add stub defines for vectors which are not used due to Kconfig
decisions to spare the ifdeffery in the actual FRED dispatch code.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Shan Kang <shan.kang@intel.com>
Link: https://lore.kernel.org/r/20231205105030.8698-23-xin3.li@intel.com
---
 arch/x86/include/asm/idtentry.h | 71 +++++++++++++++++++++++++++++----
 1 file changed, 63 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index e9f71b3217c2d2..570f286ca7ddae 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -13,15 +13,18 @@
 
 #include <asm/irq_stack.h>
 
+typedef void (*idtentry_t)(struct pt_regs *regs);
+
 /**
  * DECLARE_IDTENTRY - Declare functions for simple IDT entry points
  *		      No error code pushed by hardware
  * @vector:	Vector number (ignored for C)
  * @func:	Function name of the entry point
  *
- * Declares three functions:
+ * Declares four functions:
  * - The ASM entry point: asm_##func
  * - The XEN PV trap entry point: xen_##func (maybe unused)
+ * - The C handler called from the FRED event dispatcher (maybe unused)
  * - The C handler called from the ASM entry point
  *
  * Note: This is the C variant of DECLARE_IDTENTRY(). As the name says it
@@ -31,6 +34,7 @@
 #define DECLARE_IDTENTRY(vector, func)					\
 	asmlinkage void asm_##func(void);				\
 	asmlinkage void xen_asm_##func(void);				\
+	void fred_##func(struct pt_regs *regs);				\
 	__visible void func(struct pt_regs *regs)
 
 /**
@@ -137,6 +141,17 @@ static __always_inline void __##func(struct pt_regs *regs,		\
 #define DEFINE_IDTENTRY_RAW(func)					\
 __visible noinstr void func(struct pt_regs *regs)
 
+/**
+ * DEFINE_FREDENTRY_RAW - Emit code for raw FRED entry points
+ * @func:	Function name of the entry point
+ *
+ * @func is called from the FRED event dispatcher with interrupts disabled.
+ *
+ * See @DEFINE_IDTENTRY_RAW for further details.
+ */
+#define DEFINE_FREDENTRY_RAW(func)					\
+noinstr void fred_##func(struct pt_regs *regs)
+
 /**
  * DECLARE_IDTENTRY_RAW_ERRORCODE - Declare functions for raw IDT entry points
  *				    Error code pushed by hardware
@@ -233,17 +248,27 @@ static noinline void __##func(struct pt_regs *regs, u32 vector)
 #define DEFINE_IDTENTRY_SYSVEC(func)					\
 static void __##func(struct pt_regs *regs);				\
 									\
+static __always_inline void instr_##func(struct pt_regs *regs)		\
+{									\
+	kvm_set_cpu_l1tf_flush_l1d();					\
+	run_sysvec_on_irqstack_cond(__##func, regs);			\
+}									\
+									\
 __visible noinstr void func(struct pt_regs *regs)			\
 {									\
 	irqentry_state_t state = irqentry_enter(regs);			\
 									\
 	instrumentation_begin();					\
-	kvm_set_cpu_l1tf_flush_l1d();					\
-	run_sysvec_on_irqstack_cond(__##func, regs);			\
+	instr_##func (regs);						\
 	instrumentation_end();						\
 	irqentry_exit(regs, state);					\
 }									\
 									\
+void fred_##func(struct pt_regs *regs)					\
+{									\
+	instr_##func (regs);						\
+}									\
+									\
 static noinline void __##func(struct pt_regs *regs)
 
 /**
@@ -260,19 +285,29 @@ static noinline void __##func(struct pt_regs *regs)
 #define DEFINE_IDTENTRY_SYSVEC_SIMPLE(func)				\
 static __always_inline void __##func(struct pt_regs *regs);		\
 									\
-__visible noinstr void func(struct pt_regs *regs)			\
+static __always_inline void instr_##func(struct pt_regs *regs)		\
 {									\
-	irqentry_state_t state = irqentry_enter(regs);			\
-									\
-	instrumentation_begin();					\
 	__irq_enter_raw();						\
 	kvm_set_cpu_l1tf_flush_l1d();					\
 	__##func (regs);						\
 	__irq_exit_raw();						\
+}									\
+									\
+__visible noinstr void func(struct pt_regs *regs)			\
+{									\
+	irqentry_state_t state = irqentry_enter(regs);			\
+									\
+	instrumentation_begin();					\
+	instr_##func (regs);						\
 	instrumentation_end();						\
 	irqentry_exit(regs, state);					\
 }									\
 									\
+void fred_##func(struct pt_regs *regs)					\
+{									\
+	instr_##func (regs);						\
+}									\
+									\
 static __always_inline void __##func(struct pt_regs *regs)
 
 /**
@@ -410,15 +445,18 @@ __visible noinstr void func(struct pt_regs *regs,			\
 /* C-Code mapping */
 #define DECLARE_IDTENTRY_NMI		DECLARE_IDTENTRY_RAW
 #define DEFINE_IDTENTRY_NMI		DEFINE_IDTENTRY_RAW
+#define DEFINE_FREDENTRY_NMI		DEFINE_FREDENTRY_RAW
 
 #ifdef CONFIG_X86_64
 #define DECLARE_IDTENTRY_MCE		DECLARE_IDTENTRY_IST
 #define DEFINE_IDTENTRY_MCE		DEFINE_IDTENTRY_IST
 #define DEFINE_IDTENTRY_MCE_USER	DEFINE_IDTENTRY_NOIST
+#define DEFINE_FREDENTRY_MCE		DEFINE_FREDENTRY_RAW
 
 #define DECLARE_IDTENTRY_DEBUG		DECLARE_IDTENTRY_IST
 #define DEFINE_IDTENTRY_DEBUG		DEFINE_IDTENTRY_IST
 #define DEFINE_IDTENTRY_DEBUG_USER	DEFINE_IDTENTRY_NOIST
+#define DEFINE_FREDENTRY_DEBUG		DEFINE_FREDENTRY_RAW
 #endif
 
 #else /* !__ASSEMBLY__ */
@@ -655,23 +693,36 @@ DECLARE_IDTENTRY(RESCHEDULE_VECTOR,			sysvec_reschedule_ipi);
 DECLARE_IDTENTRY_SYSVEC(REBOOT_VECTOR,			sysvec_reboot);
 DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_SINGLE_VECTOR,	sysvec_call_function_single);
 DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_VECTOR,		sysvec_call_function);
+#else
+# define fred_sysvec_reschedule_ipi			NULL
+# define fred_sysvec_reboot				NULL
+# define fred_sysvec_call_function_single		NULL
+# define fred_sysvec_call_function			NULL
 #endif
 
 #ifdef CONFIG_X86_LOCAL_APIC
 # ifdef CONFIG_X86_MCE_THRESHOLD
 DECLARE_IDTENTRY_SYSVEC(THRESHOLD_APIC_VECTOR,		sysvec_threshold);
+# else
+# define fred_sysvec_threshold				NULL
 # endif
 
 # ifdef CONFIG_X86_MCE_AMD
 DECLARE_IDTENTRY_SYSVEC(DEFERRED_ERROR_VECTOR,		sysvec_deferred_error);
+# else
+# define fred_sysvec_deferred_error			NULL
 # endif
 
 # ifdef CONFIG_X86_THERMAL_VECTOR
 DECLARE_IDTENTRY_SYSVEC(THERMAL_APIC_VECTOR,		sysvec_thermal);
+# else
+# define fred_sysvec_thermal				NULL
 # endif
 
 # ifdef CONFIG_IRQ_WORK
 DECLARE_IDTENTRY_SYSVEC(IRQ_WORK_VECTOR,		sysvec_irq_work);
+# else
+# define fred_sysvec_irq_work				NULL
 # endif
 #endif
 
@@ -679,12 +730,16 @@ DECLARE_IDTENTRY_SYSVEC(IRQ_WORK_VECTOR,		sysvec_irq_work);
 DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_VECTOR,		sysvec_kvm_posted_intr_ipi);
 DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_WAKEUP_VECTOR,	sysvec_kvm_posted_intr_wakeup_ipi);
 DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR,	sysvec_kvm_posted_intr_nested_ipi);
+#else
+# define fred_sysvec_kvm_posted_intr_ipi		NULL
+# define fred_sysvec_kvm_posted_intr_wakeup_ipi		NULL
+# define fred_sysvec_kvm_posted_intr_nested_ipi		NULL
 #endif
 
 #if IS_ENABLED(CONFIG_HYPERV)
 DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR,	sysvec_hyperv_callback);
 DECLARE_IDTENTRY_SYSVEC(HYPERV_REENLIGHTENMENT_VECTOR,	sysvec_hyperv_reenlightenment);
-DECLARE_IDTENTRY_SYSVEC(HYPERV_STIMER0_VECTOR,	sysvec_hyperv_stimer0);
+DECLARE_IDTENTRY_SYSVEC(HYPERV_STIMER0_VECTOR,		sysvec_hyperv_stimer0);
 #endif
 
 #if IS_ENABLED(CONFIG_ACRN_GUEST)

From 4af12f6a393ca2be76de6c5484f79acc1167e1c8 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin (Intel)" <hpa@zytor.com>
Date: Tue, 5 Dec 2023 02:50:12 -0800
Subject: [PATCH 358/707] x86/fred: Add a debug fault entry stub for FRED

When occurred on different ring level, i.e., from user or kernel context,
#DB needs to be handled on different stack: User #DB on current task
stack, while kernel #DB on a dedicated stack. This is exactly how FRED
event delivery invokes an exception handler: ring 3 event on level 0
stack, i.e., current task stack; ring 0 event on the #DB dedicated stack
specified in the IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED debug
exception entry stub doesn't do stack switch.

On a FRED system, the debug trap status information (DR6) is passed on
the stack, to avoid the problem of transient state. Furthermore, FRED
transitions avoid a lot of ugly corner cases the handling of which can,
and should be, skipped.

The FRED debug trap status information saved on the stack differs from
DR6 in both stickiness and polarity; it is exactly in the format which
debug_read_clear_dr6() returns for the IDT entry points.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Shan Kang <shan.kang@intel.com>
Link: https://lore.kernel.org/r/20231205105030.8698-24-xin3.li@intel.com
---
 arch/x86/kernel/traps.c | 43 ++++++++++++++++++++++++++++++++++++-----
 1 file changed, 38 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 3c37489018256f..1b19a170f5e35e 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -51,6 +51,7 @@
 #include <asm/ftrace.h>
 #include <asm/traps.h>
 #include <asm/desc.h>
+#include <asm/fred.h>
 #include <asm/fpu/api.h>
 #include <asm/cpu.h>
 #include <asm/cpu_entry_area.h>
@@ -935,8 +936,7 @@ static bool notify_debug(struct pt_regs *regs, unsigned long *dr6)
 	return false;
 }
 
-static __always_inline void exc_debug_kernel(struct pt_regs *regs,
-					     unsigned long dr6)
+static noinstr void exc_debug_kernel(struct pt_regs *regs, unsigned long dr6)
 {
 	/*
 	 * Disable breakpoints during exception handling; recursive exceptions
@@ -948,6 +948,11 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs,
 	 *
 	 * Entry text is excluded for HW_BP_X and cpu_entry_area, which
 	 * includes the entry stack is excluded for everything.
+	 *
+	 * For FRED, nested #DB should just work fine. But when a watchpoint or
+	 * breakpoint is set in the code path which is executed by #DB handler,
+	 * it results in an endless recursion and stack overflow. Thus we stay
+	 * with the IDT approach, i.e., save DR7 and disable #DB.
 	 */
 	unsigned long dr7 = local_db_save();
 	irqentry_state_t irq_state = irqentry_nmi_enter(regs);
@@ -977,7 +982,8 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs,
 	 * Catch SYSENTER with TF set and clear DR_STEP. If this hit a
 	 * watchpoint at the same time then that will still be handled.
 	 */
-	if ((dr6 & DR_STEP) && is_sysenter_singlestep(regs))
+	if (!cpu_feature_enabled(X86_FEATURE_FRED) &&
+	    (dr6 & DR_STEP) && is_sysenter_singlestep(regs))
 		dr6 &= ~DR_STEP;
 
 	/*
@@ -1009,8 +1015,7 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs,
 	local_db_restore(dr7);
 }
 
-static __always_inline void exc_debug_user(struct pt_regs *regs,
-					   unsigned long dr6)
+static noinstr void exc_debug_user(struct pt_regs *regs, unsigned long dr6)
 {
 	bool icebp;
 
@@ -1094,6 +1099,34 @@ DEFINE_IDTENTRY_DEBUG_USER(exc_debug)
 {
 	exc_debug_user(regs, debug_read_clear_dr6());
 }
+
+#ifdef CONFIG_X86_FRED
+/*
+ * When occurred on different ring level, i.e., from user or kernel
+ * context, #DB needs to be handled on different stack: User #DB on
+ * current task stack, while kernel #DB on a dedicated stack.
+ *
+ * This is exactly how FRED event delivery invokes an exception
+ * handler: ring 3 event on level 0 stack, i.e., current task stack;
+ * ring 0 event on the #DB dedicated stack specified in the
+ * IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED debug exception
+ * entry stub doesn't do stack switch.
+ */
+DEFINE_FREDENTRY_DEBUG(exc_debug)
+{
+	/*
+	 * FRED #DB stores DR6 on the stack in the format which
+	 * debug_read_clear_dr6() returns for the IDT entry points.
+	 */
+	unsigned long dr6 = fred_event_data(regs);
+
+	if (user_mode(regs))
+		exc_debug_user(regs, dr6);
+	else
+		exc_debug_kernel(regs, dr6);
+}
+#endif /* CONFIG_X86_FRED */
+
 #else
 /* 32 bit does not have separate entry points. */
 DEFINE_IDTENTRY_RAW(exc_debug)

From 3e91abaa567300fd48a0fac4c9aaedd30fa2f3f9 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin (Intel)" <hpa@zytor.com>
Date: Fri, 15 Dec 2023 22:31:39 -0800
Subject: [PATCH 359/707] x86/fred: Add a NMI entry stub for FRED

On a FRED system, NMIs nest both with themselves and faults, transient
information is saved into the stack frame, and NMI unblocking only
happens when the stack frame indicates that so should happen.

Thus, the NMI entry stub for FRED is really quite small...

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Shan Kang <shan.kang@intel.com>
Link: https://lore.kernel.org/r/20231216063139.25567-1-xin3.li@intel.com
---
 arch/x86/kernel/nmi.c | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 17e955ab69feda..3130a66b0f48d0 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -35,6 +35,7 @@
 #include <asm/nospec-branch.h>
 #include <asm/microcode.h>
 #include <asm/sev.h>
+#include <asm/fred.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/nmi.h>
@@ -651,6 +652,47 @@ void nmi_backtrace_stall_check(const struct cpumask *btp)
 
 #endif
 
+#ifdef CONFIG_X86_FRED
+/*
+ * With FRED, CR2/DR6 is pushed to #PF/#DB stack frame during FRED
+ * event delivery, i.e., there is no problem of transient states.
+ * And NMI unblocking only happens when the stack frame indicates
+ * that so should happen.
+ *
+ * Thus, the NMI entry stub for FRED is really straightforward and
+ * as simple as most exception handlers. As such, #DB is allowed
+ * during NMI handling.
+ */
+DEFINE_FREDENTRY_NMI(exc_nmi)
+{
+	irqentry_state_t irq_state;
+
+	if (arch_cpu_is_offline(smp_processor_id())) {
+		if (microcode_nmi_handler_enabled())
+			microcode_offline_nmi_handler();
+		return;
+	}
+
+	/*
+	 * Save CR2 for eventual restore to cover the case where the NMI
+	 * hits the VMENTER/VMEXIT region where guest CR2 is life. This
+	 * prevents guest state corruption in case that the NMI handler
+	 * takes a page fault.
+	 */
+	this_cpu_write(nmi_cr2, read_cr2());
+
+	irq_state = irqentry_nmi_enter(regs);
+
+	inc_irq_stat(__nmi_count);
+	default_do_nmi(regs);
+
+	irqentry_nmi_exit(regs, irq_state);
+
+	if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
+		write_cr2(this_cpu_read(nmi_cr2));
+}
+#endif
+
 void stop_nmi(void)
 {
 	ignore_nmis++;

From 5dd56c94ca2f8834e7689cac0045d312ef3ac9c6 Mon Sep 17 00:00:00 2001
From: Xin Li <xin3.li@intel.com>
Date: Tue, 5 Dec 2023 02:50:14 -0800
Subject: [PATCH 360/707] x86/fred: Add a machine check entry stub for FRED

Like #DB, when occurred on different ring level, i.e., from user or kernel
context, #MCE needs to be handled on different stack: User #MCE on current
task stack, while kernel #MCE on a dedicated stack.

This is exactly how FRED event delivery invokes an exception handler: ring
3 event on level 0 stack, i.e., current task stack; ring 0 event on the
#MCE dedicated stack specified in the IA32_FRED_STKLVLS MSR. So unlike IDT,
the FRED machine check entry stub doesn't do stack switch.

Signed-off-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Shan Kang <shan.kang@intel.com>
Link: https://lore.kernel.org/r/20231205105030.8698-26-xin3.li@intel.com
---
 arch/x86/kernel/cpu/mce/core.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index bc39252bc54f2e..04acdc3534c81a 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -46,6 +46,7 @@
 #include <linux/hardirq.h>
 #include <linux/kexec.h>
 
+#include <asm/fred.h>
 #include <asm/intel-family.h>
 #include <asm/processor.h>
 #include <asm/traps.h>
@@ -2166,6 +2167,31 @@ DEFINE_IDTENTRY_MCE_USER(exc_machine_check)
 	exc_machine_check_user(regs);
 	local_db_restore(dr7);
 }
+
+#ifdef CONFIG_X86_FRED
+/*
+ * When occurred on different ring level, i.e., from user or kernel
+ * context, #MCE needs to be handled on different stack: User #MCE
+ * on current task stack, while kernel #MCE on a dedicated stack.
+ *
+ * This is exactly how FRED event delivery invokes an exception
+ * handler: ring 3 event on level 0 stack, i.e., current task stack;
+ * ring 0 event on the #MCE dedicated stack specified in the
+ * IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED machine check entry
+ * stub doesn't do stack switch.
+ */
+DEFINE_FREDENTRY_MCE(exc_machine_check)
+{
+	unsigned long dr7;
+
+	dr7 = local_db_save();
+	if (user_mode(regs))
+		exc_machine_check_user(regs);
+	else
+		exc_machine_check_kernel(regs);
+	local_db_restore(dr7);
+}
+#endif
 #else
 /* 32bit unified entry point */
 DEFINE_IDTENTRY_RAW(exc_machine_check)

From 6786137bf8fd717bed7ff9ce4eee34ce03a26631 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin (Intel)" <hpa@zytor.com>
Date: Sat, 9 Dec 2023 13:42:14 -0800
Subject: [PATCH 361/707] x86/fred: FRED entry/exit and dispatch code

The code to actually handle kernel and event entry/exit using
FRED. It is split up into two files thus:

 - entry_64_fred.S contains the actual entrypoints and exit code, and
   saves and restores registers.

 - entry_fred.c contains the two-level event dispatch code for FRED.
   The first-level dispatch is on the event type, and the second-level
   is on the event vector.

  [ bp: Fold in an allmodconfig clang build fix:
    https://lore.kernel.org/r/20240129064521.5168-1-xin3.li@intel.com
    and a CONFIG_IA32_EMULATION=n build fix:
    https://lore.kernel.org/r/20240127093728.1323-3-xin3.li@intel.com]

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Originally-by: Megha Dey <megha.dey@intel.com>
Co-developed-by: Xin Li <xin3.li@intel.com>
Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Tested-by: Shan Kang <shan.kang@intel.com>
Link: https://lore.kernel.org/r/20231209214214.2932-1-xin3.li@intel.com
---
 arch/x86/entry/Makefile               |   5 +-
 arch/x86/entry/entry_64_fred.S        |  50 ++++++
 arch/x86/entry/entry_fred.c           | 245 ++++++++++++++++++++++++++
 arch/x86/include/asm/asm-prototypes.h |   1 +
 arch/x86/include/asm/fred.h           |   6 +
 arch/x86/include/asm/ia32.h           |   4 +-
 6 files changed, 308 insertions(+), 3 deletions(-)
 create mode 100644 arch/x86/entry/entry_64_fred.S
 create mode 100644 arch/x86/entry/entry_fred.c

diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index ca2fe186994b0a..c93e7f5c2a0652 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -18,6 +18,9 @@ obj-y				+= vdso/
 obj-y				+= vsyscall/
 
 obj-$(CONFIG_PREEMPTION)	+= thunk_$(BITS).o
+CFLAGS_entry_fred.o		+= -fno-stack-protector
+CFLAGS_REMOVE_entry_fred.o	+= -pg $(CC_FLAGS_FTRACE)
+obj-$(CONFIG_X86_FRED)		+= entry_64_fred.o entry_fred.o
+
 obj-$(CONFIG_IA32_EMULATION)	+= entry_64_compat.o syscall_32.o
 obj-$(CONFIG_X86_X32_ABI)	+= syscall_x32.o
-
diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S
new file mode 100644
index 00000000000000..c1ddaf6b068f4d
--- /dev/null
+++ b/arch/x86/entry/entry_64_fred.S
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * The actual FRED entry points.
+ */
+
+#include <asm/fred.h>
+
+#include "calling.h"
+
+	.code64
+	.section .noinstr.text, "ax"
+
+.macro FRED_ENTER
+	UNWIND_HINT_END_OF_STACK
+	ENDBR
+	PUSH_AND_CLEAR_REGS
+	movq	%rsp, %rdi	/* %rdi -> pt_regs */
+.endm
+
+.macro FRED_EXIT
+	UNWIND_HINT_REGS
+	POP_REGS
+.endm
+
+/*
+ * The new RIP value that FRED event delivery establishes is
+ * IA32_FRED_CONFIG & ~FFFH for events that occur in ring 3.
+ * Thus the FRED ring 3 entry point must be 4K page aligned.
+ */
+	.align 4096
+
+SYM_CODE_START_NOALIGN(asm_fred_entrypoint_user)
+	FRED_ENTER
+	call	fred_entry_from_user
+	FRED_EXIT
+	ERETU
+SYM_CODE_END(asm_fred_entrypoint_user)
+
+/*
+ * The new RIP value that FRED event delivery establishes is
+ * (IA32_FRED_CONFIG & ~FFFH) + 256 for events that occur in
+ * ring 0, i.e., asm_fred_entrypoint_user + 256.
+ */
+	.org asm_fred_entrypoint_user + 256, 0xcc
+SYM_CODE_START_NOALIGN(asm_fred_entrypoint_kernel)
+	FRED_ENTER
+	call	fred_entry_from_kernel
+	FRED_EXIT
+	ERETS
+SYM_CODE_END(asm_fred_entrypoint_kernel)
diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c
new file mode 100644
index 00000000000000..125b62311b31f9
--- /dev/null
+++ b/arch/x86/entry/entry_fred.c
@@ -0,0 +1,245 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * The FRED specific kernel/user entry functions which are invoked from
+ * assembly code and dispatch to the associated handlers.
+ */
+#include <linux/kernel.h>
+#include <linux/kdebug.h>
+#include <linux/nospec.h>
+
+#include <asm/desc.h>
+#include <asm/fred.h>
+#include <asm/idtentry.h>
+#include <asm/syscall.h>
+#include <asm/trapnr.h>
+#include <asm/traps.h>
+
+/* FRED EVENT_TYPE_OTHER vector numbers */
+#define FRED_SYSCALL			1
+#define FRED_SYSENTER			2
+
+static noinstr void fred_bad_type(struct pt_regs *regs, unsigned long error_code)
+{
+	irqentry_state_t irq_state = irqentry_nmi_enter(regs);
+
+	instrumentation_begin();
+
+	/* Panic on events from a high stack level */
+	if (regs->fred_cs.sl > 0) {
+		pr_emerg("PANIC: invalid or fatal FRED event; event type %u "
+			 "vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n",
+			 regs->fred_ss.type, regs->fred_ss.vector, regs->orig_ax,
+			 fred_event_data(regs), regs->cs, regs->ip);
+		die("invalid or fatal FRED event", regs, regs->orig_ax);
+		panic("invalid or fatal FRED event");
+	} else {
+		unsigned long flags = oops_begin();
+		int sig = SIGKILL;
+
+		pr_alert("BUG: invalid or fatal FRED event; event type %u "
+			 "vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n",
+			 regs->fred_ss.type, regs->fred_ss.vector, regs->orig_ax,
+			 fred_event_data(regs), regs->cs, regs->ip);
+
+		if (__die("Invalid or fatal FRED event", regs, regs->orig_ax))
+			sig = 0;
+
+		oops_end(flags, regs, sig);
+	}
+
+	instrumentation_end();
+	irqentry_nmi_exit(regs, irq_state);
+}
+
+static noinstr void fred_intx(struct pt_regs *regs)
+{
+	switch (regs->fred_ss.vector) {
+	/* Opcode 0xcd, 0x3, NOT INT3 (opcode 0xcc) */
+	case X86_TRAP_BP:
+		return exc_int3(regs);
+
+	/* Opcode 0xcd, 0x4, NOT INTO (opcode 0xce) */
+	case X86_TRAP_OF:
+		return exc_overflow(regs);
+
+#ifdef CONFIG_IA32_EMULATION
+	/* INT80 */
+	case IA32_SYSCALL_VECTOR:
+		if (ia32_enabled())
+			return int80_emulation(regs);
+		fallthrough;
+#endif
+
+	default:
+		return exc_general_protection(regs, 0);
+	}
+}
+
+static __always_inline void fred_other(struct pt_regs *regs)
+{
+	/* The compiler can fold these conditions into a single test */
+	if (likely(regs->fred_ss.vector == FRED_SYSCALL && regs->fred_ss.lm)) {
+		regs->orig_ax = regs->ax;
+		regs->ax = -ENOSYS;
+		do_syscall_64(regs, regs->orig_ax);
+		return;
+	} else if (ia32_enabled() &&
+		   likely(regs->fred_ss.vector == FRED_SYSENTER && !regs->fred_ss.lm)) {
+		regs->orig_ax = regs->ax;
+		regs->ax = -ENOSYS;
+		do_fast_syscall_32(regs);
+		return;
+	} else {
+		exc_invalid_op(regs);
+		return;
+	}
+}
+
+#define SYSVEC(_vector, _function) [_vector - FIRST_SYSTEM_VECTOR] = fred_sysvec_##_function
+
+static idtentry_t sysvec_table[NR_SYSTEM_VECTORS] __ro_after_init = {
+	SYSVEC(ERROR_APIC_VECTOR,		error_interrupt),
+	SYSVEC(SPURIOUS_APIC_VECTOR,		spurious_apic_interrupt),
+	SYSVEC(LOCAL_TIMER_VECTOR,		apic_timer_interrupt),
+	SYSVEC(X86_PLATFORM_IPI_VECTOR,		x86_platform_ipi),
+
+	SYSVEC(RESCHEDULE_VECTOR,		reschedule_ipi),
+	SYSVEC(CALL_FUNCTION_SINGLE_VECTOR,	call_function_single),
+	SYSVEC(CALL_FUNCTION_VECTOR,		call_function),
+	SYSVEC(REBOOT_VECTOR,			reboot),
+
+	SYSVEC(THRESHOLD_APIC_VECTOR,		threshold),
+	SYSVEC(DEFERRED_ERROR_VECTOR,		deferred_error),
+	SYSVEC(THERMAL_APIC_VECTOR,		thermal),
+
+	SYSVEC(IRQ_WORK_VECTOR,			irq_work),
+
+	SYSVEC(POSTED_INTR_VECTOR,		kvm_posted_intr_ipi),
+	SYSVEC(POSTED_INTR_WAKEUP_VECTOR,	kvm_posted_intr_wakeup_ipi),
+	SYSVEC(POSTED_INTR_NESTED_VECTOR,	kvm_posted_intr_nested_ipi),
+};
+
+static noinstr void fred_extint(struct pt_regs *regs)
+{
+	unsigned int vector = regs->fred_ss.vector;
+	unsigned int index = array_index_nospec(vector - FIRST_SYSTEM_VECTOR,
+						NR_SYSTEM_VECTORS);
+
+	if (WARN_ON_ONCE(vector < FIRST_EXTERNAL_VECTOR))
+		return;
+
+	if (likely(vector >= FIRST_SYSTEM_VECTOR)) {
+		irqentry_state_t state = irqentry_enter(regs);
+
+		instrumentation_begin();
+		sysvec_table[index](regs);
+		instrumentation_end();
+		irqentry_exit(regs, state);
+	} else {
+		common_interrupt(regs, vector);
+	}
+}
+
+static noinstr void fred_hwexc(struct pt_regs *regs, unsigned long error_code)
+{
+	/* Optimize for #PF. That's the only exception which matters performance wise */
+	if (likely(regs->fred_ss.vector == X86_TRAP_PF))
+		return exc_page_fault(regs, error_code);
+
+	switch (regs->fred_ss.vector) {
+	case X86_TRAP_DE: return exc_divide_error(regs);
+	case X86_TRAP_DB: return fred_exc_debug(regs);
+	case X86_TRAP_BR: return exc_bounds(regs);
+	case X86_TRAP_UD: return exc_invalid_op(regs);
+	case X86_TRAP_NM: return exc_device_not_available(regs);
+	case X86_TRAP_DF: return exc_double_fault(regs, error_code);
+	case X86_TRAP_TS: return exc_invalid_tss(regs, error_code);
+	case X86_TRAP_NP: return exc_segment_not_present(regs, error_code);
+	case X86_TRAP_SS: return exc_stack_segment(regs, error_code);
+	case X86_TRAP_GP: return exc_general_protection(regs, error_code);
+	case X86_TRAP_MF: return exc_coprocessor_error(regs);
+	case X86_TRAP_AC: return exc_alignment_check(regs, error_code);
+	case X86_TRAP_XF: return exc_simd_coprocessor_error(regs);
+
+#ifdef CONFIG_X86_MCE
+	case X86_TRAP_MC: return fred_exc_machine_check(regs);
+#endif
+#ifdef CONFIG_INTEL_TDX_GUEST
+	case X86_TRAP_VE: return exc_virtualization_exception(regs);
+#endif
+#ifdef CONFIG_X86_CET
+	case X86_TRAP_CP: return exc_control_protection(regs, error_code);
+#endif
+	default: return fred_bad_type(regs, error_code);
+	}
+
+}
+
+static noinstr void fred_swexc(struct pt_regs *regs, unsigned long error_code)
+{
+	switch (regs->fred_ss.vector) {
+	case X86_TRAP_BP: return exc_int3(regs);
+	case X86_TRAP_OF: return exc_overflow(regs);
+	default: return fred_bad_type(regs, error_code);
+	}
+}
+
+__visible noinstr void fred_entry_from_user(struct pt_regs *regs)
+{
+	unsigned long error_code = regs->orig_ax;
+
+	/* Invalidate orig_ax so that syscall_get_nr() works correctly */
+	regs->orig_ax = -1;
+
+	switch (regs->fred_ss.type) {
+	case EVENT_TYPE_EXTINT:
+		return fred_extint(regs);
+	case EVENT_TYPE_NMI:
+		if (likely(regs->fred_ss.vector == X86_TRAP_NMI))
+			return fred_exc_nmi(regs);
+		break;
+	case EVENT_TYPE_HWEXC:
+		return fred_hwexc(regs, error_code);
+	case EVENT_TYPE_SWINT:
+		return fred_intx(regs);
+	case EVENT_TYPE_PRIV_SWEXC:
+		if (likely(regs->fred_ss.vector == X86_TRAP_DB))
+			return fred_exc_debug(regs);
+		break;
+	case EVENT_TYPE_SWEXC:
+		return fred_swexc(regs, error_code);
+	case EVENT_TYPE_OTHER:
+		return fred_other(regs);
+	default: break;
+	}
+
+	return fred_bad_type(regs, error_code);
+}
+
+__visible noinstr void fred_entry_from_kernel(struct pt_regs *regs)
+{
+	unsigned long error_code = regs->orig_ax;
+
+	/* Invalidate orig_ax so that syscall_get_nr() works correctly */
+	regs->orig_ax = -1;
+
+	switch (regs->fred_ss.type) {
+	case EVENT_TYPE_EXTINT:
+		return fred_extint(regs);
+	case EVENT_TYPE_NMI:
+		if (likely(regs->fred_ss.vector == X86_TRAP_NMI))
+			return fred_exc_nmi(regs);
+		break;
+	case EVENT_TYPE_HWEXC:
+		return fred_hwexc(regs, error_code);
+	case EVENT_TYPE_PRIV_SWEXC:
+		if (likely(regs->fred_ss.vector == X86_TRAP_DB))
+			return fred_exc_debug(regs);
+		break;
+	case EVENT_TYPE_SWEXC:
+		return fred_swexc(regs, error_code);
+	default: break;
+	}
+
+	return fred_bad_type(regs, error_code);
+}
diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h
index b1a98fa38828e2..076bf8dee70264 100644
--- a/arch/x86/include/asm/asm-prototypes.h
+++ b/arch/x86/include/asm/asm-prototypes.h
@@ -12,6 +12,7 @@
 #include <asm/special_insns.h>
 #include <asm/preempt.h>
 #include <asm/asm.h>
+#include <asm/fred.h>
 #include <asm/gsseg.h>
 
 #ifndef CONFIG_X86_CMPXCHG64
diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h
index f514fdb5a39f73..16a64ffecbf8d5 100644
--- a/arch/x86/include/asm/fred.h
+++ b/arch/x86/include/asm/fred.h
@@ -60,6 +60,12 @@ static __always_inline unsigned long fred_event_data(struct pt_regs *regs)
 	return fred_info(regs)->edata;
 }
 
+void asm_fred_entrypoint_user(void);
+void asm_fred_entrypoint_kernel(void);
+
+__visible void fred_entry_from_user(struct pt_regs *regs);
+__visible void fred_entry_from_kernel(struct pt_regs *regs);
+
 #else /* CONFIG_X86_FRED */
 static __always_inline unsigned long fred_event_data(struct pt_regs *regs) { return 0; }
 #endif /* CONFIG_X86_FRED */
diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h
index c7ef6ea2fa993c..4212c00c9708d4 100644
--- a/arch/x86/include/asm/ia32.h
+++ b/arch/x86/include/asm/ia32.h
@@ -69,7 +69,7 @@ extern void ia32_pick_mmap_layout(struct mm_struct *mm);
 
 extern bool __ia32_enabled;
 
-static inline bool ia32_enabled(void)
+static __always_inline bool ia32_enabled(void)
 {
 	return __ia32_enabled;
 }
@@ -81,7 +81,7 @@ static inline void ia32_disable(void)
 
 #else /* !CONFIG_IA32_EMULATION */
 
-static inline bool ia32_enabled(void)
+static __always_inline bool ia32_enabled(void)
 {
 	return IS_ENABLED(CONFIG_X86_32);
 }

From db7c787d8ba268a8d8beabb0027715246375c6e0 Mon Sep 17 00:00:00 2001
From: Xin Li <xin3.li@intel.com>
Date: Tue, 5 Dec 2023 02:50:16 -0800
Subject: [PATCH 362/707] x86/traps: Add sysvec_install() to install a system
 interrupt handler

Add sysvec_install() to install a system interrupt handler into the IDT
or the FRED system interrupt handler table.

Signed-off-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Shan Kang <shan.kang@intel.com>
Link: https://lore.kernel.org/r/20231205105030.8698-28-xin3.li@intel.com
---
 arch/x86/entry/entry_fred.c      | 14 ++++++++++++++
 arch/x86/include/asm/desc.h      |  2 --
 arch/x86/include/asm/idtentry.h  | 15 +++++++++++++++
 arch/x86/kernel/cpu/acrn.c       |  4 ++--
 arch/x86/kernel/cpu/mshyperv.c   | 15 +++++++--------
 arch/x86/kernel/idt.c            |  4 ++--
 arch/x86/kernel/kvm.c            |  2 +-
 drivers/xen/events/events_base.c |  2 +-
 8 files changed, 42 insertions(+), 16 deletions(-)

diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c
index 125b62311b31f9..3be0269bc0d465 100644
--- a/arch/x86/entry/entry_fred.c
+++ b/arch/x86/entry/entry_fred.c
@@ -119,6 +119,20 @@ static idtentry_t sysvec_table[NR_SYSTEM_VECTORS] __ro_after_init = {
 	SYSVEC(POSTED_INTR_NESTED_VECTOR,	kvm_posted_intr_nested_ipi),
 };
 
+static bool fred_setup_done __initdata;
+
+void __init fred_install_sysvec(unsigned int sysvec, idtentry_t handler)
+{
+	if (WARN_ON_ONCE(sysvec < FIRST_SYSTEM_VECTOR))
+		return;
+
+	if (WARN_ON_ONCE(fred_setup_done))
+		return;
+
+	if (!WARN_ON_ONCE(sysvec_table[sysvec - FIRST_SYSTEM_VECTOR]))
+		 sysvec_table[sysvec - FIRST_SYSTEM_VECTOR] = handler;
+}
+
 static noinstr void fred_extint(struct pt_regs *regs)
 {
 	unsigned int vector = regs->fred_ss.vector;
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index ab97b22ac04a26..ec95fe44fa3a03 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -402,8 +402,6 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit)
 	desc->limit1 = (limit >> 16) & 0xf;
 }
 
-void alloc_intr_gate(unsigned int n, const void *addr);
-
 static inline void init_idt_data(struct idt_data *data, unsigned int n,
 				 const void *addr)
 {
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 570f286ca7ddae..47d4c04d103df4 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -459,6 +459,21 @@ __visible noinstr void func(struct pt_regs *regs,			\
 #define DEFINE_FREDENTRY_DEBUG		DEFINE_FREDENTRY_RAW
 #endif
 
+void idt_install_sysvec(unsigned int n, const void *function);
+
+#ifdef CONFIG_X86_FRED
+void fred_install_sysvec(unsigned int vector, const idtentry_t function);
+#else
+static inline void fred_install_sysvec(unsigned int vector, const idtentry_t function) { }
+#endif
+
+#define sysvec_install(vector, function) {				\
+	if (cpu_feature_enabled(X86_FEATURE_FRED))			\
+		fred_install_sysvec(vector, function);			\
+	else								\
+		idt_install_sysvec(vector, asm_##function);		\
+}
+
 #else /* !__ASSEMBLY__ */
 
 /*
diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c
index bfeb18fad63f15..2c5b51aad91a03 100644
--- a/arch/x86/kernel/cpu/acrn.c
+++ b/arch/x86/kernel/cpu/acrn.c
@@ -26,8 +26,8 @@ static u32 __init acrn_detect(void)
 
 static void __init acrn_init_platform(void)
 {
-	/* Setup the IDT for ACRN hypervisor callback */
-	alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_acrn_hv_callback);
+	/* Install system interrupt handler for ACRN hypervisor callback */
+	sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_acrn_hv_callback);
 
 	x86_platform.calibrate_tsc = acrn_get_tsc_khz;
 	x86_platform.calibrate_cpu = acrn_get_tsc_khz;
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 01fa06dd06b66c..45e0e70e238cf3 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -539,19 +539,18 @@ static void __init ms_hyperv_init_platform(void)
 	 */
 	x86_platform.apic_post_init = hyperv_init;
 	hyperv_setup_mmu_ops();
-	/* Setup the IDT for hypervisor callback */
-	alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_hyperv_callback);
 
-	/* Setup the IDT for reenlightenment notifications */
+	/* Install system interrupt handler for hypervisor callback */
+	sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_hyperv_callback);
+
+	/* Install system interrupt handler for reenlightenment notifications */
 	if (ms_hyperv.features & HV_ACCESS_REENLIGHTENMENT) {
-		alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR,
-				asm_sysvec_hyperv_reenlightenment);
+		sysvec_install(HYPERV_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment);
 	}
 
-	/* Setup the IDT for stimer0 */
+	/* Install system interrupt handler for stimer0 */
 	if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) {
-		alloc_intr_gate(HYPERV_STIMER0_VECTOR,
-				asm_sysvec_hyperv_stimer0);
+		sysvec_install(HYPERV_STIMER0_VECTOR, sysvec_hyperv_stimer0);
 	}
 
 # ifdef CONFIG_SMP
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 660b601f1d6c33..0cd53fa8c65d1d 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -337,7 +337,7 @@ void idt_invalidate(void)
 	load_idt(&idt);
 }
 
-void __init alloc_intr_gate(unsigned int n, const void *addr)
+void __init idt_install_sysvec(unsigned int n, const void *function)
 {
 	if (WARN_ON(n < FIRST_SYSTEM_VECTOR))
 		return;
@@ -346,5 +346,5 @@ void __init alloc_intr_gate(unsigned int n, const void *addr)
 		return;
 
 	if (!WARN_ON(test_and_set_bit(n, system_vectors)))
-		set_intr_gate(n, addr);
+		set_intr_gate(n, function);
 }
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index dfe9945b9becee..b05557918ae20a 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -829,7 +829,7 @@ static void __init kvm_guest_init(void)
 
 	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
 		static_branch_enable(&kvm_async_pf_enabled);
-		alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_kvm_asyncpf_interrupt);
+		sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_kvm_asyncpf_interrupt);
 	}
 
 #ifdef CONFIG_SMP
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index b8cfea7812d6b6..e2813bac92d40c 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -2216,7 +2216,7 @@ static __init void xen_alloc_callback_vector(void)
 		return;
 
 	pr_info("Xen HVM callback vector for event delivery is enabled\n");
-	alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_xen_hvm_callback);
+	sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_xen_hvm_callback);
 }
 #else
 void xen_setup_callback_vector(void) {}

From 531ff17a705a0f0ecafb8823956f69d5fbfda6fd Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin (Intel)" <hpa@zytor.com>
Date: Tue, 5 Dec 2023 02:50:17 -0800
Subject: [PATCH 363/707] x86/fred: Let ret_from_fork_asm() jmp to
 asm_fred_exit_user when FRED is enabled

Let ret_from_fork_asm() jmp to asm_fred_exit_user when FRED is enabled,
otherwise the existing IDT code is chosen.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Shan Kang <shan.kang@intel.com>
Link: https://lore.kernel.org/r/20231205105030.8698-29-xin3.li@intel.com
---
 arch/x86/entry/entry_64.S      | 6 ++++++
 arch/x86/entry/entry_64_fred.S | 1 +
 2 files changed, 7 insertions(+)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 29ce68f8ede043..7c4b7263b8571e 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -247,7 +247,13 @@ SYM_CODE_START(ret_from_fork_asm)
 	 * and unwind should work normally.
 	 */
 	UNWIND_HINT_REGS
+
+#ifdef CONFIG_X86_FRED
+	ALTERNATIVE "jmp swapgs_restore_regs_and_return_to_usermode", \
+		    "jmp asm_fred_exit_user", X86_FEATURE_FRED
+#else
 	jmp	swapgs_restore_regs_and_return_to_usermode
+#endif
 SYM_CODE_END(ret_from_fork_asm)
 .popsection
 
diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S
index c1ddaf6b068f4d..2271a1c690dc66 100644
--- a/arch/x86/entry/entry_64_fred.S
+++ b/arch/x86/entry/entry_64_fred.S
@@ -32,6 +32,7 @@
 SYM_CODE_START_NOALIGN(asm_fred_entrypoint_user)
 	FRED_ENTER
 	call	fred_entry_from_user
+SYM_INNER_LABEL(asm_fred_exit_user, SYM_L_GLOBAL)
 	FRED_EXIT
 	ERETU
 SYM_CODE_END(asm_fred_entrypoint_user)

From ed63bc7d4953bd5fe93a5c3acef7f485fb216208 Mon Sep 17 00:00:00 2001
From: Xin Li <xin3.li@intel.com>
Date: Tue, 5 Dec 2023 02:50:18 -0800
Subject: [PATCH 364/707] x86/fred: Fixup fault on ERETU by jumping to
 fred_entrypoint_user

If the stack frame contains an invalid user context (e.g. due to invalid SS,
a non-canonical RIP, etc.) the ERETU instruction will trap (#SS or #GP).

From a Linux point of view, this really should be considered a user space
failure, so use the standard fault fixup mechanism to intercept the fault,
fix up the exception frame, and redirect execution to fred_entrypoint_user.
The end result is that it appears just as if the hardware had taken the
exception immediately after completing the transition to user space.

Suggested-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Shan Kang <shan.kang@intel.com>
Link: https://lore.kernel.org/r/20231205105030.8698-30-xin3.li@intel.com
---
 arch/x86/entry/entry_64_fred.S             |  5 +-
 arch/x86/include/asm/extable_fixup_types.h |  4 +-
 arch/x86/mm/extable.c                      | 78 ++++++++++++++++++++++
 3 files changed, 85 insertions(+), 2 deletions(-)

diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S
index 2271a1c690dc66..7fe2722ad90c16 100644
--- a/arch/x86/entry/entry_64_fred.S
+++ b/arch/x86/entry/entry_64_fred.S
@@ -3,6 +3,7 @@
  * The actual FRED entry points.
  */
 
+#include <asm/asm.h>
 #include <asm/fred.h>
 
 #include "calling.h"
@@ -34,7 +35,9 @@ SYM_CODE_START_NOALIGN(asm_fred_entrypoint_user)
 	call	fred_entry_from_user
 SYM_INNER_LABEL(asm_fred_exit_user, SYM_L_GLOBAL)
 	FRED_EXIT
-	ERETU
+1:	ERETU
+
+	_ASM_EXTABLE_TYPE(1b, asm_fred_entrypoint_user, EX_TYPE_ERETU)
 SYM_CODE_END(asm_fred_entrypoint_user)
 
 /*
diff --git a/arch/x86/include/asm/extable_fixup_types.h b/arch/x86/include/asm/extable_fixup_types.h
index fe6312045042f8..7acf0383be8022 100644
--- a/arch/x86/include/asm/extable_fixup_types.h
+++ b/arch/x86/include/asm/extable_fixup_types.h
@@ -64,6 +64,8 @@
 #define	EX_TYPE_UCOPY_LEN4		(EX_TYPE_UCOPY_LEN | EX_DATA_IMM(4))
 #define	EX_TYPE_UCOPY_LEN8		(EX_TYPE_UCOPY_LEN | EX_DATA_IMM(8))
 
-#define EX_TYPE_ZEROPAD			20 /* longword load with zeropad on fault */
+#define	EX_TYPE_ZEROPAD			20 /* longword load with zeropad on fault */
+
+#define	EX_TYPE_ERETU			21
 
 #endif
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 271dcb2deabc31..b522933bfa56e8 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -6,6 +6,7 @@
 #include <xen/xen.h>
 
 #include <asm/fpu/api.h>
+#include <asm/fred.h>
 #include <asm/sev.h>
 #include <asm/traps.h>
 #include <asm/kdebug.h>
@@ -223,6 +224,79 @@ static bool ex_handler_ucopy_len(const struct exception_table_entry *fixup,
 	return ex_handler_uaccess(fixup, regs, trapnr, fault_address);
 }
 
+#ifdef CONFIG_X86_FRED
+static bool ex_handler_eretu(const struct exception_table_entry *fixup,
+			     struct pt_regs *regs, unsigned long error_code)
+{
+	struct pt_regs *uregs = (struct pt_regs *)(regs->sp - offsetof(struct pt_regs, orig_ax));
+	unsigned short ss = uregs->ss;
+	unsigned short cs = uregs->cs;
+
+	/*
+	 * Move the NMI bit from the invalid stack frame, which caused ERETU
+	 * to fault, to the fault handler's stack frame, thus to unblock NMI
+	 * with the fault handler's ERETS instruction ASAP if NMI is blocked.
+	 */
+	regs->fred_ss.nmi = uregs->fred_ss.nmi;
+
+	/*
+	 * Sync event information to uregs, i.e., the ERETU return frame, but
+	 * is it safe to write to the ERETU return frame which is just above
+	 * current event stack frame?
+	 *
+	 * The RSP used by FRED to push a stack frame is not the value in %rsp,
+	 * it is calculated from %rsp with the following 2 steps:
+	 * 1) RSP = %rsp - (IA32_FRED_CONFIG & 0x1c0)	// Reserve N*64 bytes
+	 * 2) RSP = RSP & ~0x3f		// Align to a 64-byte cache line
+	 * when an event delivery doesn't trigger a stack level change.
+	 *
+	 * Here is an example with N*64 (N=1) bytes reserved:
+	 *
+	 *  64-byte cache line ==>  ______________
+	 *                         |___Reserved___|
+	 *                         |__Event_data__|
+	 *                         |_____SS_______|
+	 *                         |_____RSP______|
+	 *                         |_____FLAGS____|
+	 *                         |_____CS_______|
+	 *                         |_____IP_______|
+	 *  64-byte cache line ==> |__Error_code__| <== ERETU return frame
+	 *                         |______________|
+	 *                         |______________|
+	 *                         |______________|
+	 *                         |______________|
+	 *                         |______________|
+	 *                         |______________|
+	 *                         |______________|
+	 *  64-byte cache line ==> |______________| <== RSP after step 1) and 2)
+	 *                         |___Reserved___|
+	 *                         |__Event_data__|
+	 *                         |_____SS_______|
+	 *                         |_____RSP______|
+	 *                         |_____FLAGS____|
+	 *                         |_____CS_______|
+	 *                         |_____IP_______|
+	 *  64-byte cache line ==> |__Error_code__| <== ERETS return frame
+	 *
+	 * Thus a new FRED stack frame will always be pushed below a previous
+	 * FRED stack frame ((N*64) bytes may be reserved between), and it is
+	 * safe to write to a previous FRED stack frame as they never overlap.
+	 */
+	fred_info(uregs)->edata = fred_event_data(regs);
+	uregs->ssx = regs->ssx;
+	uregs->fred_ss.ss = ss;
+	/* The NMI bit was moved away above */
+	uregs->fred_ss.nmi = 0;
+	uregs->csx = regs->csx;
+	uregs->fred_cs.sl = 0;
+	uregs->fred_cs.wfe = 0;
+	uregs->cs = cs;
+	uregs->orig_ax = error_code;
+
+	return ex_handler_default(fixup, regs);
+}
+#endif
+
 int ex_get_fixup_type(unsigned long ip)
 {
 	const struct exception_table_entry *e = search_exception_tables(ip);
@@ -300,6 +374,10 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,
 		return ex_handler_ucopy_len(e, regs, trapnr, fault_addr, reg, imm);
 	case EX_TYPE_ZEROPAD:
 		return ex_handler_zeropad(e, regs, fault_addr);
+#ifdef CONFIG_X86_FRED
+	case EX_TYPE_ERETU:
+		return ex_handler_eretu(e, regs, error_code);
+#endif
 	}
 	BUG();
 }

From 8c968f4df73c62be94229c7dbbb330ba9fadbd50 Mon Sep 17 00:00:00 2001
From: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Date: Tue, 5 Dec 2023 02:50:19 -0800
Subject: [PATCH 365/707] x86/entry/calling: Allow PUSH_AND_CLEAR_REGS being
 used beyond actual entry code

PUSH_AND_CLEAR_REGS could be used besides actual entry code; in that case
%rbp shouldn't be cleared (otherwise the frame pointer is destroyed) and
UNWIND_HINT shouldn't be added.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Shan Kang <shan.kang@intel.com>
Link: https://lore.kernel.org/r/20231205105030.8698-31-xin3.li@intel.com
---
 arch/x86/entry/calling.h | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 9f1d94790a5491..3ff925b17b7ed5 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -65,7 +65,7 @@ For 32-bit we have the following conventions - kernel is built with
  * for assembly code:
  */
 
-.macro PUSH_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0
+.macro PUSH_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 unwind_hint=1
 	.if \save_ret
 	pushq	%rsi		/* pt_regs->si */
 	movq	8(%rsp), %rsi	/* temporarily store the return address in %rsi */
@@ -87,14 +87,17 @@ For 32-bit we have the following conventions - kernel is built with
 	pushq	%r13		/* pt_regs->r13 */
 	pushq	%r14		/* pt_regs->r14 */
 	pushq	%r15		/* pt_regs->r15 */
+
+	.if \unwind_hint
 	UNWIND_HINT_REGS
+	.endif
 
 	.if \save_ret
 	pushq	%rsi		/* return address on top of stack */
 	.endif
 .endm
 
-.macro CLEAR_REGS
+.macro CLEAR_REGS clear_bp=1
 	/*
 	 * Sanitize registers of values that a speculation attack might
 	 * otherwise want to exploit. The lower registers are likely clobbered
@@ -109,7 +112,9 @@ For 32-bit we have the following conventions - kernel is built with
 	xorl	%r10d, %r10d	/* nospec r10 */
 	xorl	%r11d, %r11d	/* nospec r11 */
 	xorl	%ebx,  %ebx	/* nospec rbx */
+	.if \clear_bp
 	xorl	%ebp,  %ebp	/* nospec rbp */
+	.endif
 	xorl	%r12d, %r12d	/* nospec r12 */
 	xorl	%r13d, %r13d	/* nospec r13 */
 	xorl	%r14d, %r14d	/* nospec r14 */
@@ -117,9 +122,9 @@ For 32-bit we have the following conventions - kernel is built with
 
 .endm
 
-.macro PUSH_AND_CLEAR_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0
-	PUSH_REGS rdx=\rdx, rcx=\rcx, rax=\rax, save_ret=\save_ret
-	CLEAR_REGS
+.macro PUSH_AND_CLEAR_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 clear_bp=1 unwind_hint=1
+	PUSH_REGS rdx=\rdx, rcx=\rcx, rax=\rax, save_ret=\save_ret unwind_hint=\unwind_hint
+	CLEAR_REGS clear_bp=\clear_bp
 .endm
 
 .macro POP_REGS pop_rdi=1

From d8fbd04962865730bb67106e862bfbe363a9c284 Mon Sep 17 00:00:00 2001
From: Xin Li <xin3.li@intel.com>
Date: Tue, 5 Dec 2023 02:50:20 -0800
Subject: [PATCH 366/707] x86/entry: Add fred_entry_from_kvm() for VMX to
 handle IRQ/NMI

In IRQ/NMI induced VM exits, KVM VMX needs to execute the respective
handlers, which requires the software to create a FRED stack frame,
and use it to invoke the handlers. Add fred_irq_entry_from_kvm() for
this job.

Export fred_entry_from_kvm() because VMX can be compiled as a module.

Suggested-by: Sean Christopherson <seanjc@google.com>
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Shan Kang <shan.kang@intel.com>
Link: https://lore.kernel.org/r/20231205105030.8698-32-xin3.li@intel.com
---
 arch/x86/entry/entry_64_fred.S | 77 ++++++++++++++++++++++++++++++++++
 arch/x86/entry/entry_fred.c    | 14 +++++++
 arch/x86/include/asm/fred.h    | 18 ++++++++
 3 files changed, 109 insertions(+)

diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S
index 7fe2722ad90c16..a02bc6f3d2e6a4 100644
--- a/arch/x86/entry/entry_64_fred.S
+++ b/arch/x86/entry/entry_64_fred.S
@@ -3,8 +3,11 @@
  * The actual FRED entry points.
  */
 
+#include <linux/export.h>
+
 #include <asm/asm.h>
 #include <asm/fred.h>
+#include <asm/segment.h>
 
 #include "calling.h"
 
@@ -52,3 +55,77 @@ SYM_CODE_START_NOALIGN(asm_fred_entrypoint_kernel)
 	FRED_EXIT
 	ERETS
 SYM_CODE_END(asm_fred_entrypoint_kernel)
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+SYM_FUNC_START(asm_fred_entry_from_kvm)
+	push %rbp
+	mov %rsp, %rbp
+
+	UNWIND_HINT_SAVE
+
+	/*
+	 * Both IRQ and NMI from VMX can be handled on current task stack
+	 * because there is no need to protect from reentrancy and the call
+	 * stack leading to this helper is effectively constant and shallow
+	 * (relatively speaking). Do the same when FRED is active, i.e., no
+	 * need to check current stack level for a stack switch.
+	 *
+	 * Emulate the FRED-defined redzone and stack alignment.
+	 */
+	sub $(FRED_CONFIG_REDZONE_AMOUNT << 6), %rsp
+	and $FRED_STACK_FRAME_RSP_MASK, %rsp
+
+	/*
+	 * Start to push a FRED stack frame, which is always 64 bytes:
+	 *
+	 * +--------+-----------------+
+	 * | Bytes  | Usage           |
+	 * +--------+-----------------+
+	 * | 63:56  | Reserved        |
+	 * | 55:48  | Event Data      |
+	 * | 47:40  | SS + Event Info |
+	 * | 39:32  | RSP             |
+	 * | 31:24  | RFLAGS          |
+	 * | 23:16  | CS + Aux Info   |
+	 * |  15:8  | RIP             |
+	 * |   7:0  | Error Code      |
+	 * +--------+-----------------+
+	 */
+	push $0				/* Reserved, must be 0 */
+	push $0				/* Event data, 0 for IRQ/NMI */
+	push %rdi			/* fred_ss handed in by the caller */
+	push %rbp
+	pushf
+	mov $__KERNEL_CS, %rax
+	push %rax
+
+	/*
+	 * Unlike the IDT event delivery, FRED _always_ pushes an error code
+	 * after pushing the return RIP, thus the CALL instruction CANNOT be
+	 * used here to push the return RIP, otherwise there is no chance to
+	 * push an error code before invoking the IRQ/NMI handler.
+	 *
+	 * Use LEA to get the return RIP and push it, then push an error code.
+	 */
+	lea 1f(%rip), %rax
+	push %rax				/* Return RIP */
+	push $0					/* Error code, 0 for IRQ/NMI */
+
+	PUSH_AND_CLEAR_REGS clear_bp=0 unwind_hint=0
+	movq %rsp, %rdi				/* %rdi -> pt_regs */
+	call __fred_entry_from_kvm		/* Call the C entry point */
+	POP_REGS
+	ERETS
+1:
+	/*
+	 * Objtool doesn't understand what ERETS does, this hint tells it that
+	 * yes, we'll reach here and with what stack state. A save/restore pair
+	 * isn't strictly needed, but it's the simplest form.
+	 */
+	UNWIND_HINT_RESTORE
+	pop %rbp
+	RET
+
+SYM_FUNC_END(asm_fred_entry_from_kvm)
+EXPORT_SYMBOL_GPL(asm_fred_entry_from_kvm);
+#endif
diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c
index 3be0269bc0d465..6ecc08b6d72a24 100644
--- a/arch/x86/entry/entry_fred.c
+++ b/arch/x86/entry/entry_fred.c
@@ -257,3 +257,17 @@ __visible noinstr void fred_entry_from_kernel(struct pt_regs *regs)
 
 	return fred_bad_type(regs, error_code);
 }
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+__visible noinstr void __fred_entry_from_kvm(struct pt_regs *regs)
+{
+	switch (regs->fred_ss.type) {
+	case EVENT_TYPE_EXTINT:
+		return fred_extint(regs);
+	case EVENT_TYPE_NMI:
+		return fred_exc_nmi(regs);
+	default:
+		WARN_ON_ONCE(1);
+	}
+}
+#endif
diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h
index 16a64ffecbf8d5..2fa9f34e5c95f5 100644
--- a/arch/x86/include/asm/fred.h
+++ b/arch/x86/include/asm/fred.h
@@ -9,6 +9,7 @@
 #include <linux/const.h>
 
 #include <asm/asm.h>
+#include <asm/trapnr.h>
 
 /*
  * FRED event return instruction opcodes for ERET{S,U}; supported in
@@ -62,12 +63,29 @@ static __always_inline unsigned long fred_event_data(struct pt_regs *regs)
 
 void asm_fred_entrypoint_user(void);
 void asm_fred_entrypoint_kernel(void);
+void asm_fred_entry_from_kvm(struct fred_ss);
 
 __visible void fred_entry_from_user(struct pt_regs *regs);
 __visible void fred_entry_from_kernel(struct pt_regs *regs);
+__visible void __fred_entry_from_kvm(struct pt_regs *regs);
+
+/* Can be called from noinstr code, thus __always_inline */
+static __always_inline void fred_entry_from_kvm(unsigned int type, unsigned int vector)
+{
+	struct fred_ss ss = {
+		.ss     =__KERNEL_DS,
+		.type   = type,
+		.vector = vector,
+		.nmi    = type == EVENT_TYPE_NMI,
+		.lm     = 1,
+	};
+
+	asm_fred_entry_from_kvm(ss);
+}
 
 #else /* CONFIG_X86_FRED */
 static __always_inline unsigned long fred_event_data(struct pt_regs *regs) { return 0; }
+static __always_inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) { }
 #endif /* CONFIG_X86_FRED */
 #endif /* !__ASSEMBLY__ */
 

From cb5429aaa0c53d60414a08fb40f8d15d748c4cda Mon Sep 17 00:00:00 2001
From: Xin Li <xin3.li@intel.com>
Date: Tue, 5 Dec 2023 02:50:21 -0800
Subject: [PATCH 367/707] KVM: VMX: Call fred_entry_from_kvm() for IRQ/NMI
 handling

When FRED is enabled, call fred_entry_from_kvm() to handle IRQ/NMI in
IRQ/NMI induced VM exits.

Signed-off-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Shan Kang <shan.kang@intel.com>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Link: https://lore.kernel.org/r/20231205105030.8698-33-xin3.li@intel.com
---
 arch/x86/kvm/vmx/vmx.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index e262bc2ba4e569..cce92f701deeed 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -38,6 +38,7 @@
 #include <asm/desc.h>
 #include <asm/fpu/api.h>
 #include <asm/fpu/xstate.h>
+#include <asm/fred.h>
 #include <asm/idtentry.h>
 #include <asm/io.h>
 #include <asm/irq_remapping.h>
@@ -6960,14 +6961,16 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
 {
 	u32 intr_info = vmx_get_intr_info(vcpu);
 	unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
-	gate_desc *desc = (gate_desc *)host_idt_base + vector;
 
 	if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
 	    "unexpected VM-Exit interrupt info: 0x%x", intr_info))
 		return;
 
 	kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
-	vmx_do_interrupt_irqoff(gate_offset(desc));
+	if (cpu_feature_enabled(X86_FEATURE_FRED))
+		fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector);
+	else
+		vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector));
 	kvm_after_interrupt(vcpu);
 
 	vcpu->arch.at_instruction_boundary = true;
@@ -7260,7 +7263,10 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
 	if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI &&
 	    is_nmi(vmx_get_intr_info(vcpu))) {
 		kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
-		vmx_do_nmi_irqoff();
+		if (cpu_feature_enabled(X86_FEATURE_FRED))
+			fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
+		else
+			vmx_do_nmi_irqoff();
 		kvm_after_interrupt(vcpu);
 	}
 

From ae46f3978ae4eb9da013ff9963f105de2db2f8ec Mon Sep 17 00:00:00 2001
From: Xin Li <xin3.li@intel.com>
Date: Tue, 5 Dec 2023 02:50:22 -0800
Subject: [PATCH 368/707] x86/syscall: Split IDT syscall setup code into
 idt_syscall_init()

Because FRED uses the ring 3 FRED entrypoint for SYSCALL and SYSENTER and
ERETU is the only legit instruction to return to ring 3, there is NO need
to setup SYSCALL and SYSENTER MSRs for FRED, except the IA32_STAR MSR.

Split IDT syscall setup code into idt_syscall_init() to make it easy to
skip syscall setup code when FRED is enabled.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Shan Kang <shan.kang@intel.com>
Link: https://lore.kernel.org/r/20231205105030.8698-34-xin3.li@intel.com
---
 arch/x86/kernel/cpu/common.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c3a175770df032..4f5e4aa35e5a9e 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -2066,10 +2066,8 @@ static void wrmsrl_cstar(unsigned long val)
 		wrmsrl(MSR_CSTAR, val);
 }
 
-/* May not be marked __init: used by software suspend */
-void syscall_init(void)
+static inline void idt_syscall_init(void)
 {
-	wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
 	wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
 
 	if (ia32_enabled()) {
@@ -2103,6 +2101,15 @@ void syscall_init(void)
 	       X86_EFLAGS_AC|X86_EFLAGS_ID);
 }
 
+/* May not be marked __init: used by software suspend */
+void syscall_init(void)
+{
+	/* The default user and kernel segments */
+	wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
+
+	idt_syscall_init();
+}
+
 #else	/* CONFIG_X86_64 */
 
 #ifdef CONFIG_STACKPROTECTOR

From 43ca697baecf3c90fe108a61cf444de20bbfa5b9 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin (Intel)" <hpa@zytor.com>
Date: Tue, 5 Dec 2023 02:50:23 -0800
Subject: [PATCH 369/707] x86/fred: Add FRED initialization functions

Add cpu_init_fred_exceptions() to:
  - Set FRED entrypoints for events happening in ring 0 and 3.
  - Specify the stack level for IRQs occurred ring 0.
  - Specify dedicated event stacks for #DB/NMI/#MCE/#DF.
  - Enable FRED and invalidtes IDT.
  - Force 32-bit system calls to use "int $0x80" only.

Add fred_complete_exception_setup() to:
  - Initialize system_vectors as done for IDT systems.
  - Set unused sysvec_table entries to fred_handle_spurious_interrupt().

Co-developed-by: Xin Li <xin3.li@intel.com>
Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Shan Kang <shan.kang@intel.com>
Link: https://lore.kernel.org/r/20231205105030.8698-35-xin3.li@intel.com
---
 arch/x86/entry/entry_fred.c | 21 +++++++++++++
 arch/x86/include/asm/fred.h |  5 ++++
 arch/x86/kernel/Makefile    |  1 +
 arch/x86/kernel/fred.c      | 59 +++++++++++++++++++++++++++++++++++++
 4 files changed, 86 insertions(+)
 create mode 100644 arch/x86/kernel/fred.c

diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c
index 6ecc08b6d72a24..ac120cbdaaf2b4 100644
--- a/arch/x86/entry/entry_fred.c
+++ b/arch/x86/entry/entry_fred.c
@@ -133,6 +133,27 @@ void __init fred_install_sysvec(unsigned int sysvec, idtentry_t handler)
 		 sysvec_table[sysvec - FIRST_SYSTEM_VECTOR] = handler;
 }
 
+static noinstr void fred_handle_spurious_interrupt(struct pt_regs *regs)
+{
+	spurious_interrupt(regs, regs->fred_ss.vector);
+}
+
+void __init fred_complete_exception_setup(void)
+{
+	unsigned int vector;
+
+	for (vector = 0; vector < FIRST_EXTERNAL_VECTOR; vector++)
+		set_bit(vector, system_vectors);
+
+	for (vector = 0; vector < NR_SYSTEM_VECTORS; vector++) {
+		if (sysvec_table[vector])
+			set_bit(vector + FIRST_SYSTEM_VECTOR, system_vectors);
+		else
+			sysvec_table[vector] = fred_handle_spurious_interrupt;
+	}
+	fred_setup_done = true;
+}
+
 static noinstr void fred_extint(struct pt_regs *regs)
 {
 	unsigned int vector = regs->fred_ss.vector;
diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h
index 2fa9f34e5c95f5..e86c7ba32435f5 100644
--- a/arch/x86/include/asm/fred.h
+++ b/arch/x86/include/asm/fred.h
@@ -83,8 +83,13 @@ static __always_inline void fred_entry_from_kvm(unsigned int type, unsigned int
 	asm_fred_entry_from_kvm(ss);
 }
 
+void cpu_init_fred_exceptions(void);
+void fred_complete_exception_setup(void);
+
 #else /* CONFIG_X86_FRED */
 static __always_inline unsigned long fred_event_data(struct pt_regs *regs) { return 0; }
+static inline void cpu_init_fred_exceptions(void) { }
+static inline void fred_complete_exception_setup(void) { }
 static __always_inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) { }
 #endif /* CONFIG_X86_FRED */
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 0000325ab98f4d..0dcbfc1a4c419f 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -48,6 +48,7 @@ obj-y			+= platform-quirks.o
 obj-y			+= process_$(BITS).o signal.o signal_$(BITS).o
 obj-y			+= traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
 obj-y			+= time.o ioport.o dumpstack.o nmi.o
+obj-$(CONFIG_X86_FRED)	+= fred.o
 obj-$(CONFIG_MODIFY_LDT_SYSCALL)	+= ldt.o
 obj-$(CONFIG_X86_KERNEL_IBT)		+= ibt_selftest.o
 obj-y			+= setup.o x86_init.o i8259.o irqinit.o
diff --git a/arch/x86/kernel/fred.c b/arch/x86/kernel/fred.c
new file mode 100644
index 00000000000000..4bcd8791ad96ad
--- /dev/null
+++ b/arch/x86/kernel/fred.c
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/kernel.h>
+
+#include <asm/desc.h>
+#include <asm/fred.h>
+#include <asm/tlbflush.h>
+#include <asm/traps.h>
+
+/* #DB in the kernel would imply the use of a kernel debugger. */
+#define FRED_DB_STACK_LEVEL		1UL
+#define FRED_NMI_STACK_LEVEL		2UL
+#define FRED_MC_STACK_LEVEL		2UL
+/*
+ * #DF is the highest level because a #DF means "something went wrong
+ * *while delivering an exception*." The number of cases for which that
+ * can happen with FRED is drastically reduced and basically amounts to
+ * "the stack you pointed me to is broken." Thus, always change stacks
+ * on #DF, which means it should be at the highest level.
+ */
+#define FRED_DF_STACK_LEVEL		3UL
+
+#define FRED_STKLVL(vector, lvl)	((lvl) << (2 * (vector)))
+
+void cpu_init_fred_exceptions(void)
+{
+	/* When FRED is enabled by default, remove this log message */
+	pr_info("Initialize FRED on CPU%d\n", smp_processor_id());
+
+	wrmsrl(MSR_IA32_FRED_CONFIG,
+	       /* Reserve for CALL emulation */
+	       FRED_CONFIG_REDZONE |
+	       FRED_CONFIG_INT_STKLVL(0) |
+	       FRED_CONFIG_ENTRYPOINT(asm_fred_entrypoint_user));
+
+	/*
+	 * The purpose of separate stacks for NMI, #DB and #MC *in the kernel*
+	 * (remember that user space faults are always taken on stack level 0)
+	 * is to avoid overflowing the kernel stack.
+	 */
+	wrmsrl(MSR_IA32_FRED_STKLVLS,
+	       FRED_STKLVL(X86_TRAP_DB,  FRED_DB_STACK_LEVEL) |
+	       FRED_STKLVL(X86_TRAP_NMI, FRED_NMI_STACK_LEVEL) |
+	       FRED_STKLVL(X86_TRAP_MC,  FRED_MC_STACK_LEVEL) |
+	       FRED_STKLVL(X86_TRAP_DF,  FRED_DF_STACK_LEVEL));
+
+	/* The FRED equivalents to IST stacks... */
+	wrmsrl(MSR_IA32_FRED_RSP1, __this_cpu_ist_top_va(DB));
+	wrmsrl(MSR_IA32_FRED_RSP2, __this_cpu_ist_top_va(NMI));
+	wrmsrl(MSR_IA32_FRED_RSP3, __this_cpu_ist_top_va(DF));
+
+	/* Enable FRED */
+	cr4_set_bits(X86_CR4_FRED);
+	/* Any further IDT use is a bug */
+	idt_invalidate();
+
+	/* Use int $0x80 for 32-bit system calls in FRED mode */
+	setup_clear_cpu_cap(X86_FEATURE_SYSENTER32);
+	setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
+}

From b564b0111a3f03d1a92ba87c4b0f054ad1845963 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin (Intel)" <hpa@zytor.com>
Date: Tue, 5 Dec 2023 02:50:24 -0800
Subject: [PATCH 370/707] x86/fred: Invoke FRED initialization code to enable
 FRED

Let cpu_init_exception_handling() call cpu_init_fred_exceptions() to
initialize FRED. However if FRED is unavailable or disabled, it falls
back to set up TSS IST and initialize IDT.

Co-developed-by: Xin Li <xin3.li@intel.com>
Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Shan Kang <shan.kang@intel.com>
Link: https://lore.kernel.org/r/20231205105030.8698-36-xin3.li@intel.com
---
 arch/x86/kernel/cpu/common.c | 22 +++++++++++++++++-----
 arch/x86/kernel/irqinit.c    |  7 ++++++-
 arch/x86/kernel/traps.c      |  5 ++++-
 3 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4f5e4aa35e5a9e..cf82e3181f7a61 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -61,6 +61,7 @@
 #include <asm/microcode.h>
 #include <asm/intel-family.h>
 #include <asm/cpu_device_id.h>
+#include <asm/fred.h>
 #include <asm/uv/uv.h>
 #include <asm/ia32.h>
 #include <asm/set_memory.h>
@@ -2107,7 +2108,15 @@ void syscall_init(void)
 	/* The default user and kernel segments */
 	wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
 
-	idt_syscall_init();
+	/*
+	 * Except the IA32_STAR MSR, there is NO need to setup SYSCALL and
+	 * SYSENTER MSRs for FRED, because FRED uses the ring 3 FRED
+	 * entrypoint for SYSCALL and SYSENTER, and ERETU is the only legit
+	 * instruction to return to ring 3 (both sysexit and sysret cause
+	 * #UD when FRED is enabled).
+	 */
+	if (!cpu_feature_enabled(X86_FEATURE_FRED))
+		idt_syscall_init();
 }
 
 #else	/* CONFIG_X86_64 */
@@ -2213,8 +2222,9 @@ void cpu_init_exception_handling(void)
 	/* paranoid_entry() gets the CPU number from the GDT */
 	setup_getcpu(cpu);
 
-	/* IST vectors need TSS to be set up. */
-	tss_setup_ist(tss);
+	/* For IDT mode, IST vectors need to be set in TSS. */
+	if (!cpu_feature_enabled(X86_FEATURE_FRED))
+		tss_setup_ist(tss);
 	tss_setup_io_bitmap(tss);
 	set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
 
@@ -2223,8 +2233,10 @@ void cpu_init_exception_handling(void)
 	/* GHCB needs to be setup to handle #VC. */
 	setup_ghcb();
 
-	/* Finally load the IDT */
-	load_current_idt();
+	if (cpu_feature_enabled(X86_FEATURE_FRED))
+		cpu_init_fred_exceptions();
+	else
+		load_current_idt();
 }
 
 /*
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index c683666876f1c7..f79c5edc0b892d 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -28,6 +28,7 @@
 #include <asm/setup.h>
 #include <asm/i8259.h>
 #include <asm/traps.h>
+#include <asm/fred.h>
 #include <asm/prom.h>
 
 /*
@@ -96,7 +97,11 @@ void __init native_init_IRQ(void)
 	/* Execute any quirks before the call gates are initialised: */
 	x86_init.irqs.pre_vector_init();
 
-	idt_setup_apic_and_irq_gates();
+	if (cpu_feature_enabled(X86_FEATURE_FRED))
+		fred_complete_exception_setup();
+	else
+		idt_setup_apic_and_irq_gates();
+
 	lapic_assign_system_vectors();
 
 	if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) {
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 1b19a170f5e35e..6cb31df3d5ffbe 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -1438,7 +1438,10 @@ void __init trap_init(void)
 
 	/* Initialize TSS before setting up traps so ISTs work */
 	cpu_init_exception_handling();
+
 	/* Setup traps as cpu_init() might #GP */
-	idt_setup_traps();
+	if (!cpu_feature_enabled(X86_FEATURE_FRED))
+		idt_setup_traps();
+
 	cpu_init();
 }

From 8e7d967f04df0fa2c2db00f47ac4cd5ea16ade91 Mon Sep 17 00:00:00 2001
From: Patrick Rudolph <patrick.rudolph@9elements.com>
Date: Tue, 30 Jan 2024 17:49:00 +0530
Subject: [PATCH 371/707] dt-bindings: i2c: pca954x: Add custom properties for
 MAX7357

Maxim Max7357 has a configuration register to enable additional
features. These features aren't enabled by default & its up to
board designer to enable the same as it may have unexpected side effects.

These should be validated for proper functioning & detection of devices
in secondary bus as sometimes it can cause secondary bus being disabled.

Add booleans for:
 - maxim,isolate-stuck-channel
 - maxim,send-flush-out-sequence
 - maxim,preconnection-wiggle-test-enable

Signed-off-by: Patrick Rudolph <patrick.rudolph@9elements.com>
Signed-off-by: Naresh Solanki <naresh.solanki@9elements.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Andi Shyti <andi.shyti@kernel.org>
---
 .../bindings/i2c/i2c-mux-pca954x.yaml         | 30 +++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/Documentation/devicetree/bindings/i2c/i2c-mux-pca954x.yaml b/Documentation/devicetree/bindings/i2c/i2c-mux-pca954x.yaml
index 2d7bb998b0e9d2..9aa0585200c9cd 100644
--- a/Documentation/devicetree/bindings/i2c/i2c-mux-pca954x.yaml
+++ b/Documentation/devicetree/bindings/i2c/i2c-mux-pca954x.yaml
@@ -71,6 +71,23 @@ properties:
     description: A voltage regulator supplying power to the chip. On PCA9846
       the regulator supplies power to VDD2 (core logic) and optionally to VDD1.
 
+  maxim,isolate-stuck-channel:
+    type: boolean
+    description: Allows to use non faulty channels while a stuck channel is
+      isolated from the upstream bus. If not set all channels are isolated from
+      the upstream bus until the fault is cleared.
+
+  maxim,send-flush-out-sequence:
+    type: boolean
+    description: Send a flush-out sequence to stuck auxiliary buses
+      automatically after a stuck channel is being detected.
+
+  maxim,preconnection-wiggle-test-enable:
+    type: boolean
+    description: Send a STOP condition to the auxiliary buses when the switch
+      register activates a channel to detect a stuck high fault. On fault the
+      channel is isolated from the upstream bus.
+
 required:
   - compatible
   - reg
@@ -95,6 +112,19 @@ allOf:
         "#interrupt-cells": false
         interrupt-controller: false
 
+  - if:
+      not:
+        properties:
+          compatible:
+            contains:
+              enum:
+                - maxim,max7357
+    then:
+      properties:
+        maxim,isolate-stuck-channel: false
+        maxim,send-flush-out-sequence: false
+        maxim,preconnection-wiggle-test-enable: false
+
 unevaluatedProperties: false
 
 examples:

From 6b572ea231236bb3be4b819d92119470ac121a9e Mon Sep 17 00:00:00 2001
From: Patrick Rudolph <patrick.rudolph@9elements.com>
Date: Tue, 30 Jan 2024 17:49:01 +0530
Subject: [PATCH 372/707] i2c: muxes: pca954x: Enable features on MAX7357

Enable additional features based on DT settings and unconditionally
release the shared interrupt pin after 1.6 seconds and allow to use
it as reset.

These features aren't enabled by default and it's up to board designer
to validate for proper functioning and detection of devices in secondary
bus as sometimes it can cause secondary bus being disabled.

Signed-off-by: Patrick Rudolph <patrick.rudolph@9elements.com>
Signed-off-by: Naresh Solanki <naresh.solanki@9elements.com>
Reviewed-by: Andi Shyti <andi.shyti@kernel.org>
Signed-off-by: Andi Shyti <andi.shyti@kernel.org>
---
 drivers/i2c/muxes/i2c-mux-pca954x.c | 43 ++++++++++++++++++++++++++++-
 1 file changed, 42 insertions(+), 1 deletion(-)

diff --git a/drivers/i2c/muxes/i2c-mux-pca954x.c b/drivers/i2c/muxes/i2c-mux-pca954x.c
index 2219062104fbca..f5dfc33b97c0ab 100644
--- a/drivers/i2c/muxes/i2c-mux-pca954x.c
+++ b/drivers/i2c/muxes/i2c-mux-pca954x.c
@@ -57,6 +57,20 @@
 
 #define PCA954X_IRQ_OFFSET 4
 
+/*
+ * MAX7357's configuration register is writeable after POR, but
+ * can be locked by setting the basic mode bit. MAX7358 configuration
+ * register is locked by default and needs to be unlocked first.
+ * The configuration register holds the following settings:
+ */
+#define MAX7357_CONF_INT_ENABLE			BIT(0)
+#define MAX7357_CONF_FLUSH_OUT			BIT(1)
+#define MAX7357_CONF_RELEASE_INT		BIT(2)
+#define MAX7357_CONF_DISCON_SINGLE_CHAN		BIT(4)
+#define MAX7357_CONF_PRECONNECT_TEST		BIT(7)
+
+#define MAX7357_POR_DEFAULT_CONF		MAX7357_CONF_INT_ENABLE
+
 enum pca_type {
 	max_7356,
 	max_7357,
@@ -470,7 +484,34 @@ static int pca954x_init(struct i2c_client *client, struct pca954x *data)
 	else
 		data->last_chan = 0; /* Disconnect multiplexer */
 
-	ret = i2c_smbus_write_byte(client, data->last_chan);
+	if (device_is_compatible(&client->dev, "maxim,max7357")) {
+		if (i2c_check_functionality(client->adapter, I2C_FUNC_SMBUS_WRITE_BYTE_DATA)) {
+			u8 conf = MAX7357_POR_DEFAULT_CONF;
+			/*
+			 * The interrupt signal is shared with the reset pin. Release the
+			 * interrupt after 1.6 seconds to allow using the pin as reset.
+			 */
+			conf |= MAX7357_CONF_RELEASE_INT;
+
+			if (device_property_read_bool(&client->dev, "maxim,isolate-stuck-channel"))
+				conf |= MAX7357_CONF_DISCON_SINGLE_CHAN;
+			if (device_property_read_bool(&client->dev,
+						      "maxim,send-flush-out-sequence"))
+				conf |= MAX7357_CONF_FLUSH_OUT;
+			if (device_property_read_bool(&client->dev,
+						      "maxim,preconnection-wiggle-test-enable"))
+				conf |= MAX7357_CONF_PRECONNECT_TEST;
+
+			ret = i2c_smbus_write_byte_data(client, data->last_chan, conf);
+		} else {
+			dev_warn(&client->dev, "Write byte data not supported."
+				 "Cannot enable enhanced mode features\n");
+			ret = i2c_smbus_write_byte(client, data->last_chan);
+		}
+	} else {
+		ret = i2c_smbus_write_byte(client, data->last_chan);
+	}
+
 	if (ret < 0)
 		data->last_chan = 0;
 

From 6ae2b145edd725c2234c7fde36ebcc5e1a4d4e7d Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Sun, 28 Jan 2024 00:32:45 +0800
Subject: [PATCH 373/707] arm64: dts: allwinner: h6: Add RX DMA channel for
 SPDIF

The SPDIF hardware found on the H6 supports both transmit and receive
functions. However it is missing the RX DMA channel.

Add the SPDIF hardware block's RX DMA channel. Also remove the
by-default pinmux, since the end device can choose to implement
either or both functionalities.

Fixes: f95b598df419 ("arm64: dts: allwinner: Add SPDIF node for Allwinner H6")
Signed-off-by: Chen-Yu Tsai <wens@csie.org>
Reviewed-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Jernej Skrabec <jernej.skrabec@gmail.com>
Link: https://lore.kernel.org/r/20240127163247.384439-6-wens@kernel.org
Signed-off-by: Jernej Skrabec <jernej.skrabec@gmail.com>
---
 arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts | 2 ++
 arch/arm64/boot/dts/allwinner/sun50i-h6-tanix.dtsi      | 2 ++
 arch/arm64/boot/dts/allwinner/sun50i-h6.dtsi            | 7 +++----
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts b/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts
index 9ec49ac2f6fd5d..381d58cea092d9 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts
@@ -291,6 +291,8 @@
 };
 
 &spdif {
+	pinctrl-names = "default";
+	pinctrl-0 = <&spdif_tx_pin>;
 	status = "okay";
 };
 
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h6-tanix.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-h6-tanix.dtsi
index 4903d6358112de..855b7d43bc503a 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-h6-tanix.dtsi
+++ b/arch/arm64/boot/dts/allwinner/sun50i-h6-tanix.dtsi
@@ -166,6 +166,8 @@
 };
 
 &spdif {
+	pinctrl-names = "default";
+	pinctrl-0 = <&spdif_tx_pin>;
 	status = "okay";
 };
 
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h6.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-h6.dtsi
index ca1d287a0a01d9..d11e5041bae9a4 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-h6.dtsi
+++ b/arch/arm64/boot/dts/allwinner/sun50i-h6.dtsi
@@ -406,6 +406,7 @@
 				function = "spi1";
 			};
 
+			/omit-if-no-ref/
 			spdif_tx_pin: spdif-tx-pin {
 				pins = "PH7";
 				function = "spdif";
@@ -655,10 +656,8 @@
 			clocks = <&ccu CLK_BUS_SPDIF>, <&ccu CLK_SPDIF>;
 			clock-names = "apb", "spdif";
 			resets = <&ccu RST_BUS_SPDIF>;
-			dmas = <&dma 2>;
-			dma-names = "tx";
-			pinctrl-names = "default";
-			pinctrl-0 = <&spdif_tx_pin>;
+			dmas = <&dma 2>, <&dma 2>;
+			dma-names = "rx", "tx";
 			status = "disabled";
 		};
 

From adb3ebfc285eb2c0ad67039bda58683be415647f Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Sun, 28 Jan 2024 00:32:46 +0800
Subject: [PATCH 374/707] arm64: dts: allwinner: h616: Add DMA controller and
 DMA channels

The DMA controllers found on the H616 and H618 are the same as the one
found on the A100. The only difference is the DMA endpoint (DRQ) layout.

Add a device node for it, and add DMA channels for existing peripherals.

Signed-off-by: Chen-Yu Tsai <wens@csie.org>
Reviewed-by: Andre Przywara <andre.przywara@arm.com>
Link: https://lore.kernel.org/r/20240127163247.384439-7-wens@kernel.org
Signed-off-by: Jernej Skrabec <jernej.skrabec@gmail.com>
---
 .../arm64/boot/dts/allwinner/sun50i-h616.dtsi | 41 +++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi
index d549d277d9729f..885809137b9de3 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi
+++ b/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi
@@ -133,6 +133,19 @@
 			#reset-cells = <1>;
 		};
 
+		dma: dma-controller@3002000 {
+			compatible = "allwinner,sun50i-h616-dma",
+				     "allwinner,sun50i-a100-dma";
+			reg = <0x03002000 0x1000>;
+			interrupts = <GIC_SPI 42 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&ccu CLK_BUS_DMA>, <&ccu CLK_MBUS_DMA>;
+			clock-names = "bus", "mbus";
+			dma-channels = <16>;
+			dma-requests = <49>;
+			resets = <&ccu RST_BUS_DMA>;
+			#dma-cells = <1>;
+		};
+
 		sid: efuse@3006000 {
 			compatible = "allwinner,sun50i-h616-sid", "allwinner,sun50i-a64-sid";
 			reg = <0x03006000 0x1000>;
@@ -339,6 +352,8 @@
 			reg-shift = <2>;
 			reg-io-width = <4>;
 			clocks = <&ccu CLK_BUS_UART0>;
+			dmas = <&dma 14>, <&dma 14>;
+			dma-names = "tx", "rx";
 			resets = <&ccu RST_BUS_UART0>;
 			status = "disabled";
 		};
@@ -350,6 +365,8 @@
 			reg-shift = <2>;
 			reg-io-width = <4>;
 			clocks = <&ccu CLK_BUS_UART1>;
+			dmas = <&dma 15>, <&dma 15>;
+			dma-names = "tx", "rx";
 			resets = <&ccu RST_BUS_UART1>;
 			status = "disabled";
 		};
@@ -361,6 +378,8 @@
 			reg-shift = <2>;
 			reg-io-width = <4>;
 			clocks = <&ccu CLK_BUS_UART2>;
+			dmas = <&dma 16>, <&dma 16>;
+			dma-names = "tx", "rx";
 			resets = <&ccu RST_BUS_UART2>;
 			status = "disabled";
 		};
@@ -372,6 +391,8 @@
 			reg-shift = <2>;
 			reg-io-width = <4>;
 			clocks = <&ccu CLK_BUS_UART3>;
+			dmas = <&dma 17>, <&dma 17>;
+			dma-names = "tx", "rx";
 			resets = <&ccu RST_BUS_UART3>;
 			status = "disabled";
 		};
@@ -383,6 +404,8 @@
 			reg-shift = <2>;
 			reg-io-width = <4>;
 			clocks = <&ccu CLK_BUS_UART4>;
+			dmas = <&dma 18>, <&dma 18>;
+			dma-names = "tx", "rx";
 			resets = <&ccu RST_BUS_UART4>;
 			status = "disabled";
 		};
@@ -394,6 +417,8 @@
 			reg-shift = <2>;
 			reg-io-width = <4>;
 			clocks = <&ccu CLK_BUS_UART5>;
+			dmas = <&dma 19>, <&dma 19>;
+			dma-names = "tx", "rx";
 			resets = <&ccu RST_BUS_UART5>;
 			status = "disabled";
 		};
@@ -405,6 +430,8 @@
 			reg = <0x05002000 0x400>;
 			interrupts = <GIC_SPI 6 IRQ_TYPE_LEVEL_HIGH>;
 			clocks = <&ccu CLK_BUS_I2C0>;
+			dmas = <&dma 43>, <&dma 43>;
+			dma-names = "rx", "tx";
 			resets = <&ccu RST_BUS_I2C0>;
 			pinctrl-names = "default";
 			pinctrl-0 = <&i2c0_pins>;
@@ -420,6 +447,8 @@
 			reg = <0x05002400 0x400>;
 			interrupts = <GIC_SPI 7 IRQ_TYPE_LEVEL_HIGH>;
 			clocks = <&ccu CLK_BUS_I2C1>;
+			dmas = <&dma 44>, <&dma 44>;
+			dma-names = "rx", "tx";
 			resets = <&ccu RST_BUS_I2C1>;
 			status = "disabled";
 			#address-cells = <1>;
@@ -433,6 +462,8 @@
 			reg = <0x05002800 0x400>;
 			interrupts = <GIC_SPI 8 IRQ_TYPE_LEVEL_HIGH>;
 			clocks = <&ccu CLK_BUS_I2C2>;
+			dmas = <&dma 45>, <&dma 45>;
+			dma-names = "rx", "tx";
 			resets = <&ccu RST_BUS_I2C2>;
 			status = "disabled";
 			#address-cells = <1>;
@@ -446,6 +477,8 @@
 			reg = <0x05002c00 0x400>;
 			interrupts = <GIC_SPI 9 IRQ_TYPE_LEVEL_HIGH>;
 			clocks = <&ccu CLK_BUS_I2C3>;
+			dmas = <&dma 46>, <&dma 46>;
+			dma-names = "rx", "tx";
 			resets = <&ccu RST_BUS_I2C3>;
 			status = "disabled";
 			#address-cells = <1>;
@@ -459,6 +492,8 @@
 			reg = <0x05003000 0x400>;
 			interrupts = <GIC_SPI 10 IRQ_TYPE_LEVEL_HIGH>;
 			clocks = <&ccu CLK_BUS_I2C4>;
+			dmas = <&dma 47>, <&dma 47>;
+			dma-names = "rx", "tx";
 			resets = <&ccu RST_BUS_I2C4>;
 			status = "disabled";
 			#address-cells = <1>;
@@ -472,6 +507,8 @@
 			interrupts = <GIC_SPI 12 IRQ_TYPE_LEVEL_HIGH>;
 			clocks = <&ccu CLK_BUS_SPI0>, <&ccu CLK_SPI0>;
 			clock-names = "ahb", "mod";
+			dmas = <&dma 22>, <&dma 22>;
+			dma-names = "rx", "tx";
 			resets = <&ccu RST_BUS_SPI0>;
 			status = "disabled";
 			#address-cells = <1>;
@@ -485,6 +522,8 @@
 			interrupts = <GIC_SPI 13 IRQ_TYPE_LEVEL_HIGH>;
 			clocks = <&ccu CLK_BUS_SPI1>, <&ccu CLK_SPI1>;
 			clock-names = "ahb", "mod";
+			dmas = <&dma 23>, <&dma 23>;
+			dma-names = "rx", "tx";
 			resets = <&ccu RST_BUS_SPI1>;
 			status = "disabled";
 			#address-cells = <1>;
@@ -734,6 +773,8 @@
 			reg = <0x07081400 0x400>;
 			interrupts = <GIC_SPI 105 IRQ_TYPE_LEVEL_HIGH>;
 			clocks = <&r_ccu CLK_R_APB2_I2C>;
+			dmas = <&dma 48>, <&dma 48>;
+			dma-names = "rx", "tx";
 			resets = <&r_ccu RST_R_APB2_I2C>;
 			status = "disabled";
 			#address-cells = <1>;

From 7ef7d495bb10fe338f85f0769e6a3cac4ebf2d74 Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Sun, 28 Jan 2024 00:32:47 +0800
Subject: [PATCH 375/707] arm64: dts: allwinner: h616: Add SPDIF device node

The H616 SoC has an SPDIF transmitter hardware block, which has the same
layout as the one in the H6, minus the receiver side.

Add a device node for it, and a default pinmux.

Signed-off-by: Chen-Yu Tsai <wens@csie.org>
Reviewed-by: Andre Przywara <andre.przywara@arm.com>
Acked-by: Jernej Skrabec <jernej.skrabec@gmail.com>
Link: https://lore.kernel.org/r/20240127163247.384439-8-wens@kernel.org
Signed-off-by: Jernej Skrabec <jernej.skrabec@gmail.com>
---
 .../arm64/boot/dts/allwinner/sun50i-h616.dtsi | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi
index 885809137b9de3..b1bf4fb5fc58b8 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi
+++ b/arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi
@@ -253,6 +253,11 @@
 				function = "spi1";
 			};
 
+			spdif_tx_pin: spdif-tx-pin {
+				pins = "PH4";
+				function = "spdif";
+			};
+
 			uart0_ph_pins: uart0-ph-pins {
 				pins = "PH0", "PH1";
 				function = "uart0";
@@ -550,6 +555,21 @@
 			};
 		};
 
+		spdif: spdif@5093000 {
+			compatible = "allwinner,sun50i-h616-spdif";
+			reg = <0x05093000 0x400>;
+			interrupts = <GIC_SPI 21 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&ccu CLK_BUS_SPDIF>, <&ccu CLK_SPDIF>;
+			clock-names = "apb", "spdif";
+			resets = <&ccu RST_BUS_SPDIF>;
+			dmas = <&dma 2>;
+			dma-names = "tx";
+			pinctrl-names = "default";
+			pinctrl-0 = <&spdif_tx_pin>;
+			#sound-dai-cells = <0>;
+			status = "disabled";
+		};
+
 		usbotg: usb@5100000 {
 			compatible = "allwinner,sun50i-h616-musb",
 				     "allwinner,sun8i-h3-musb";

From a0868e7c5575ec87d77d0c0924e6812b20e2d879 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Thu, 11 Jan 2024 14:59:01 +0100
Subject: [PATCH 376/707] KVM: selftests: Compare wall time from xen shinfo
 against KVM_GET_CLOCK

xen_shinfo_test is observed to be flaky failing sporadically with
"VM time too old". With min_ts/max_ts debug print added:

Wall clock (v 3269818) 1704906491.986255664
Time info 1: v 1282712 tsc 33530585736 time 14014430025 mul 3587552223 shift 4294967295 flags 1
Time info 2: v 1282712 tsc 33530585736 time 14014430025 mul 3587552223 shift 4294967295 flags 1
min_ts: 1704906491.986312153
max_ts: 1704906506.001006963
==== Test Assertion Failure ====
  x86_64/xen_shinfo_test.c:1003: cmp_timespec(&min_ts, &vm_ts) <= 0
  pid=32724 tid=32724 errno=4 - Interrupted system call
     1	0x00000000004030ad: main at xen_shinfo_test.c:1003
     2	0x00007fca6b23feaf: ?? ??:0
     3	0x00007fca6b23ff5f: ?? ??:0
     4	0x0000000000405e04: _start at ??:?
  VM time too old

The test compares wall clock data from shinfo (which is the output of
kvm_get_wall_clock_epoch()) against clock_gettime(CLOCK_REALTIME) in the
host system before the VM is created. In the example above, it compares

 shinfo: 1704906491.986255664 vs min_ts: 1704906491.986312153

and fails as the later is greater than the former.  While this sounds like
a sane test, it doesn't pass reality check: kvm_get_wall_clock_epoch()
calculates guest's epoch (realtime when the guest was created) by
subtracting kvmclock from the current realtime and the calculation happens
when shinfo is setup. The problem is that kvmclock is a raw clock and
realtime clock is affected by NTP. This means that if realtime ticks with a
slightly reduced frequency, "guest's epoch" calculated by
kvm_get_wall_clock_epoch() will actually tick backwards! This is not a big
issue from guest's perspective as the guest can't really observe this but
this epoch can't be compared with a fixed clock_gettime() on the host.

Replace the check with comparing wall clock data from shinfo to
KVM_GET_CLOCK. The later gives both realtime and kvmclock so guest's epoch
can be calculated by subtraction. Note, the computed epoch may still differ
a few nanoseconds from shinfo as different TSC is used and there are
rounding errors but 100 nanoseconds margin should be enough to cover
it (famous last words).

Reported-by: Jan Richter <jarichte@redhat.com>
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Link: https://lore.kernel.org/r/20240111135901.1785096-1-vkuznets@redhat.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 .../selftests/kvm/x86_64/xen_shinfo_test.c    | 36 ++++++++-----------
 1 file changed, 14 insertions(+), 22 deletions(-)

diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
index 9ec9ab60b63ee2..5e1ad243d95dc0 100644
--- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
+++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
@@ -375,20 +375,6 @@ static void guest_code(void)
 	GUEST_SYNC(TEST_DONE);
 }
 
-static int cmp_timespec(struct timespec *a, struct timespec *b)
-{
-	if (a->tv_sec > b->tv_sec)
-		return 1;
-	else if (a->tv_sec < b->tv_sec)
-		return -1;
-	else if (a->tv_nsec > b->tv_nsec)
-		return 1;
-	else if (a->tv_nsec < b->tv_nsec)
-		return -1;
-	else
-		return 0;
-}
-
 static struct vcpu_info *vinfo;
 static struct kvm_vcpu *vcpu;
 
@@ -425,7 +411,6 @@ static void *juggle_shinfo_state(void *arg)
 
 int main(int argc, char *argv[])
 {
-	struct timespec min_ts, max_ts, vm_ts;
 	struct kvm_xen_hvm_attr evt_reset;
 	struct kvm_vm *vm;
 	pthread_t thread;
@@ -443,8 +428,6 @@ int main(int argc, char *argv[])
 	bool do_eventfd_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL);
 	bool do_evtchn_tests = do_eventfd_tests && !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND);
 
-	clock_gettime(CLOCK_REALTIME, &min_ts);
-
 	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
 
 	/* Map a region for the shared_info page */
@@ -969,7 +952,6 @@ int main(int argc, char *argv[])
 	vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &evt_reset);
 
 	alarm(0);
-	clock_gettime(CLOCK_REALTIME, &max_ts);
 
 	/*
 	 * Just a *really* basic check that things are being put in the
@@ -978,11 +960,16 @@ int main(int argc, char *argv[])
 	 */
 	struct pvclock_wall_clock *wc;
 	struct pvclock_vcpu_time_info *ti, *ti2;
+	struct kvm_clock_data kcdata;
+	long long delta;
 
 	wc = addr_gpa2hva(vm, SHINFO_REGION_GPA + 0xc00);
 	ti = addr_gpa2hva(vm, SHINFO_REGION_GPA + 0x40 + 0x20);
 	ti2 = addr_gpa2hva(vm, PVTIME_ADDR);
 
+	vm_ioctl(vm, KVM_GET_CLOCK, &kcdata);
+	delta = (wc->sec * NSEC_PER_SEC + wc->nsec) - (kcdata.realtime - kcdata.clock);
+
 	if (verbose) {
 		printf("Wall clock (v %d) %d.%09d\n", wc->version, wc->sec, wc->nsec);
 		printf("Time info 1: v %u tsc %" PRIu64 " time %" PRIu64 " mul %u shift %u flags %x\n",
@@ -991,14 +978,19 @@ int main(int argc, char *argv[])
 		printf("Time info 2: v %u tsc %" PRIu64 " time %" PRIu64 " mul %u shift %u flags %x\n",
 		       ti2->version, ti2->tsc_timestamp, ti2->system_time, ti2->tsc_to_system_mul,
 		       ti2->tsc_shift, ti2->flags);
+		printf("KVM_GET_CLOCK realtime: %lld.%09lld\n", kcdata.realtime / NSEC_PER_SEC,
+		       kcdata.realtime % NSEC_PER_SEC);
+		printf("KVM_GET_CLOCK clock: %lld.%09lld\n", kcdata.clock / NSEC_PER_SEC,
+		       kcdata.clock % NSEC_PER_SEC);
 	}
 
-	vm_ts.tv_sec = wc->sec;
-	vm_ts.tv_nsec = wc->nsec;
 	TEST_ASSERT(wc->version && !(wc->version & 1),
 		    "Bad wallclock version %x", wc->version);
-	TEST_ASSERT(cmp_timespec(&min_ts, &vm_ts) <= 0, "VM time too old");
-	TEST_ASSERT(cmp_timespec(&max_ts, &vm_ts) >= 0, "VM time too new");
+
+	TEST_ASSERT(llabs(delta) < 100,
+		    "Guest's epoch from shinfo %d.%09d differs from KVM_GET_CLOCK %lld.%lld",
+		    wc->sec, wc->nsec, (kcdata.realtime - kcdata.clock) / NSEC_PER_SEC,
+		    (kcdata.realtime - kcdata.clock) % NSEC_PER_SEC);
 
 	TEST_ASSERT(ti->version && !(ti->version & 1),
 		    "Bad time_info version %x", ti->version);

From 9f14f46a276521c92cdffb0fc36f907e868d3888 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Fri, 22 Sep 2023 21:34:13 +0200
Subject: [PATCH 377/707] i2c: i801: Replace magic value with constant in
 dmi_check_onboard_devices

Replace magic number 10 with the appropriate constant.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Andi Shyti <andi.shyti@kernel.org>
Signed-off-by: Andi Shyti <andi.shyti@kernel.org>
---
 drivers/i2c/busses/i2c-i801.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c
index 3932e8d96a1717..8af944bd89f219 100644
--- a/drivers/i2c/busses/i2c-i801.c
+++ b/drivers/i2c/busses/i2c-i801.c
@@ -1117,7 +1117,7 @@ static void dmi_check_onboard_devices(const struct dmi_header *dm, void *adap)
 {
 	int i, count;
 
-	if (dm->type != 10)
+	if (dm->type != DMI_ENTRY_ONBOARD_DEVICE)
 		return;
 
 	count = (dm->length - sizeof(struct dmi_header)) / 2;

From 96b125361866d998471c1380f809f2a2b4db60c0 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Fri, 22 Sep 2023 21:35:00 +0200
Subject: [PATCH 378/707] i2c: i801: Remove unused argument from tco functions

Argument priv isn't used, so remove it.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Andi Shyti <andi.shyti@kernel.org>
Signed-off-by: Andi Shyti <andi.shyti@kernel.org>
---
 drivers/i2c/busses/i2c-i801.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c
index 8af944bd89f219..b9b850b69b73db 100644
--- a/drivers/i2c/busses/i2c-i801.c
+++ b/drivers/i2c/busses/i2c-i801.c
@@ -1445,8 +1445,7 @@ static inline void i801_del_mux(struct i801_priv *priv) { }
 #endif
 
 static struct platform_device *
-i801_add_tco_spt(struct i801_priv *priv, struct pci_dev *pci_dev,
-		 struct resource *tco_res)
+i801_add_tco_spt(struct pci_dev *pci_dev, struct resource *tco_res)
 {
 	static const struct itco_wdt_platform_data pldata = {
 		.name = "Intel PCH",
@@ -1477,8 +1476,7 @@ i801_add_tco_spt(struct i801_priv *priv, struct pci_dev *pci_dev,
 }
 
 static struct platform_device *
-i801_add_tco_cnl(struct i801_priv *priv, struct pci_dev *pci_dev,
-		 struct resource *tco_res)
+i801_add_tco_cnl(struct pci_dev *pci_dev, struct resource *tco_res)
 {
 	static const struct itco_wdt_platform_data pldata = {
 		.name = "Intel PCH",
@@ -1518,9 +1516,9 @@ static void i801_add_tco(struct i801_priv *priv)
 	res->flags = IORESOURCE_IO;
 
 	if (priv->features & FEATURE_TCO_CNL)
-		priv->tco_pdev = i801_add_tco_cnl(priv, pci_dev, tco_res);
+		priv->tco_pdev = i801_add_tco_cnl(pci_dev, tco_res);
 	else
-		priv->tco_pdev = i801_add_tco_spt(priv, pci_dev, tco_res);
+		priv->tco_pdev = i801_add_tco_spt(pci_dev, tco_res);
 
 	if (IS_ERR(priv->tco_pdev))
 		dev_warn(&pci_dev->dev, "failed to create iTCO device\n");

From 449d0d6ccf55d52b707fccf2b7756de4636742df Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Tue, 9 Jan 2024 15:11:17 +0100
Subject: [PATCH 379/707] KVM: selftests: Generalize check_clocksource() from
 kvm_clock_test

Several existing x86 selftests need to check that the underlying system
clocksource is TSC or based on TSC but every test implements its own
check. As a first step towards unification, extract check_clocksource()
from kvm_clock_test and split it into two functions: arch-neutral
'sys_get_cur_clocksource()' and x86-specific 'sys_clocksource_is_tsc()'.
Fix a couple of pre-existing issues in kvm_clock_test: memory leakage in
check_clocksource() and using TEST_ASSERT() instead of TEST_REQUIRE().
The change also makes the test fail when system clocksource can't be read
from sysfs.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Link: https://lore.kernel.org/r/20240109141121.1619463-2-vkuznets@redhat.com
[sean: eliminate if-elif pattern just to set a bool true]
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 .../testing/selftests/kvm/include/test_util.h |  2 +
 .../selftests/kvm/include/x86_64/processor.h  |  2 +
 tools/testing/selftests/kvm/lib/test_util.c   | 25 ++++++++++++
 .../selftests/kvm/lib/x86_64/processor.c      | 10 +++++
 .../selftests/kvm/x86_64/kvm_clock_test.c     | 38 +------------------
 5 files changed, 40 insertions(+), 37 deletions(-)

diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h
index 71a41fa924b7d0..50a5e31ba8da1b 100644
--- a/tools/testing/selftests/kvm/include/test_util.h
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -195,4 +195,6 @@ __printf(3, 4) int guest_snprintf(char *buf, int n, const char *fmt, ...);
 
 char *strdup_printf(const char *fmt, ...) __attribute__((format(printf, 1, 2), nonnull(1)));
 
+char *sys_get_cur_clocksource(void);
+
 #endif /* SELFTEST_KVM_TEST_UTIL_H */
diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
index a84863503fcb46..01eec72e0d3e36 100644
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -1271,4 +1271,6 @@ void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
 #define PFERR_GUEST_PAGE_MASK	BIT_ULL(PFERR_GUEST_PAGE_BIT)
 #define PFERR_IMPLICIT_ACCESS	BIT_ULL(PFERR_IMPLICIT_ACCESS_BIT)
 
+bool sys_clocksource_is_tsc(void);
+
 #endif /* SELFTEST_KVM_PROCESSOR_H */
diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c
index 5d7f28b02d73ba..5a8f8becb12984 100644
--- a/tools/testing/selftests/kvm/lib/test_util.c
+++ b/tools/testing/selftests/kvm/lib/test_util.c
@@ -392,3 +392,28 @@ char *strdup_printf(const char *fmt, ...)
 
 	return str;
 }
+
+#define CLOCKSOURCE_PATH "/sys/devices/system/clocksource/clocksource0/current_clocksource"
+
+char *sys_get_cur_clocksource(void)
+{
+	char *clk_name;
+	struct stat st;
+	FILE *fp;
+
+	fp = fopen(CLOCKSOURCE_PATH, "r");
+	TEST_ASSERT(fp, "failed to open clocksource file, errno: %d", errno);
+
+	TEST_ASSERT(!fstat(fileno(fp), &st), "failed to stat clocksource file, errno: %d",
+		    errno);
+
+	clk_name = malloc(st.st_size);
+	TEST_ASSERT(clk_name, "failed to allocate buffer to read file");
+
+	TEST_ASSERT(fgets(clk_name, st.st_size, fp), "failed to read clocksource file: %d",
+		    ferror(fp));
+
+	fclose(fp);
+
+	return clk_name;
+}
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index 4bc52948447d8c..e6964ff2a37da7 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -1299,3 +1299,13 @@ void kvm_selftest_arch_init(void)
 	host_cpu_is_intel = this_cpu_is_intel();
 	host_cpu_is_amd = this_cpu_is_amd();
 }
+
+bool sys_clocksource_is_tsc(void)
+{
+	char *clk_name = sys_get_cur_clocksource();
+	bool ret = !strcmp(clk_name, "tsc\n");
+
+	free(clk_name);
+
+	return ret;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c b/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c
index 3e0b7d51abdaa5..6fcc1a43358757 100644
--- a/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c
+++ b/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c
@@ -132,42 +132,6 @@ static void enter_guest(struct kvm_vcpu *vcpu)
 	}
 }
 
-#define CLOCKSOURCE_PATH "/sys/devices/system/clocksource/clocksource0/current_clocksource"
-
-static void check_clocksource(void)
-{
-	char *clk_name;
-	struct stat st;
-	FILE *fp;
-
-	fp = fopen(CLOCKSOURCE_PATH, "r");
-	if (!fp) {
-		pr_info("failed to open clocksource file: %d; assuming TSC.\n",
-			errno);
-		return;
-	}
-
-	if (fstat(fileno(fp), &st)) {
-		pr_info("failed to stat clocksource file: %d; assuming TSC.\n",
-			errno);
-		goto out;
-	}
-
-	clk_name = malloc(st.st_size);
-	TEST_ASSERT(clk_name, "failed to allocate buffer to read file");
-
-	if (!fgets(clk_name, st.st_size, fp)) {
-		pr_info("failed to read clocksource file: %d; assuming TSC.\n",
-			ferror(fp));
-		goto out;
-	}
-
-	TEST_ASSERT(!strncmp(clk_name, "tsc\n", st.st_size),
-		    "clocksource not supported: %s", clk_name);
-out:
-	fclose(fp);
-}
-
 int main(void)
 {
 	struct kvm_vcpu *vcpu;
@@ -179,7 +143,7 @@ int main(void)
 	flags = kvm_check_cap(KVM_CAP_ADJUST_CLOCK);
 	TEST_REQUIRE(flags & KVM_CLOCK_REALTIME);
 
-	check_clocksource();
+	TEST_REQUIRE(sys_clocksource_is_tsc());
 
 	vm = vm_create_with_one_vcpu(&vcpu, guest_main);
 

From a79036441a68ca04061dda185b5d9cae1d76ed82 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Tue, 9 Jan 2024 15:11:18 +0100
Subject: [PATCH 380/707] KVM: selftests: Use generic sys_clocksource_is_tsc()
 in vmx_nested_tsc_scaling_test

Despite its name, system_has_stable_tsc() just checks that system
clocksource is 'tsc'; this can now be done with generic
sys_clocksource_is_tsc().

No functional change intended.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Link: https://lore.kernel.org/r/20240109141121.1619463-3-vkuznets@redhat.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 .../kvm/x86_64/vmx_nested_tsc_scaling_test.c  | 19 +------------------
 1 file changed, 1 insertion(+), 18 deletions(-)

diff --git a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c
index e710b6e7fb384a..93b0a850a24003 100644
--- a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c
+++ b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c
@@ -116,23 +116,6 @@ static void l1_guest_code(struct vmx_pages *vmx_pages)
 	GUEST_DONE();
 }
 
-static bool system_has_stable_tsc(void)
-{
-	bool tsc_is_stable;
-	FILE *fp;
-	char buf[4];
-
-	fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r");
-	if (fp == NULL)
-		return false;
-
-	tsc_is_stable = fgets(buf, sizeof(buf), fp) &&
-			!strncmp(buf, "tsc", sizeof(buf));
-
-	fclose(fp);
-	return tsc_is_stable;
-}
-
 int main(int argc, char *argv[])
 {
 	struct kvm_vcpu *vcpu;
@@ -148,7 +131,7 @@ int main(int argc, char *argv[])
 
 	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX));
 	TEST_REQUIRE(kvm_has_cap(KVM_CAP_TSC_CONTROL));
-	TEST_REQUIRE(system_has_stable_tsc());
+	TEST_REQUIRE(sys_clocksource_is_tsc());
 
 	/*
 	 * We set L1's scale factor to be a random number from 2 to 10.

From 436e6e541cb27added742ed7998612d5a466223c Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Tue, 9 Jan 2024 15:11:19 +0100
Subject: [PATCH 381/707] KVM: selftests: Run clocksource dependent tests with
 hyperv_clocksource_tsc_page too

KVM's 'gtod_is_based_on_tsc()' recognizes two clocksources: 'tsc' and
'hyperv_clocksource_tsc_page' and enables kvmclock in 'masterclock'
mode when either is in use. Transform 'sys_clocksource_is_tsc()' into
'sys_clocksource_is_based_on_tsc()' to support the later. This affects
two tests: kvm_clock_test and vmx_nested_tsc_scaling_test, both seem
to work well when system clocksource is 'hyperv_clocksource_tsc_page'.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Link: https://lore.kernel.org/r/20240109141121.1619463-4-vkuznets@redhat.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/include/x86_64/processor.h       | 2 +-
 tools/testing/selftests/kvm/lib/x86_64/processor.c           | 5 +++--
 tools/testing/selftests/kvm/x86_64/kvm_clock_test.c          | 2 +-
 .../selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c       | 2 +-
 4 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
index 01eec72e0d3e36..5bca8c947c8253 100644
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -1271,6 +1271,6 @@ void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
 #define PFERR_GUEST_PAGE_MASK	BIT_ULL(PFERR_GUEST_PAGE_BIT)
 #define PFERR_IMPLICIT_ACCESS	BIT_ULL(PFERR_IMPLICIT_ACCESS_BIT)
 
-bool sys_clocksource_is_tsc(void);
+bool sys_clocksource_is_based_on_tsc(void);
 
 #endif /* SELFTEST_KVM_PROCESSOR_H */
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index e6964ff2a37da7..f639b3e062e3a3 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -1300,10 +1300,11 @@ void kvm_selftest_arch_init(void)
 	host_cpu_is_amd = this_cpu_is_amd();
 }
 
-bool sys_clocksource_is_tsc(void)
+bool sys_clocksource_is_based_on_tsc(void)
 {
 	char *clk_name = sys_get_cur_clocksource();
-	bool ret = !strcmp(clk_name, "tsc\n");
+	bool ret = !strcmp(clk_name, "tsc\n") ||
+		   !strcmp(clk_name, "hyperv_clocksource_tsc_page\n");
 
 	free(clk_name);
 
diff --git a/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c b/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c
index 6fcc1a43358757..5bc12222d87af6 100644
--- a/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c
+++ b/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c
@@ -143,7 +143,7 @@ int main(void)
 	flags = kvm_check_cap(KVM_CAP_ADJUST_CLOCK);
 	TEST_REQUIRE(flags & KVM_CLOCK_REALTIME);
 
-	TEST_REQUIRE(sys_clocksource_is_tsc());
+	TEST_REQUIRE(sys_clocksource_is_based_on_tsc());
 
 	vm = vm_create_with_one_vcpu(&vcpu, guest_main);
 
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c
index 93b0a850a24003..1759fa5cb3f29c 100644
--- a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c
+++ b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c
@@ -131,7 +131,7 @@ int main(int argc, char *argv[])
 
 	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX));
 	TEST_REQUIRE(kvm_has_cap(KVM_CAP_TSC_CONTROL));
-	TEST_REQUIRE(sys_clocksource_is_tsc());
+	TEST_REQUIRE(sys_clocksource_is_based_on_tsc());
 
 	/*
 	 * We set L1's scale factor to be a random number from 2 to 10.

From 14fce852a14b4c74411ec0553322d23f2a740138 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Tue, 9 Jan 2024 15:11:20 +0100
Subject: [PATCH 382/707] KVM: selftests: Make hyperv_clock require TSC based
 system clocksource

KVM sets up Hyper-V TSC page clocksource for its guests when system
clocksource is 'based on TSC' (see gtod_is_based_on_tsc()), running
hyperv_clock with any other clocksource leads to imminent failure.

Add the missing requirement to make the test skip gracefully.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Link: https://lore.kernel.org/r/20240109141121.1619463-5-vkuznets@redhat.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/x86_64/hyperv_clock.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c
index 65690d916db7e6..e058bc676cd693 100644
--- a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c
@@ -212,6 +212,7 @@ int main(void)
 	int stage;
 
 	TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_TIME));
+	TEST_REQUIRE(sys_clocksource_is_based_on_tsc());
 
 	vm = vm_create_with_one_vcpu(&vcpu, guest_main);
 

From 57cc5371293416a162dafa4afab9c4f1d7f904c2 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Tue, 9 Jan 2024 15:11:21 +0100
Subject: [PATCH 383/707] KVM: x86: Make gtod_is_based_on_tsc() return 'bool'

gtod_is_based_on_tsc() is boolean in nature, i.e. it returns '1' for good
clocksources and '0' otherwise. Moreover, its result is used raw by
kvm_get_time_and_clockread()/kvm_get_walltime_and_clockread() which are
'bool'.

No functional change intended.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Link: https://lore.kernel.org/r/20240109141121.1619463-6-vkuznets@redhat.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 363b1c08020578..aa27aec10860a4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2507,7 +2507,7 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
 }
 
 #ifdef CONFIG_X86_64
-static inline int gtod_is_based_on_tsc(int mode)
+static inline bool gtod_is_based_on_tsc(int mode)
 {
 	return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK;
 }

From 11f1357336cde9924da0b455e528f11fbd5011f4 Mon Sep 17 00:00:00 2001
From: Esben Haabendal <esben@geanix.com>
Date: Thu, 25 Jan 2024 14:56:36 +0100
Subject: [PATCH 384/707] i2c: imx: move to generic GPIO recovery

Starting with
commit 75820314de26 ("i2c: core: add generic I2C GPIO recovery")
GPIO bus recovery is supported by the I2C core, so we can remove the
driver implementation and use that one instead.

As a nice side-effect, pinctrl becomes optional, allowing bus recovery on
LS1021A, which does not have such luxury, but can be wired up to use extra
fixed GPIO pins.

Note: The previous error messages about bus recovery not being supported is
dropped with this change. Given that it is perfectly possible to have platforms
where bus recovery works without pinctrl support, I happen to work on one such,
both error messages does not really make sense in those cases. And I don't see
how to know if this is the case or not.

Signed-off-by: Esben Haabendal <esben@geanix.com>
Acked-by: Oleksij Rempel <o.rempel@pengutronix.de>
Signed-off-by: Andi Shyti <andi.shyti@kernel.org>
---
 drivers/i2c/busses/i2c-imx.c | 62 +++---------------------------------
 1 file changed, 5 insertions(+), 57 deletions(-)

diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c
index 88a053987403cc..d6ba93fc7fee05 100644
--- a/drivers/i2c/busses/i2c-imx.c
+++ b/drivers/i2c/busses/i2c-imx.c
@@ -212,10 +212,6 @@ struct imx_i2c_struct {
 	const struct imx_i2c_hwdata	*hwdata;
 	struct i2c_bus_recovery_info rinfo;
 
-	struct pinctrl *pinctrl;
-	struct pinctrl_state *pinctrl_pins_default;
-	struct pinctrl_state *pinctrl_pins_gpio;
-
 	struct imx_i2c_dma	*dma;
 	struct i2c_client	*slave;
 	enum i2c_slave_event last_slave_event;
@@ -1357,24 +1353,6 @@ static int i2c_imx_xfer_atomic(struct i2c_adapter *adapter,
 	return result;
 }
 
-static void i2c_imx_prepare_recovery(struct i2c_adapter *adap)
-{
-	struct imx_i2c_struct *i2c_imx;
-
-	i2c_imx = container_of(adap, struct imx_i2c_struct, adapter);
-
-	pinctrl_select_state(i2c_imx->pinctrl, i2c_imx->pinctrl_pins_gpio);
-}
-
-static void i2c_imx_unprepare_recovery(struct i2c_adapter *adap)
-{
-	struct imx_i2c_struct *i2c_imx;
-
-	i2c_imx = container_of(adap, struct imx_i2c_struct, adapter);
-
-	pinctrl_select_state(i2c_imx->pinctrl, i2c_imx->pinctrl_pins_default);
-}
-
 /*
  * We switch SCL and SDA to their GPIO function and do some bitbanging
  * for bus recovery. These alternative pinmux settings can be
@@ -1385,43 +1363,13 @@ static void i2c_imx_unprepare_recovery(struct i2c_adapter *adap)
 static int i2c_imx_init_recovery_info(struct imx_i2c_struct *i2c_imx,
 		struct platform_device *pdev)
 {
-	struct i2c_bus_recovery_info *rinfo = &i2c_imx->rinfo;
-
-	i2c_imx->pinctrl = devm_pinctrl_get(&pdev->dev);
-	if (!i2c_imx->pinctrl) {
-		dev_info(&pdev->dev, "pinctrl unavailable, bus recovery not supported\n");
-		return 0;
-	}
-	if (IS_ERR(i2c_imx->pinctrl)) {
-		dev_info(&pdev->dev, "can't get pinctrl, bus recovery not supported\n");
-		return PTR_ERR(i2c_imx->pinctrl);
-	}
-
-	i2c_imx->pinctrl_pins_default = pinctrl_lookup_state(i2c_imx->pinctrl,
-			PINCTRL_STATE_DEFAULT);
-	i2c_imx->pinctrl_pins_gpio = pinctrl_lookup_state(i2c_imx->pinctrl,
-			"gpio");
-	rinfo->sda_gpiod = devm_gpiod_get_optional(&pdev->dev, "sda", GPIOD_IN);
-	rinfo->scl_gpiod = devm_gpiod_get(&pdev->dev, "scl", GPIOD_OUT_HIGH_OPEN_DRAIN);
-
-	if (PTR_ERR(rinfo->sda_gpiod) == -EPROBE_DEFER ||
-	    PTR_ERR(rinfo->scl_gpiod) == -EPROBE_DEFER) {
-		return -EPROBE_DEFER;
-	} else if (IS_ERR(rinfo->sda_gpiod) ||
-		   IS_ERR(rinfo->scl_gpiod) ||
-		   IS_ERR(i2c_imx->pinctrl_pins_default) ||
-		   IS_ERR(i2c_imx->pinctrl_pins_gpio)) {
-		dev_dbg(&pdev->dev, "recovery information incomplete\n");
-		return 0;
-	}
+	struct i2c_bus_recovery_info *bri = &i2c_imx->rinfo;
 
-	dev_dbg(&pdev->dev, "using scl%s for recovery\n",
-		rinfo->sda_gpiod ? ",sda" : "");
+	bri->pinctrl = devm_pinctrl_get(&pdev->dev);
+	if (IS_ERR(bri->pinctrl))
+		return PTR_ERR(bri->pinctrl);
 
-	rinfo->prepare_recovery = i2c_imx_prepare_recovery;
-	rinfo->unprepare_recovery = i2c_imx_unprepare_recovery;
-	rinfo->recover_bus = i2c_generic_scl_recovery;
-	i2c_imx->adapter.bus_recovery_info = rinfo;
+	i2c_imx->adapter.bus_recovery_info = bri;
 
 	return 0;
 }

From b5d5f64f9bd2dab3b9cebdc1bf52e82367fe352b Mon Sep 17 00:00:00 2001
From: Nuno Sa <nuno.sa@analog.com>
Date: Mon, 29 Jan 2024 17:13:23 +0100
Subject: [PATCH 385/707] dt-bindings: hwmon: Add LTC4282 bindings

Add bindings for the LTC4282 High Current Hot Swap Controller with I2C
Compatible Monitoring.

Reviewed-by: Conor Dooley <conor.dooley@microchip.com>
Signed-off-by: Nuno Sa <nuno.sa@analog.com>
Link: https://lore.kernel.org/r/20240129-b4-ltc4282-support-v4-1-fe75798164cc@analog.com
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 .../bindings/hwmon/adi,ltc4282.yaml           | 159 ++++++++++++++++++
 MAINTAINERS                                   |   6 +
 2 files changed, 165 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/hwmon/adi,ltc4282.yaml

diff --git a/Documentation/devicetree/bindings/hwmon/adi,ltc4282.yaml b/Documentation/devicetree/bindings/hwmon/adi,ltc4282.yaml
new file mode 100644
index 00000000000000..4854b95a93e341
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/adi,ltc4282.yaml
@@ -0,0 +1,159 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/hwmon/adi,ltc4282.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Analog Devices LTC4282 I2C High Current Hot Swap Controller over I2C
+
+maintainers:
+  - Nuno Sa <nuno.sa@analog.com>
+
+description: |
+  Analog Devices LTC4282 I2C High Current Hot Swap Controller over I2C.
+
+  https://www.analog.com/media/en/technical-documentation/data-sheets/ltc4282.pdf
+
+properties:
+  compatible:
+    enum:
+      - adi,ltc4282
+
+  reg:
+    maxItems: 1
+
+  vdd-supply: true
+
+  clocks:
+    maxItems: 1
+
+  '#clock-cells':
+    const: 0
+
+  adi,rsense-nano-ohms:
+    description: Value of the sense resistor.
+
+  adi,vin-mode-microvolt:
+    description:
+      Selects operating range for the Undervoltage, Overvoltage and Foldback
+      pins. Also for the ADC. Should be set to the nominal input voltage.
+    enum: [3300000, 5000000, 12000000, 24000000]
+    default: 12000000
+
+  adi,fet-bad-timeout-ms:
+    description:
+      From the moment a FET bad conditions is present, this property selects the
+      wait time/timeout for a FET-bad fault to be signaled. Setting this to 0,
+      disables FET bad faults to be reported.
+    default: 255
+    maximum: 255
+
+  adi,overvoltage-dividers:
+    description: |
+      Select which dividers to use for VDD Overvoltage detection. Note that
+      when the internal dividers are used the threshold is referenced to VDD.
+      The percentages in the datasheet are misleading since the actual values
+      to look for are in the "Absolute Maximum Ratings" table in the
+      "Comparator Inputs" section. In there there's a line for each of the 5%,
+      10% and 15% settings with the actual min, typical and max tolerances.
+    $ref: /schemas/types.yaml#/definitions/string
+    enum: [external, vdd_5_percent, vdd_10_percent, vdd_15_percent]
+    default: external
+
+  adi,undervoltage-dividers:
+    description: |
+      Select which dividers to use for VDD Overvoltage detection. Note that
+      when the internal dividers are used the threshold is referenced to VDD.
+      The percentages in the datasheet are misleading since the actual values
+      to look for are in the "Absolute Maximum Ratings" table in the
+      "Comparator Inputs" section. In there there's a line for each of the 5%,
+      10% and 15% settings with the actual min, typical and max tolerances.
+    $ref: /schemas/types.yaml#/definitions/string
+    enum: [external, vdd_5_percent, vdd_10_percent, vdd_15_percent]
+    default: external
+
+  adi,current-limit-sense-microvolt:
+    description:
+      The current limit sense voltage of the chip is adjustable between
+      12.5mV and 34.4mV in 3.1mV steps. This effectively limits the current
+      on the load.
+    enum: [12500, 15625, 18750, 21875, 25000, 28125, 31250, 34375]
+    default: 25000
+
+  adi,overcurrent-retry:
+    description:
+      If set, enables the chip to auto-retry 256 timer cycles after an
+      Overcurrent fault.
+    type: boolean
+
+  adi,overvoltage-retry-disable:
+    description:
+      If set, disables the chip to auto-retry 50ms after an Overvoltage fault.
+      It's enabled by default.
+    type: boolean
+
+  adi,undervoltage-retry-disable:
+    description:
+      If set, disables the chip to auto-retry 50ms after an Undervoltage fault.
+      It's enabled by default.
+    type: boolean
+
+  adi,fault-log-enable:
+    description:
+      If set, enables the FAULT_LOG and ADC_ALERT_LOG registers to be written
+      to the EEPROM when a fault bit transitions high and hence, will be
+      available after a power cycle (the chip loads the contents of
+      the EE_FAULT_LOG register - the one in EEPROM - into FAULT_LOG at boot).
+    type: boolean
+
+  adi,gpio1-mode:
+    description: Defines the function of the Pin. It can indicate that power is
+      good (PULL the pin low when power is not good) or that power is bad (Go
+      into high-z when power is not good).
+    $ref: /schemas/types.yaml#/definitions/string
+    enum: [power_bad, power_good]
+    default: power_good
+
+  adi,gpio2-mode:
+    description: Defines the function of the Pin. It can be set as the input for
+      the ADC or indicating that the MOSFET is in stress (dissipating power).
+    $ref: /schemas/types.yaml#/definitions/string
+    enum: [adc_input, stress_fet]
+    default: adc_input
+
+  adi,gpio3-monitor-enable:
+    description: If set, gpio3 is set as input for the ADC instead of gpio2.
+    type: boolean
+
+allOf:
+  - if:
+      required:
+        - adi,gpio3-monitor-enable
+    then:
+      properties:
+        adi,gpio2-mode:
+          const: stress_fet
+
+required:
+  - compatible
+  - reg
+  - adi,rsense-nano-ohms
+
+additionalProperties: false
+
+examples:
+  - |
+    i2c {
+        #address-cells = <1>;
+        #size-cells = <0>;
+
+        hwmon@50 {
+            compatible = "adi,ltc4282";
+            reg = <0x50>;
+            adi,rsense-nano-ohms = <500>;
+
+            adi,gpio1-mode = "power_good";
+            adi,gpio2-mode = "adc_input";
+        };
+    };
+...
diff --git a/MAINTAINERS b/MAINTAINERS
index 5e7239cb40ea6a..3c5f7ae166f0f0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -12759,6 +12759,12 @@ S:	Maintained
 F:	Documentation/hwmon/ltc4261.rst
 F:	drivers/hwmon/ltc4261.c
 
+LTC4282 HARDWARE MONITOR DRIVER
+M:	Nuno Sa <nuno.sa@analog.com>
+L:	linux-hwmon@vger.kernel.org
+S:	Supported
+F:	Documentation/devicetree/bindings/hwmon/adi,ltc4282.yaml
+
 LTC4286 HARDWARE MONITOR DRIVER
 M:	Delphine CC Chiu <Delphine_CC_Chiu@Wiwynn.com>
 L:	linux-i2c@vger.kernel.org

From 5b413e5322b62da39794187f3c5ebe3884a3ac27 Mon Sep 17 00:00:00 2001
From: Nuno Sa <nuno.sa@analog.com>
Date: Mon, 29 Jan 2024 17:13:24 +0100
Subject: [PATCH 386/707] hwmon: add fault attribute for voltage channels

Sometimes a voltage channel might have an hard failure (eg: a shorted
MOSFET). Hence, add a fault attribute to report such failures.

Signed-off-by: Nuno Sa <nuno.sa@analog.com>
Link: https://lore.kernel.org/r/20240129-b4-ltc4282-support-v4-2-fe75798164cc@analog.com
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 Documentation/ABI/testing/sysfs-class-hwmon | 9 +++++++++
 drivers/hwmon/hwmon.c                       | 1 +
 include/linux/hwmon.h                       | 2 ++
 3 files changed, 12 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-class-hwmon b/Documentation/ABI/testing/sysfs-class-hwmon
index 3dac923c9b0ef0..6c4e68ad4a8331 100644
--- a/Documentation/ABI/testing/sysfs-class-hwmon
+++ b/Documentation/ABI/testing/sysfs-class-hwmon
@@ -149,6 +149,15 @@ Description:
 
 		RW
 
+What:		/sys/class/hwmon/hwmonX/inY_fault
+Description:
+		Reports a voltage hard failure (eg: shorted component)
+
+		- 1: Failed
+		- 0: Ok
+
+		RO
+
 What:		/sys/class/hwmon/hwmonX/cpuY_vid
 Description:
 		CPU core reference voltage.
diff --git a/drivers/hwmon/hwmon.c b/drivers/hwmon/hwmon.c
index c7dd3f5b2bd549..18705049ad610e 100644
--- a/drivers/hwmon/hwmon.c
+++ b/drivers/hwmon/hwmon.c
@@ -510,6 +510,7 @@ static const char * const hwmon_in_attr_templates[] = {
 	[hwmon_in_rated_min] = "in%d_rated_min",
 	[hwmon_in_rated_max] = "in%d_rated_max",
 	[hwmon_in_beep] = "in%d_beep",
+	[hwmon_in_fault] = "in%d_fault",
 };
 
 static const char * const hwmon_curr_attr_templates[] = {
diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h
index c2c0da18dfa369..c7885fdce88f09 100644
--- a/include/linux/hwmon.h
+++ b/include/linux/hwmon.h
@@ -141,6 +141,7 @@ enum hwmon_in_attributes {
 	hwmon_in_rated_min,
 	hwmon_in_rated_max,
 	hwmon_in_beep,
+	hwmon_in_fault,
 };
 
 #define HWMON_I_ENABLE		BIT(hwmon_in_enable)
@@ -162,6 +163,7 @@ enum hwmon_in_attributes {
 #define HWMON_I_RATED_MIN	BIT(hwmon_in_rated_min)
 #define HWMON_I_RATED_MAX	BIT(hwmon_in_rated_max)
 #define HWMON_I_BEEP		BIT(hwmon_in_beep)
+#define HWMON_I_FAULT		BIT(hwmon_in_fault)
 
 enum hwmon_curr_attributes {
 	hwmon_curr_enable,

From 6120fec68e78eefdf7d89325668c082a90817c75 Mon Sep 17 00:00:00 2001
From: Nuno Sa <nuno.sa@analog.com>
Date: Mon, 29 Jan 2024 17:13:25 +0100
Subject: [PATCH 387/707] hwmon: ltc4282: add support for the LTC4282 chip

The LTC4282 hot swap controller allows a board to be safely inserted and
removed from a live backplane. Using one or more external N-channel pass
transistors, board supply voltage and inrush current are ramped up at an
adjustable rate. An I2C interface and onboard ADC allows for monitoring
of board current, voltage, power, energy and fault status.

Signed-off-by: Nuno Sa <nuno.sa@analog.com>
Link: https://lore.kernel.org/r/20240129-b4-ltc4282-support-v4-3-fe75798164cc@analog.com
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 Documentation/hwmon/index.rst   |    1 +
 Documentation/hwmon/ltc4282.rst |  133 +++
 MAINTAINERS                     |    2 +
 drivers/hwmon/Kconfig           |   11 +
 drivers/hwmon/Makefile          |    1 +
 drivers/hwmon/ltc4282.c         | 1784 +++++++++++++++++++++++++++++++
 6 files changed, 1932 insertions(+)
 create mode 100644 Documentation/hwmon/ltc4282.rst
 create mode 100644 drivers/hwmon/ltc4282.c

diff --git a/Documentation/hwmon/index.rst b/Documentation/hwmon/index.rst
index c7ed1f73ac0661..f16c6dfaec7dc9 100644
--- a/Documentation/hwmon/index.rst
+++ b/Documentation/hwmon/index.rst
@@ -129,6 +129,7 @@ Hardware Monitoring Kernel Drivers
    ltc4245
    ltc4260
    ltc4261
+   ltc4282
    ltc4286
    max127
    max15301
diff --git a/Documentation/hwmon/ltc4282.rst b/Documentation/hwmon/ltc4282.rst
new file mode 100644
index 00000000000000..a87ec3564998fe
--- /dev/null
+++ b/Documentation/hwmon/ltc4282.rst
@@ -0,0 +1,133 @@
+.. SPDX-License-Identifier: GPL-2.0-only
+
+Kernel drivers ltc4282
+==========================================
+
+Supported chips:
+
+  * Analog Devices LTC4282
+
+    Prefix: 'ltc4282'
+
+    Addresses scanned: - I2C 0x40 - 0x5A (7-bit)
+    Addresses scanned: - I2C 0x80 - 0xB4 with a step of 2 (8-bit)
+
+    Datasheet:
+
+        https://www.analog.com/media/en/technical-documentation/data-sheets/ltc4282.pdf
+
+Author: Nuno Sá <nuno.sa@analog.com>
+
+Description
+___________
+
+The LTC4282 hot swap controller allows a board to be safely inserted and removed
+from a live backplane. Using one or more external N-channel pass transistors,
+board supply voltage and inrush current are ramped up at an adjustable rate. An
+I2C interface and onboard ADC allows for monitoring of board current, voltage,
+power, energy and fault status. The device features analog foldback current
+limiting and supply monitoring for applications from 2.9V to 33V. Dual 12V gate
+drive allows high power applications to either share safe operating area across
+parallel MOSFETs or support a 2-stage start-up that first charges the load
+capacitance followed by enabling a low on-resistance path to the load. The
+LTC4282 is well suited to high power applications because the precise monitoring
+capability and accurate current limiting reduce the extremes in which both loads
+and power supplies must safely operate. Non-volatile configuration allows for
+flexibility in the autonomous generation of alerts and response to faults.
+
+Sysfs entries
+_____________
+
+The following attributes are supported. Limits are read-write and all the other
+attributes are read-only. Note that in0 and in1 are mutually exclusive. Enabling
+one disables the other and disabling one enables the other.
+
+======================= ==========================================
+in0_input		Output voltage (mV).
+in0_min			Undervoltage threshold
+in0_max                 Overvoltage threshold
+in0_lowest		Lowest measured voltage
+in0_highest		Highest measured voltage
+in0_reset_history	Write 1 to reset in0 history.
+			Also clears fet bad and short fault logs.
+in0_min_alarm		Undervoltage alarm
+in0_max_alarm           Overvoltage alarm
+in0_enable		Enable/Disable VSOURCE monitoring
+in0_fault		Failure in the MOSFETs. Either bad or shorted FET.
+in0_label		Channel label (VSOURCE)
+
+in1_input		Input voltage (mV).
+in1_min			Undervoltage threshold
+in1_max                 Overvoltage threshold
+in1_lowest		Lowest measured voltage
+in1_highest		Highest measured voltage
+in1_reset_history	Write 1 to reset in1 history.
+			Also clears over/undervoltage fault logs.
+in1_min_alarm		Undervoltage alarm
+in1_max_alarm           Overvoltage alarm
+in1_lcrit_alarm         Critical Undervoltage alarm
+in1_crit_alarm          Critical Overvoltage alarm
+in1_enable		Enable/Disable VDD monitoring
+in1_label		Channel label (VDD)
+
+in2_input		GPIO voltage (mV)
+in2_min			Undervoltage threshold
+in2_max			Overvoltage threshold
+in2_lowest		Lowest measured voltage
+in2_highest		Highest measured voltage
+in2_reset_history	Write 1 to reset in2 history
+in2_min_alarm		Undervoltage alarm
+in2_max_alarm		Overvoltage alarm
+in2_label		Channel label (VGPIO)
+
+curr1_input		Sense current (mA)
+curr1_min		Undercurrent threshold
+curr1_max		Overcurrent threshold
+curr1_lowest		Lowest measured current
+curr1_highest		Highest measured current
+curr1_reset_history	Write 1 to reset curr1 history.
+			Also clears overcurrent fault logs.
+curr1_min_alarm		Undercurrent alarm
+curr1_max_alarm		Overcurrent alarm
+curr1_crit_alarm        Critical Overcurrent alarm
+curr1_label		Channel label (ISENSE)
+
+power1_input		Power (in uW)
+power1_min		Low power threshold
+power1_max		High power threshold
+power1_input_lowest	Historical minimum power use
+power1_input_highest	Historical maximum power use
+power1_reset_history	Write 1 to reset power1 history.
+			Also clears power bad fault logs.
+power1_min_alarm	Low power alarm
+power1_max_alarm	High power alarm
+power1_label		Channel label (Power)
+
+energy1_input		Measured energy over time (in microJoule)
+energy1_enable		Enable/Disable Energy accumulation
+======================= ==========================================
+
+DebugFs entries
+_______________
+
+The chip also has a fault log register where failures can be logged. Hence,
+as these are logging events, we give access to them in debugfs. Note that
+even if some failure is detected in these logs, it does necessarily mean
+that the failure is still present. As mentioned in the proper Sysfs entries,
+these logs can be cleared by writing in the proper reset_history attribute.
+
+.. warning:: The debugfs interface is subject to change without notice
+             and is only available when the kernel is compiled with
+             ``CONFIG_DEBUG_FS`` defined.
+
+``/sys/kernel/debug/ltc4282-hwmon[X]/``
+contains the following attributes:
+
+=======================  ==========================================
+power1_bad_fault_log     Set to 1 by a power1 bad fault occurring.
+in0_fet_short_fault_log	 Set to 1 when the ADC detects a FET-short fault.
+in0_fet_bad_fault_log    Set to 1 when a FET-BAD fault occurs.
+in1_crit_fault_log       Set to 1 by a VDD overvoltage fault occurring.
+in1_lcrit_fault_log      Set to 1 by a VDD undervoltage fault occurring.
+curr1_crit_fault_log	 Set to 1 by an overcurrent fault occurring.
+=======================  ==========================================
diff --git a/MAINTAINERS b/MAINTAINERS
index 3c5f7ae166f0f0..e49eea8b18066b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -12764,6 +12764,8 @@ M:	Nuno Sa <nuno.sa@analog.com>
 L:	linux-hwmon@vger.kernel.org
 S:	Supported
 F:	Documentation/devicetree/bindings/hwmon/adi,ltc4282.yaml
+F:	Documentation/hwmon/ltc4282.rst
+F:	drivers/hwmon/ltc4282.c
 
 LTC4286 HARDWARE MONITOR DRIVER
 M:	Delphine CC Chiu <Delphine_CC_Chiu@Wiwynn.com>
diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
index a608264da87df8..f6160cc7007773 100644
--- a/drivers/hwmon/Kconfig
+++ b/drivers/hwmon/Kconfig
@@ -1038,6 +1038,17 @@ config SENSORS_LTC4261
 	  This driver can also be built as a module. If so, the module will
 	  be called ltc4261.
 
+config SENSORS_LTC4282
+	tristate "Analog Devices LTC4282"
+	depends on I2C
+	select REGMAP_I2C
+	help
+	  If you say yes here you get support for Analog Devices LTC4282
+	  High Current Hot Swap Controller I2C interface.
+
+	  This driver can also be built as a module. If so, the module will
+	  be called ltc4282.
+
 config SENSORS_LTQ_CPUTEMP
 	bool "Lantiq cpu temperature sensor driver"
 	depends on SOC_XWAY
diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile
index 47be39af5c0381..8bfc422a29e532 100644
--- a/drivers/hwmon/Makefile
+++ b/drivers/hwmon/Makefile
@@ -136,6 +136,7 @@ obj-$(CONFIG_SENSORS_LTC4222)	+= ltc4222.o
 obj-$(CONFIG_SENSORS_LTC4245)	+= ltc4245.o
 obj-$(CONFIG_SENSORS_LTC4260)	+= ltc4260.o
 obj-$(CONFIG_SENSORS_LTC4261)	+= ltc4261.o
+obj-$(CONFIG_SENSORS_LTC4282)	+= ltc4282.o
 obj-$(CONFIG_SENSORS_LTQ_CPUTEMP) += ltq-cputemp.o
 obj-$(CONFIG_SENSORS_MAX1111)	+= max1111.o
 obj-$(CONFIG_SENSORS_MAX127)	+= max127.o
diff --git a/drivers/hwmon/ltc4282.c b/drivers/hwmon/ltc4282.c
new file mode 100644
index 00000000000000..cf4c9cfec8b42b
--- /dev/null
+++ b/drivers/hwmon/ltc4282.c
@@ -0,0 +1,1784 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Analog Devices LTC4282 I2C High Current Hot Swap Controller over I2C
+ *
+ * Copyright 2023 Analog Devices Inc.
+ */
+#include <linux/bitfield.h>
+#include <linux/cleanup.h>
+#include <linux/clk.h>
+#include <linux/clk-provider.h>
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/hwmon.h>
+#include <linux/hwmon-sysfs.h>
+#include <linux/i2c.h>
+#include <linux/math.h>
+#include <linux/minmax.h>
+#include <linux/module.h>
+#include <linux/mod_devicetable.h>
+#include <linux/mutex.h>
+#include <linux/regmap.h>
+#include <linux/property.h>
+#include <linux/string.h>
+#include <linux/units.h>
+#include <linux/util_macros.h>
+
+#define LTC4282_CTRL_LSB			0x00
+  #define LTC4282_CTRL_OV_RETRY_MASK		BIT(0)
+  #define LTC4282_CTRL_UV_RETRY_MASK		BIT(1)
+  #define LTC4282_CTRL_OC_RETRY_MASK		BIT(2)
+  #define LTC4282_CTRL_ON_ACTIVE_LOW_MASK	BIT(5)
+  #define LTC4282_CTRL_ON_DELAY_MASK		BIT(6)
+#define LTC4282_CTRL_MSB			0x01
+  #define LTC4282_CTRL_VIN_MODE_MASK		GENMASK(1, 0)
+  #define LTC4282_CTRL_OV_MODE_MASK		GENMASK(3, 2)
+  #define LTC4282_CTRL_UV_MODE_MASK		GENMASK(5, 4)
+#define LTC4282_FAULT_LOG			0x04
+  #define LTC4282_OV_FAULT_MASK			BIT(0)
+  #define LTC4282_UV_FAULT_MASK			BIT(1)
+  #define LTC4282_VDD_FAULT_MASK \
+		(LTC4282_OV_FAULT_MASK | LTC4282_UV_FAULT_MASK)
+  #define LTC4282_OC_FAULT_MASK			BIT(2)
+  #define LTC4282_POWER_BAD_FAULT_MASK		BIT(3)
+  #define LTC4282_FET_SHORT_FAULT_MASK		BIT(5)
+  #define LTC4282_FET_BAD_FAULT_MASK		BIT(6)
+  #define LTC4282_FET_FAILURE_FAULT_MASK \
+		(LTC4282_FET_SHORT_FAULT_MASK | LTC4282_FET_BAD_FAULT_MASK)
+#define LTC4282_ADC_ALERT_LOG			0x05
+  #define LTC4282_GPIO_ALARM_L_MASK		BIT(0)
+  #define LTC4282_GPIO_ALARM_H_MASK		BIT(1)
+  #define LTC4282_VSOURCE_ALARM_L_MASK		BIT(2)
+  #define LTC4282_VSOURCE_ALARM_H_MASK		BIT(3)
+  #define LTC4282_VSENSE_ALARM_L_MASK		BIT(4)
+  #define LTC4282_VSENSE_ALARM_H_MASK		BIT(5)
+  #define LTC4282_POWER_ALARM_L_MASK		BIT(6)
+  #define LTC4282_POWER_ALARM_H_MASK		BIT(7)
+#define LTC4282_FET_BAD_FAULT_TIMEOUT		0x06
+  #define LTC4282_FET_BAD_MAX_TIMEOUT		255
+#define LTC4282_GPIO_CONFIG			0x07
+  #define LTC4282_GPIO_2_FET_STRESS_MASK	BIT(1)
+  #define LTC4282_GPIO_1_CONFIG_MASK		GENMASK(5, 4)
+#define LTC4282_VGPIO_MIN			0x08
+#define LTC4282_VGPIO_MAX			0x09
+#define LTC4282_VSOURCE_MIN			0x0a
+#define LTC4282_VSOURCE_MAX			0x0b
+#define LTC4282_VSENSE_MIN			0x0c
+#define LTC4282_VSENSE_MAX			0x0d
+#define LTC4282_POWER_MIN			0x0e
+#define LTC4282_POWER_MAX			0x0f
+#define LTC4282_CLK_DIV				0x10
+  #define LTC4282_CLK_DIV_MASK			GENMASK(4, 0)
+  #define LTC4282_CLKOUT_MASK			GENMASK(6, 5)
+#define LTC4282_ILIM_ADJUST			0x11
+  #define LTC4282_GPIO_MODE_MASK		BIT(1)
+  #define LTC4282_VDD_MONITOR_MASK		BIT(2)
+  #define LTC4282_FOLDBACK_MODE_MASK		GENMASK(4, 3)
+  #define LTC4282_ILIM_ADJUST_MASK		GENMASK(7, 5)
+#define LTC4282_ENERGY				0x12
+#define LTC4282_TIME_COUNTER			0x18
+#define LTC4282_ALERT_CTRL			0x1c
+  #define LTC4282_ALERT_OUT_MASK		BIT(6)
+#define LTC4282_ADC_CTRL			0x1d
+  #define LTC4282_FAULT_LOG_EN_MASK		BIT(2)
+  #define LTC4282_METER_HALT_MASK		BIT(5)
+  #define LTC4282_METER_RESET_MASK		BIT(6)
+  #define LTC4282_RESET_MASK			BIT(7)
+#define LTC4282_STATUS_LSB			0x1e
+  #define LTC4282_OV_STATUS_MASK		BIT(0)
+  #define LTC4282_UV_STATUS_MASK		BIT(1)
+  #define LTC4282_VDD_STATUS_MASK \
+		(LTC4282_OV_STATUS_MASK | LTC4282_UV_STATUS_MASK)
+  #define LTC4282_OC_STATUS_MASK		BIT(2)
+  #define LTC4282_POWER_GOOD_MASK		BIT(3)
+  #define LTC4282_FET_FAILURE_MASK		GENMASK(6, 5)
+#define LTC4282_STATUS_MSB			0x1f
+#define LTC4282_RESERVED_1			0x32
+#define LTC4282_RESERVED_2			0x33
+#define LTC4282_VGPIO				0x34
+#define LTC4282_VGPIO_LOWEST			0x36
+#define LTC4282_VGPIO_HIGHEST			0x38
+#define LTC4282_VSOURCE				0x3a
+#define LTC4282_VSOURCE_LOWEST			0x3c
+#define LTC4282_VSOURCE_HIGHEST			0x3e
+#define LTC4282_VSENSE				0x40
+#define LTC4282_VSENSE_LOWEST			0x42
+#define LTC4282_VSENSE_HIGHEST			0x44
+#define LTC4282_POWER				0x46
+#define LTC4282_POWER_LOWEST			0x48
+#define LTC4282_POWER_HIGHEST			0x4a
+#define LTC4282_RESERVED_3			0x50
+
+#define LTC4282_CLKIN_MIN	(250 * KILO)
+#define LTC4282_CLKIN_MAX	(15500 * KILO)
+#define LTC4282_CLKIN_RANGE	(LTC4282_CLKIN_MAX - LTC4282_CLKIN_MIN + 1)
+#define LTC4282_CLKOUT_SYSTEM	(250 * KILO)
+#define LTC4282_CLKOUT_CNV	15
+
+enum {
+	LTC4282_CHAN_VSOURCE,
+	LTC4282_CHAN_VDD,
+	LTC4282_CHAN_VGPIO,
+};
+
+struct ltc4282_cache {
+	u32 in_max_raw;
+	u32 in_min_raw;
+	long in_highest;
+	long in_lowest;
+	bool en;
+};
+
+struct ltc4282_state {
+	struct regmap *map;
+	/* Protect against multiple accesses to the device registers */
+	struct mutex lock;
+	struct clk_hw clk_hw;
+	/*
+	 * Used to cache values for VDD/VSOURCE depending which will be used
+	 * when hwmon is not enabled for that channel. Needed because they share
+	 * the same registers.
+	 */
+	struct ltc4282_cache in0_1_cache[LTC4282_CHAN_VGPIO];
+	u32 vsense_max;
+	long power_max;
+	u32 rsense;
+	u16 vdd;
+	u16 vfs_out;
+	bool energy_en;
+};
+
+enum {
+	LTC4282_CLKOUT_NONE,
+	LTC4282_CLKOUT_INT,
+	LTC4282_CLKOUT_TICK,
+};
+
+static int ltc4282_set_rate(struct clk_hw *hw,
+			    unsigned long rate, unsigned long parent_rate)
+{
+	struct ltc4282_state *st = container_of(hw, struct ltc4282_state,
+						clk_hw);
+	u32 val = LTC4282_CLKOUT_INT;
+
+	if (rate == LTC4282_CLKOUT_CNV)
+		val = LTC4282_CLKOUT_TICK;
+
+	return regmap_update_bits(st->map, LTC4282_CLK_DIV, LTC4282_CLKOUT_MASK,
+				  FIELD_PREP(LTC4282_CLKOUT_MASK, val));
+}
+
+/*
+ * Note the 15HZ conversion rate assumes 12bit ADC which is what we are
+ * supporting for now.
+ */
+static const unsigned int ltc4282_out_rates[] = {
+	LTC4282_CLKOUT_CNV, LTC4282_CLKOUT_SYSTEM
+};
+
+static long ltc4282_round_rate(struct clk_hw *hw, unsigned long rate,
+			       unsigned long *parent_rate)
+{
+	int idx = find_closest(rate, ltc4282_out_rates,
+			       ARRAY_SIZE(ltc4282_out_rates));
+
+	return ltc4282_out_rates[idx];
+}
+
+static unsigned long ltc4282_recalc_rate(struct clk_hw *hw,
+					 unsigned long parent)
+{
+	struct ltc4282_state *st = container_of(hw, struct ltc4282_state,
+						clk_hw);
+	u32 clkdiv;
+	int ret;
+
+	ret = regmap_read(st->map, LTC4282_CLK_DIV, &clkdiv);
+	if (ret)
+		return 0;
+
+	clkdiv = FIELD_GET(LTC4282_CLKOUT_MASK, clkdiv);
+	if (!clkdiv)
+		return 0;
+	if (clkdiv == LTC4282_CLKOUT_INT)
+		return LTC4282_CLKOUT_SYSTEM;
+
+	return LTC4282_CLKOUT_CNV;
+}
+
+static void ltc4282_disable(struct clk_hw *clk_hw)
+{
+	struct ltc4282_state *st = container_of(clk_hw, struct ltc4282_state,
+						clk_hw);
+
+	regmap_clear_bits(st->map, LTC4282_CLK_DIV, LTC4282_CLKOUT_MASK);
+}
+
+static int ltc4282_read_voltage_word(const struct ltc4282_state *st, u32 reg,
+				     u32 fs, long *val)
+{
+	__be16 in;
+	int ret;
+
+	ret = regmap_bulk_read(st->map, reg, &in, sizeof(in));
+	if (ret)
+		return ret;
+
+	/*
+	 * This is also used to calculate current in which case fs comes in
+	 * 10 * uV. Hence the ULL usage.
+	 */
+	*val = DIV_ROUND_CLOSEST_ULL(be16_to_cpu(in) * (u64)fs, U16_MAX);
+	return 0;
+}
+
+static int ltc4282_read_voltage_byte_cached(const struct ltc4282_state *st,
+					    u32 reg, u32 fs, long *val,
+					    u32 *cached_raw)
+{
+	int ret;
+	u32 in;
+
+	if (cached_raw) {
+		in = *cached_raw;
+	} else {
+		ret = regmap_read(st->map, reg, &in);
+		if (ret)
+			return ret;
+	}
+
+	*val = DIV_ROUND_CLOSEST(in * fs, U8_MAX);
+	return 0;
+}
+
+static int ltc4282_read_voltage_byte(const struct ltc4282_state *st, u32 reg,
+				     u32 fs, long *val)
+{
+	return ltc4282_read_voltage_byte_cached(st, reg, fs, val, NULL);
+}
+
+static int __ltc4282_read_alarm(struct ltc4282_state *st, u32 reg, u32 mask,
+				long *val)
+{
+	u32 alarm;
+	int ret;
+
+	ret = regmap_read(st->map, reg, &alarm);
+	if (ret)
+		return ret;
+
+	*val = !!(alarm & mask);
+
+	/* if not status/fault logs, clear the alarm after reading it */
+	if (reg != LTC4282_STATUS_LSB && reg != LTC4282_FAULT_LOG)
+		return regmap_clear_bits(st->map, reg, mask);
+
+	return 0;
+}
+
+static int ltc4282_read_alarm(struct ltc4282_state *st, u32 reg, u32 mask,
+			      long *val)
+{
+	guard(mutex)(&st->lock);
+	return __ltc4282_read_alarm(st, reg, mask, val);
+}
+
+static int ltc4282_vdd_source_read_in(struct ltc4282_state *st, u32 channel,
+				      long *val)
+{
+	guard(mutex)(&st->lock);
+	if (!st->in0_1_cache[channel].en)
+		return -ENODATA;
+
+	return ltc4282_read_voltage_word(st, LTC4282_VSOURCE, st->vfs_out, val);
+}
+
+static int ltc4282_vdd_source_read_hist(struct ltc4282_state *st, u32 reg,
+					u32 channel, long *cached, long *val)
+{
+	int ret;
+
+	guard(mutex)(&st->lock);
+	if (!st->in0_1_cache[channel].en) {
+		*val = *cached;
+		return 0;
+	}
+
+	ret = ltc4282_read_voltage_word(st, reg, st->vfs_out, val);
+	if (ret)
+		return ret;
+
+	*cached = *val;
+	return 0;
+}
+
+static int ltc4282_vdd_source_read_lim(struct ltc4282_state *st, u32 reg,
+				       u32 channel, u32 *cached, long *val)
+{
+	guard(mutex)(&st->lock);
+	if (!st->in0_1_cache[channel].en)
+		return ltc4282_read_voltage_byte_cached(st, reg, st->vfs_out,
+							val, cached);
+
+	return ltc4282_read_voltage_byte(st, reg, st->vfs_out, val);
+}
+
+static int ltc4282_vdd_source_read_alm(struct ltc4282_state *st, u32 mask,
+				       u32 channel, long *val)
+{
+	guard(mutex)(&st->lock);
+	if (!st->in0_1_cache[channel].en) {
+		/*
+		 * Do this otherwise alarms can get confused because we clear
+		 * them after reading them. So, if someone mistakenly reads
+		 * VSOURCE right before VDD (or the other way around), we might
+		 * get no alarm just because it was cleared when reading VSOURCE
+		 * and had no time for a new conversion and thus having the
+		 * alarm again.
+		 */
+		*val = 0;
+		return 0;
+	}
+
+	return __ltc4282_read_alarm(st, LTC4282_ADC_ALERT_LOG, mask, val);
+}
+
+static int ltc4282_read_in(struct ltc4282_state *st, u32 attr, long *val,
+			   u32 channel)
+{
+	switch (attr) {
+	case hwmon_in_input:
+		if (channel == LTC4282_CHAN_VGPIO)
+			return ltc4282_read_voltage_word(st, LTC4282_VGPIO,
+							 1280, val);
+
+		return ltc4282_vdd_source_read_in(st, channel, val);
+	case hwmon_in_highest:
+		if (channel == LTC4282_CHAN_VGPIO)
+			return ltc4282_read_voltage_word(st,
+							 LTC4282_VGPIO_HIGHEST,
+							 1280, val);
+
+		return ltc4282_vdd_source_read_hist(st, LTC4282_VSOURCE_HIGHEST,
+						    channel,
+						    &st->in0_1_cache[channel].in_highest, val);
+	case hwmon_in_lowest:
+		if (channel == LTC4282_CHAN_VGPIO)
+			return ltc4282_read_voltage_word(st, LTC4282_VGPIO_LOWEST,
+							 1280, val);
+
+		return ltc4282_vdd_source_read_hist(st, LTC4282_VSOURCE_LOWEST,
+						    channel,
+						    &st->in0_1_cache[channel].in_lowest, val);
+	case hwmon_in_max_alarm:
+		if (channel == LTC4282_CHAN_VGPIO)
+			return ltc4282_read_alarm(st, LTC4282_ADC_ALERT_LOG,
+						  LTC4282_GPIO_ALARM_H_MASK,
+						  val);
+
+		return ltc4282_vdd_source_read_alm(st,
+						   LTC4282_VSOURCE_ALARM_H_MASK,
+						   channel, val);
+	case hwmon_in_min_alarm:
+		if (channel == LTC4282_CHAN_VGPIO)
+			ltc4282_read_alarm(st, LTC4282_ADC_ALERT_LOG,
+					   LTC4282_GPIO_ALARM_L_MASK, val);
+
+		return ltc4282_vdd_source_read_alm(st,
+						   LTC4282_VSOURCE_ALARM_L_MASK,
+						   channel, val);
+	case hwmon_in_crit_alarm:
+		return ltc4282_read_alarm(st, LTC4282_STATUS_LSB,
+					  LTC4282_OV_STATUS_MASK, val);
+	case hwmon_in_lcrit_alarm:
+		return ltc4282_read_alarm(st, LTC4282_STATUS_LSB,
+					  LTC4282_UV_STATUS_MASK, val);
+	case hwmon_in_max:
+		if (channel == LTC4282_CHAN_VGPIO)
+			return ltc4282_read_voltage_byte(st, LTC4282_VGPIO_MAX,
+							 1280, val);
+
+		return ltc4282_vdd_source_read_lim(st, LTC4282_VSOURCE_MAX,
+						   channel,
+						   &st->in0_1_cache[channel].in_max_raw, val);
+	case hwmon_in_min:
+		if (channel == LTC4282_CHAN_VGPIO)
+			return ltc4282_read_voltage_byte(st, LTC4282_VGPIO_MIN,
+							 1280, val);
+
+		return ltc4282_vdd_source_read_lim(st, LTC4282_VSOURCE_MIN,
+						   channel,
+						   &st->in0_1_cache[channel].in_min_raw, val);
+	case hwmon_in_enable:
+		scoped_guard(mutex, &st->lock) {
+			*val = st->in0_1_cache[channel].en;
+		}
+		return 0;
+	case hwmon_in_fault:
+		/*
+		 * We report failure if we detect either a fer_bad or a
+		 * fet_short in the status register.
+		 */
+		return ltc4282_read_alarm(st, LTC4282_STATUS_LSB,
+					  LTC4282_FET_FAILURE_MASK, val);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int ltc4282_read_current_word(const struct ltc4282_state *st, u32 reg,
+				     long *val)
+{
+	long in;
+	int ret;
+
+	/*
+	 * We pass in full scale in 10 * micro (note that 40 is already
+	 * millivolt) so we have better approximations to calculate current.
+	 */
+	ret = ltc4282_read_voltage_word(st, reg, DECA * 40 * MILLI, &in);
+	if (ret)
+		return ret;
+
+	*val = DIV_ROUND_CLOSEST(in * MILLI, st->rsense);
+
+	return 0;
+}
+
+static int ltc4282_read_current_byte(const struct ltc4282_state *st, u32 reg,
+				     long *val)
+{
+	long in;
+	int ret;
+
+	ret = ltc4282_read_voltage_byte(st, reg, DECA * 40 * MILLI, &in);
+	if (ret)
+		return ret;
+
+	*val = DIV_ROUND_CLOSEST(in * MILLI, st->rsense);
+
+	return 0;
+}
+
+static int ltc4282_read_curr(struct ltc4282_state *st, const u32 attr,
+			     long *val)
+{
+	switch (attr) {
+	case hwmon_curr_input:
+		return ltc4282_read_current_word(st, LTC4282_VSENSE, val);
+	case hwmon_curr_highest:
+		return ltc4282_read_current_word(st, LTC4282_VSENSE_HIGHEST,
+						 val);
+	case hwmon_curr_lowest:
+		return ltc4282_read_current_word(st, LTC4282_VSENSE_LOWEST,
+						 val);
+	case hwmon_curr_max:
+		return ltc4282_read_current_byte(st, LTC4282_VSENSE_MAX, val);
+	case hwmon_curr_min:
+		return ltc4282_read_current_byte(st, LTC4282_VSENSE_MIN, val);
+	case hwmon_curr_max_alarm:
+		return ltc4282_read_alarm(st, LTC4282_ADC_ALERT_LOG,
+					  LTC4282_VSENSE_ALARM_H_MASK, val);
+	case hwmon_curr_min_alarm:
+		return ltc4282_read_alarm(st, LTC4282_ADC_ALERT_LOG,
+					  LTC4282_VSENSE_ALARM_L_MASK, val);
+	case hwmon_curr_crit_alarm:
+		return ltc4282_read_alarm(st, LTC4282_STATUS_LSB,
+					  LTC4282_OC_STATUS_MASK, val);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int ltc4282_read_power_word(const struct ltc4282_state *st, u32 reg,
+				   long *val)
+{
+	u64 temp =  DECA * 40ULL * st->vfs_out * BIT(16), temp_2;
+	__be16 raw;
+	u16 power;
+	int ret;
+
+	ret = regmap_bulk_read(st->map, reg, &raw, sizeof(raw));
+	if (ret)
+		return ret;
+
+	power = be16_to_cpu(raw);
+	/*
+	 * Power is given by:
+	 *     P = CODE(16b) * 0.040 * Vfs(out) * 2^16 / ((2^16 - 1)^2 * Rsense)
+	 */
+	if (check_mul_overflow(power * temp, MICRO, &temp_2)) {
+		temp = DIV_ROUND_CLOSEST_ULL(power * temp, U16_MAX);
+		*val = DIV64_U64_ROUND_CLOSEST(temp * MICRO,
+					       U16_MAX * (u64)st->rsense);
+		return 0;
+	}
+
+	*val = DIV64_U64_ROUND_CLOSEST(temp_2,
+				       st->rsense * int_pow(U16_MAX, 2));
+
+	return 0;
+}
+
+static int ltc4282_read_power_byte(const struct ltc4282_state *st, u32 reg,
+				   long *val)
+{
+	u32 power;
+	u64 temp;
+	int ret;
+
+	ret = regmap_read(st->map, reg, &power);
+	if (ret)
+		return ret;
+
+	temp = power * 40 * DECA * st->vfs_out * BIT_ULL(8);
+	*val = DIV64_U64_ROUND_CLOSEST(temp * MICRO,
+				       int_pow(U8_MAX, 2) * st->rsense);
+
+	return 0;
+}
+
+static int ltc4282_read_energy(const struct ltc4282_state *st, u64 *val)
+{
+	u64 temp, energy;
+	__be64 raw;
+	int ret;
+
+	ret = regmap_bulk_read(st->map, LTC4282_ENERGY, &raw, 6);
+	if (ret)
+		return ret;
+
+	energy =  be64_to_cpu(raw) >> 16;
+	/*
+	 * The formula for energy is given by:
+	 *	E = CODE(48b) * 0.040 * Vfs(out) * Tconv * 256 /
+	 *						((2^16 - 1)^2 * Rsense)
+	 *
+	 * Since we only support 12bit ADC, Tconv = 0.065535s. Passing Vfs(out)
+	 * and 0.040 to mV and Tconv to us, we can simplify the formula to:
+	 *	E = CODE(48b) * 40 * Vfs(out) * 256 / (U16_MAX * Rsense)
+	 *
+	 * As Rsense can have tenths of micro-ohm resolution, we need to
+	 * multiply by DECA to get microujoule.
+	 */
+	if (check_mul_overflow(DECA * st->vfs_out * 40 * BIT(8), energy, &temp)) {
+		temp = DIV_ROUND_CLOSEST(DECA * st->vfs_out * 40 * BIT(8), U16_MAX);
+		*val = DIV_ROUND_CLOSEST_ULL(temp * energy, st->rsense);
+		return 0;
+	}
+
+	*val = DIV64_U64_ROUND_CLOSEST(temp, U16_MAX * (u64)st->rsense);
+
+	return 0;
+}
+
+static int ltc4282_read_power(struct ltc4282_state *st, const u32 attr,
+			      long *val)
+{
+	switch (attr) {
+	case hwmon_power_input:
+		return ltc4282_read_power_word(st, LTC4282_POWER, val);
+	case hwmon_power_input_highest:
+		return ltc4282_read_power_word(st, LTC4282_POWER_HIGHEST, val);
+	case hwmon_power_input_lowest:
+		return ltc4282_read_power_word(st, LTC4282_POWER_LOWEST, val);
+	case hwmon_power_max_alarm:
+		return ltc4282_read_alarm(st, LTC4282_ADC_ALERT_LOG,
+					  LTC4282_POWER_ALARM_H_MASK, val);
+	case hwmon_power_min_alarm:
+		return ltc4282_read_alarm(st, LTC4282_ADC_ALERT_LOG,
+					  LTC4282_POWER_ALARM_L_MASK, val);
+	case hwmon_power_max:
+		return ltc4282_read_power_byte(st, LTC4282_POWER_MAX, val);
+	case hwmon_power_min:
+		return ltc4282_read_power_byte(st, LTC4282_POWER_MIN, val);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int ltc4282_read(struct device *dev, enum hwmon_sensor_types type,
+			u32 attr, int channel, long *val)
+{
+	struct ltc4282_state *st = dev_get_drvdata(dev);
+
+	switch (type) {
+	case hwmon_in:
+		return ltc4282_read_in(st, attr, val, channel);
+	case hwmon_curr:
+		return ltc4282_read_curr(st, attr, val);
+	case hwmon_power:
+		return ltc4282_read_power(st, attr, val);
+	case hwmon_energy:
+		scoped_guard(mutex, &st->lock) {
+			*val = st->energy_en;
+		}
+		return 0;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int ltc4282_write_power_byte(const struct ltc4282_state *st, u32 reg,
+				    long val)
+{
+	u32 power;
+	u64 temp;
+
+	if (val > st->power_max)
+		val = st->power_max;
+
+	temp = val * int_pow(U8_MAX, 2) * st->rsense;
+	power = DIV64_U64_ROUND_CLOSEST(temp,
+					MICRO * DECA * 256ULL * st->vfs_out * 40);
+
+	return regmap_write(st->map, reg, power);
+}
+
+static int ltc4282_write_power_word(const struct ltc4282_state *st, u32 reg,
+				    long val)
+{
+	u64 temp = int_pow(U16_MAX, 2) * st->rsense, temp_2;
+	__be16 __raw;
+	u16 code;
+
+	if (check_mul_overflow(temp, val, &temp_2)) {
+		temp = DIV_ROUND_CLOSEST_ULL(temp, DECA * MICRO);
+		code = DIV64_U64_ROUND_CLOSEST(temp * val,
+					       40ULL * BIT(16) * st->vfs_out);
+	} else {
+		temp =  DECA * MICRO * 40ULL * BIT(16) * st->vfs_out;
+		code = DIV64_U64_ROUND_CLOSEST(temp_2, temp);
+	}
+
+	__raw = cpu_to_be16(code);
+	return regmap_bulk_write(st->map, reg, &__raw, sizeof(__raw));
+}
+
+static int __ltc4282_in_write_history(const struct ltc4282_state *st, u32 reg,
+				      long lowest, long highest, u32 fs)
+{
+	__be16 __raw;
+	u16 tmp;
+	int ret;
+
+	tmp = DIV_ROUND_CLOSEST(U16_MAX * lowest, fs);
+
+	__raw = cpu_to_be16(tmp);
+
+	ret = regmap_bulk_write(st->map, reg, &__raw, 2);
+	if (ret)
+		return ret;
+
+	tmp = DIV_ROUND_CLOSEST(U16_MAX * highest, fs);
+
+	__raw = cpu_to_be16(tmp);
+
+	return regmap_bulk_write(st->map, reg + 2, &__raw, 2);
+}
+
+static int ltc4282_in_write_history(struct ltc4282_state *st, u32 reg,
+				    long lowest, long highest, u32 fs)
+{
+	guard(mutex)(&st->lock);
+	return __ltc4282_in_write_history(st, reg, lowest, highest, fs);
+}
+
+static int ltc4282_power_reset_hist(struct ltc4282_state *st)
+{
+	int ret;
+
+	guard(mutex)(&st->lock);
+
+	ret = ltc4282_write_power_word(st, LTC4282_POWER_LOWEST,
+				       st->power_max);
+	if (ret)
+		return ret;
+
+	ret = ltc4282_write_power_word(st, LTC4282_POWER_HIGHEST, 0);
+	if (ret)
+		return ret;
+
+	/* now, let's also clear possible power_bad fault logs */
+	return regmap_clear_bits(st->map, LTC4282_FAULT_LOG,
+				 LTC4282_POWER_BAD_FAULT_MASK);
+}
+
+static int ltc4282_write_power(struct ltc4282_state *st, u32 attr,
+			       long val)
+{
+	switch (attr) {
+	case hwmon_power_max:
+		return ltc4282_write_power_byte(st, LTC4282_POWER_MAX, val);
+	case hwmon_power_min:
+		return ltc4282_write_power_byte(st, LTC4282_POWER_MIN, val);
+	case hwmon_power_reset_history:
+		return ltc4282_power_reset_hist(st);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int ltc4282_write_voltage_byte_cached(const struct ltc4282_state *st,
+					     u32 reg, u32 fs, long val,
+					     u32 *cache_raw)
+{
+	u32 in;
+
+	if (val >= fs)
+		in = U8_MAX;
+	else
+		in = DIV_ROUND_CLOSEST(val * U8_MAX, fs);
+
+	if (cache_raw) {
+		*cache_raw = in;
+		return 0;
+	}
+
+	return regmap_write(st->map, reg, in);
+}
+
+static int ltc4282_write_voltage_byte(const struct ltc4282_state *st, u32 reg,
+				      u32 fs, long val)
+{
+	return ltc4282_write_voltage_byte_cached(st, reg, fs, val, NULL);
+}
+
+static int ltc4282_cache_history(struct ltc4282_state *st, u32 channel)
+{
+	long val;
+	int ret;
+
+	ret = ltc4282_read_voltage_word(st, LTC4282_VSOURCE_LOWEST, st->vfs_out,
+					&val);
+	if (ret)
+		return ret;
+
+	st->in0_1_cache[channel].in_lowest = val;
+
+	ret = ltc4282_read_voltage_word(st, LTC4282_VSOURCE_HIGHEST,
+					st->vfs_out, &val);
+	if (ret)
+		return ret;
+
+	st->in0_1_cache[channel].in_highest = val;
+
+	ret = regmap_read(st->map, LTC4282_VSOURCE_MIN,
+			  &st->in0_1_cache[channel].in_min_raw);
+	if (ret)
+		return ret;
+
+	return regmap_read(st->map, LTC4282_VSOURCE_MAX,
+			  &st->in0_1_cache[channel].in_max_raw);
+}
+
+static int ltc4282_cache_sync(struct ltc4282_state *st, u32 channel)
+{
+	int ret;
+
+	ret = __ltc4282_in_write_history(st, LTC4282_VSOURCE_LOWEST,
+					 st->in0_1_cache[channel].in_lowest,
+					 st->in0_1_cache[channel].in_highest,
+					 st->vfs_out);
+	if (ret)
+		return ret;
+
+	ret = regmap_write(st->map, LTC4282_VSOURCE_MIN,
+			   st->in0_1_cache[channel].in_min_raw);
+	if (ret)
+		return ret;
+
+	return regmap_write(st->map, LTC4282_VSOURCE_MAX,
+			    st->in0_1_cache[channel].in_max_raw);
+}
+
+static int ltc4282_vdd_source_write_lim(struct ltc4282_state *st, u32 reg,
+					int channel, u32 *cache, long val)
+{
+	int ret;
+
+	guard(mutex)(&st->lock);
+	if (st->in0_1_cache[channel].en)
+		ret = ltc4282_write_voltage_byte(st, reg, st->vfs_out, val);
+	else
+		ret = ltc4282_write_voltage_byte_cached(st, reg, st->vfs_out,
+							val, cache);
+
+	return ret;
+}
+
+static int ltc4282_vdd_source_reset_hist(struct ltc4282_state *st, int channel)
+{
+	long lowest = st->vfs_out;
+	int ret;
+
+	if (channel == LTC4282_CHAN_VDD)
+		lowest = st->vdd;
+
+	guard(mutex)(&st->lock);
+	if (st->in0_1_cache[channel].en) {
+		ret = __ltc4282_in_write_history(st, LTC4282_VSOURCE_LOWEST,
+						 lowest, 0, st->vfs_out);
+		if (ret)
+			return ret;
+	}
+
+	st->in0_1_cache[channel].in_lowest = lowest;
+	st->in0_1_cache[channel].in_highest = 0;
+
+	/*
+	 * We are also clearing possible fault logs in reset_history. Clearing
+	 * the logs might be important when the auto retry bits are not enabled
+	 * as the chip only enables the output again after having these logs
+	 * cleared. As some of these logs are related to limits, it makes sense
+	 * to clear them in here. For VDD, we need to clear under/over voltage
+	 * events. For VSOURCE, fet_short and fet_bad...
+	 */
+	if (channel == LTC4282_CHAN_VSOURCE)
+		return regmap_clear_bits(st->map, LTC4282_FAULT_LOG,
+					 LTC4282_FET_FAILURE_FAULT_MASK);
+
+	return regmap_clear_bits(st->map, LTC4282_FAULT_LOG,
+				 LTC4282_VDD_FAULT_MASK);
+}
+
+/*
+ * We need to mux between VSOURCE and VDD which means they are mutually
+ * exclusive. Moreover, we can't really disable both VDD and VSOURCE as the ADC
+ * is continuously running (we cannot independently halt it without also
+ * stopping VGPIO). Hence, the logic is that disabling or enabling VDD will
+ * automatically have the reverse effect on VSOURCE and vice-versa.
+ */
+static int ltc4282_vdd_source_enable(struct ltc4282_state *st, int channel,
+				     long val)
+{
+	int ret, other_chan = ~channel & 0x1;
+	u8 __val = val;
+
+	guard(mutex)(&st->lock);
+	if (st->in0_1_cache[channel].en == !!val)
+		return 0;
+
+	/* clearing the bit makes the ADC to monitor VDD */
+	if (channel == LTC4282_CHAN_VDD)
+		__val = !__val;
+
+	ret = regmap_update_bits(st->map, LTC4282_ILIM_ADJUST,
+				 LTC4282_VDD_MONITOR_MASK,
+				 FIELD_PREP(LTC4282_VDD_MONITOR_MASK, !!__val));
+	if (ret)
+		return ret;
+
+	st->in0_1_cache[channel].en = !!val;
+	st->in0_1_cache[other_chan].en = !val;
+
+	if (st->in0_1_cache[channel].en) {
+		/*
+		 * Then, we are disabling @other_chan. Let's save it's current
+		 * history.
+		 */
+		ret = ltc4282_cache_history(st, other_chan);
+		if (ret)
+			return ret;
+
+		return ltc4282_cache_sync(st, channel);
+	}
+	/*
+	 * Then, we are enabling @other_chan. We need to do the opposite from
+	 * above.
+	 */
+	ret = ltc4282_cache_history(st, channel);
+	if (ret)
+		return ret;
+
+	return ltc4282_cache_sync(st, other_chan);
+}
+
+static int ltc4282_write_in(struct ltc4282_state *st, u32 attr, long val,
+			    int channel)
+{
+	switch (attr) {
+	case hwmon_in_max:
+		if (channel == LTC4282_CHAN_VGPIO)
+			return ltc4282_write_voltage_byte(st, LTC4282_VGPIO_MAX,
+							  1280, val);
+
+		return ltc4282_vdd_source_write_lim(st, LTC4282_VSOURCE_MAX,
+						    channel,
+						    &st->in0_1_cache[channel].in_max_raw, val);
+	case hwmon_in_min:
+		if (channel == LTC4282_CHAN_VGPIO)
+			return ltc4282_write_voltage_byte(st, LTC4282_VGPIO_MIN,
+							  1280, val);
+
+		return ltc4282_vdd_source_write_lim(st, LTC4282_VSOURCE_MIN,
+						    channel,
+						    &st->in0_1_cache[channel].in_min_raw, val);
+	case hwmon_in_reset_history:
+		if (channel == LTC4282_CHAN_VGPIO)
+			return ltc4282_in_write_history(st,
+							LTC4282_VGPIO_LOWEST,
+							1280, 0, 1280);
+
+		return ltc4282_vdd_source_reset_hist(st, channel);
+	case hwmon_in_enable:
+		return ltc4282_vdd_source_enable(st, channel, val);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int ltc4282_curr_reset_hist(struct ltc4282_state *st)
+{
+	int ret;
+
+	guard(mutex)(&st->lock);
+
+	ret = __ltc4282_in_write_history(st, LTC4282_VSENSE_LOWEST,
+					 st->vsense_max, 0, 40 * MILLI);
+	if (ret)
+		return ret;
+
+	/* now, let's also clear possible overcurrent fault logs */
+	return regmap_clear_bits(st->map, LTC4282_FAULT_LOG,
+				 LTC4282_OC_FAULT_MASK);
+}
+
+static int ltc4282_write_curr(struct ltc4282_state *st, u32 attr,
+			      long val)
+{
+	/* need to pass it in millivolt */
+	u32 in = DIV_ROUND_CLOSEST_ULL((u64)val * st->rsense, DECA * MICRO);
+
+	switch (attr) {
+	case hwmon_curr_max:
+		return ltc4282_write_voltage_byte(st, LTC4282_VSENSE_MAX, 40,
+						  in);
+	case hwmon_curr_min:
+		return ltc4282_write_voltage_byte(st, LTC4282_VSENSE_MIN, 40,
+						  in);
+	case hwmon_curr_reset_history:
+		return ltc4282_curr_reset_hist(st);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int ltc4282_energy_enable_set(struct ltc4282_state *st, long val)
+{
+	int ret;
+
+	guard(mutex)(&st->lock);
+	/* setting the bit halts the meter */
+	ret = regmap_update_bits(st->map, LTC4282_ADC_CTRL,
+				 LTC4282_METER_HALT_MASK,
+				 FIELD_PREP(LTC4282_METER_HALT_MASK, !val));
+	if (ret)
+		return ret;
+
+	st->energy_en = !!val;
+
+	return 0;
+}
+
+static int ltc4282_write(struct device *dev,
+			 enum hwmon_sensor_types type,
+			 u32 attr, int channel, long val)
+{
+	struct ltc4282_state *st = dev_get_drvdata(dev);
+
+	switch (type) {
+	case hwmon_power:
+		return ltc4282_write_power(st, attr, val);
+	case hwmon_in:
+		return ltc4282_write_in(st, attr, val, channel);
+	case hwmon_curr:
+		return ltc4282_write_curr(st, attr, val);
+	case hwmon_energy:
+		return ltc4282_energy_enable_set(st, val);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static umode_t ltc4282_in_is_visible(const struct ltc4282_state *st, u32 attr)
+{
+	switch (attr) {
+	case hwmon_in_input:
+	case hwmon_in_highest:
+	case hwmon_in_lowest:
+	case hwmon_in_max_alarm:
+	case hwmon_in_min_alarm:
+	case hwmon_in_label:
+	case hwmon_in_lcrit_alarm:
+	case hwmon_in_crit_alarm:
+	case hwmon_in_fault:
+		return 0444;
+	case hwmon_in_max:
+	case hwmon_in_min:
+	case hwmon_in_enable:
+	case hwmon_in_reset_history:
+		return 0644;
+	default:
+		return 0;
+	}
+}
+
+static umode_t ltc4282_curr_is_visible(u32 attr)
+{
+	switch (attr) {
+	case hwmon_curr_input:
+	case hwmon_curr_highest:
+	case hwmon_curr_lowest:
+	case hwmon_curr_max_alarm:
+	case hwmon_curr_min_alarm:
+	case hwmon_curr_crit_alarm:
+	case hwmon_curr_label:
+		return 0444;
+	case hwmon_curr_max:
+	case hwmon_curr_min:
+	case hwmon_curr_reset_history:
+		return 0644;
+	default:
+		return 0;
+	}
+}
+
+static umode_t ltc4282_power_is_visible(u32 attr)
+{
+	switch (attr) {
+	case hwmon_power_input:
+	case hwmon_power_input_highest:
+	case hwmon_power_input_lowest:
+	case hwmon_power_label:
+	case hwmon_power_max_alarm:
+	case hwmon_power_min_alarm:
+		return 0444;
+	case hwmon_power_max:
+	case hwmon_power_min:
+	case hwmon_power_reset_history:
+		return 0644;
+	default:
+		return 0;
+	}
+}
+
+static umode_t ltc4282_is_visible(const void *data,
+				  enum hwmon_sensor_types type,
+				  u32 attr, int channel)
+{
+	switch (type) {
+	case hwmon_in:
+		return ltc4282_in_is_visible(data, attr);
+	case hwmon_curr:
+		return ltc4282_curr_is_visible(attr);
+	case hwmon_power:
+		return ltc4282_power_is_visible(attr);
+	case hwmon_energy:
+		/* hwmon_energy_enable */
+		return 0644;
+	default:
+		return 0;
+	}
+}
+
+static const char * const ltc4282_in_strs[] = {
+	"VSOURCE", "VDD", "VGPIO"
+};
+
+static int ltc4282_read_labels(struct device *dev,
+			       enum hwmon_sensor_types type,
+			       u32 attr, int channel, const char **str)
+{
+	switch (type) {
+	case hwmon_in:
+		*str = ltc4282_in_strs[channel];
+		return 0;
+	case hwmon_curr:
+		*str = "ISENSE";
+		return 0;
+	case hwmon_power:
+		*str = "Power";
+		return 0;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static ssize_t ltc4282_energy_show(struct device *dev,
+				   struct device_attribute *da, char *buf)
+{
+	struct ltc4282_state *st = dev_get_drvdata(dev);
+	u64 energy;
+	int ret;
+
+	guard(mutex)(&st->lock);
+	if (!st->energy_en)
+		return -ENODATA;
+
+	ret = ltc4282_read_energy(st, &energy);
+	if (ret < 0)
+		return ret;
+
+	return sysfs_emit(buf, "%llu\n", energy);
+}
+
+static const struct clk_ops ltc4282_ops = {
+	.recalc_rate = ltc4282_recalc_rate,
+	.round_rate = ltc4282_round_rate,
+	.set_rate = ltc4282_set_rate,
+	.disable = ltc4282_disable,
+};
+
+static int ltc428_clk_provider_setup(struct ltc4282_state *st,
+				     struct device *dev)
+{
+	struct clk_init_data init;
+	int ret;
+
+	if (!IS_ENABLED(CONFIG_COMMON_CLK))
+		return 0;
+
+	init.name =  devm_kasprintf(dev, GFP_KERNEL, "%s-clk",
+				    fwnode_get_name(dev_fwnode(dev)));
+	if (!init.name)
+		return -ENOMEM;
+
+	init.ops = &ltc4282_ops;
+	init.flags = CLK_GET_RATE_NOCACHE;
+	st->clk_hw.init = &init;
+
+	ret = devm_clk_hw_register(dev, &st->clk_hw);
+	if (ret)
+		return ret;
+
+	return devm_of_clk_add_hw_provider(dev, of_clk_hw_simple_get,
+					   &st->clk_hw);
+}
+
+static int ltc428_clks_setup(struct ltc4282_state *st, struct device *dev)
+{
+	unsigned long rate;
+	struct clk *clkin;
+	u32 val;
+	int ret;
+
+	ret = ltc428_clk_provider_setup(st, dev);
+	if (ret)
+		return ret;
+
+	clkin = devm_clk_get_optional_enabled(dev, NULL);
+	if (IS_ERR(clkin))
+		return dev_err_probe(dev, PTR_ERR(clkin),
+				     "Failed to get clkin");
+	if (!clkin)
+		return 0;
+
+	rate = clk_get_rate(clkin);
+	if (!in_range(rate, LTC4282_CLKIN_MIN, LTC4282_CLKIN_RANGE))
+		return dev_err_probe(dev, -EINVAL,
+				     "Invalid clkin range(%lu) [%lu %lu]\n",
+				     rate, LTC4282_CLKIN_MIN,
+				     LTC4282_CLKIN_MAX);
+
+	/*
+	 * Clocks faster than 250KHZ should be reduced to 250KHZ. The clock
+	 * frequency is divided by twice the value in the register.
+	 */
+	val = rate / (2 * LTC4282_CLKIN_MIN);
+
+	return regmap_update_bits(st->map, LTC4282_CLK_DIV,
+				  LTC4282_CLK_DIV_MASK,
+				  FIELD_PREP(LTC4282_CLK_DIV_MASK, val));
+}
+
+static const int ltc4282_curr_lim_uv[] = {
+	12500, 15625, 18750, 21875, 25000, 28125, 31250, 34375
+};
+
+static int ltc4282_get_defaults(struct ltc4282_state *st, u32 *vin_mode)
+{
+	u32 reg_val, ilm_adjust;
+	int ret;
+
+	ret = regmap_read(st->map, LTC4282_ADC_CTRL, &reg_val);
+	if (ret)
+		return ret;
+
+	st->energy_en = !FIELD_GET(LTC4282_METER_HALT_MASK, reg_val);
+
+	ret = regmap_read(st->map, LTC4282_CTRL_MSB, &reg_val);
+	if (ret)
+		return ret;
+
+	*vin_mode = FIELD_GET(LTC4282_CTRL_VIN_MODE_MASK, reg_val);
+
+	ret = regmap_read(st->map, LTC4282_ILIM_ADJUST, &reg_val);
+	if (ret)
+		return ret;
+
+	ilm_adjust = FIELD_GET(LTC4282_ILIM_ADJUST_MASK, reg_val);
+	st->vsense_max = ltc4282_curr_lim_uv[ilm_adjust];
+
+	st->in0_1_cache[LTC4282_CHAN_VSOURCE].en = FIELD_GET(LTC4282_VDD_MONITOR_MASK,
+							     ilm_adjust);
+	if (!st->in0_1_cache[LTC4282_CHAN_VSOURCE].en) {
+		st->in0_1_cache[LTC4282_CHAN_VDD].en = true;
+		return regmap_read(st->map, LTC4282_VSOURCE_MAX,
+				   &st->in0_1_cache[LTC4282_CHAN_VSOURCE].in_max_raw);
+	}
+
+	return regmap_read(st->map, LTC4282_VSOURCE_MAX,
+			   &st->in0_1_cache[LTC4282_CHAN_VDD].in_max_raw);
+}
+
+/*
+ * Set max limits for ISENSE and Power as that depends on the max voltage on
+ * rsense that is defined in ILIM_ADJUST. This is specially important for power
+ * because for some rsense and vfsout values, if we allow the default raw 255
+ * value, that would overflow long in 32bit archs when reading back the max
+ * power limit.
+ *
+ * Also set meaningful historic values for VDD and VSOURCE
+ * (0 would not mean much).
+ */
+static int ltc4282_set_max_limits(struct ltc4282_state *st)
+{
+	int ret;
+
+	ret = ltc4282_write_voltage_byte(st, LTC4282_VSENSE_MAX, 40 * MILLI,
+					 st->vsense_max);
+	if (ret)
+		return ret;
+
+	/* Power is given by ISENSE * Vout. */
+	st->power_max = DIV_ROUND_CLOSEST(st->vsense_max * DECA * MILLI, st->rsense) * st->vfs_out;
+	ret = ltc4282_write_power_byte(st, LTC4282_POWER_MAX, st->power_max);
+	if (ret)
+		return ret;
+
+	if (st->in0_1_cache[LTC4282_CHAN_VDD].en) {
+		st->in0_1_cache[LTC4282_CHAN_VSOURCE].in_lowest = st->vfs_out;
+		return __ltc4282_in_write_history(st, LTC4282_VSOURCE_LOWEST,
+						  st->vdd, 0, st->vfs_out);
+	}
+
+	st->in0_1_cache[LTC4282_CHAN_VDD].in_lowest = st->vdd;
+	return __ltc4282_in_write_history(st, LTC4282_VSOURCE_LOWEST,
+					  st->vfs_out, 0, st->vfs_out);
+}
+
+static const char * const ltc4282_gpio1_modes[] = {
+	"power_bad", "power_good"
+};
+
+static const char * const ltc4282_gpio2_modes[] = {
+	"adc_input", "stress_fet"
+};
+
+static int ltc4282_gpio_setup(struct ltc4282_state *st, struct device *dev)
+{
+	const char *func = NULL;
+	int ret;
+
+	ret = device_property_read_string(dev, "adi,gpio1-mode", &func);
+	if (!ret) {
+		ret = match_string(ltc4282_gpio1_modes,
+				   ARRAY_SIZE(ltc4282_gpio1_modes), func);
+		if (ret < 0)
+			return dev_err_probe(dev, ret,
+					     "Invalid func(%s) for gpio1\n",
+					     func);
+
+		ret = regmap_update_bits(st->map, LTC4282_GPIO_CONFIG,
+					 LTC4282_GPIO_1_CONFIG_MASK,
+					 FIELD_PREP(LTC4282_GPIO_1_CONFIG_MASK, ret));
+		if (ret)
+			return ret;
+	}
+
+	ret = device_property_read_string(dev, "adi,gpio2-mode", &func);
+	if (!ret) {
+		ret = match_string(ltc4282_gpio2_modes,
+				   ARRAY_SIZE(ltc4282_gpio2_modes), func);
+		if (ret < 0)
+			return dev_err_probe(dev, ret,
+					     "Invalid func(%s) for gpio2\n",
+					     func);
+		if (!ret) {
+			/* setting the bit to 1 so the ADC to monitors GPIO2 */
+			ret = regmap_set_bits(st->map, LTC4282_ILIM_ADJUST,
+					      LTC4282_GPIO_MODE_MASK);
+		} else {
+			ret = regmap_update_bits(st->map, LTC4282_GPIO_CONFIG,
+						 LTC4282_GPIO_2_FET_STRESS_MASK,
+						 FIELD_PREP(LTC4282_GPIO_2_FET_STRESS_MASK, 1));
+		}
+
+		if (ret)
+			return ret;
+	}
+
+	if (!device_property_read_bool(dev, "adi,gpio3-monitor-enable"))
+		return 0;
+
+	if (func && !strcmp(func, "adc_input"))
+		return dev_err_probe(dev, -EINVAL,
+				     "Cannot have both gpio2 and gpio3 muxed into the ADC");
+
+	return regmap_clear_bits(st->map, LTC4282_ILIM_ADJUST,
+				 LTC4282_GPIO_MODE_MASK);
+}
+
+static const char * const ltc4282_dividers[] = {
+	"external", "vdd_5_percent", "vdd_10_percent", "vdd_15_percent"
+};
+
+/* This maps the Vout full scale for the given Vin mode */
+static const u16 ltc4282_vfs_milli[] = { 5540, 8320, 16640, 33280 };
+
+static const u16 ltc4282_vdd_milli[] = { 3300, 5000, 12000, 24000 };
+
+enum {
+	LTC4282_VIN_3_3V,
+	LTC4282_VIN_5V,
+	LTC4282_VIN_12V,
+	LTC4282_VIN_24V,
+};
+
+static int ltc4282_setup(struct ltc4282_state *st, struct device *dev)
+{
+	const char *divider;
+	u32 val, vin_mode;
+	int ret;
+
+	/* The part has an eeprom so let's get the needed defaults from it */
+	ret = ltc4282_get_defaults(st, &vin_mode);
+	if (ret)
+		return ret;
+
+	ret = device_property_read_u32(dev, "adi,rsense-nano-ohms",
+				       &st->rsense);
+	if (ret)
+		return dev_err_probe(dev, ret,
+				     "Failed to read adi,rsense-nano-ohms\n");
+	if (st->rsense < CENTI)
+		return dev_err_probe(dev, -EINVAL,
+				     "adi,rsense-nano-ohms too small (< %lu)\n",
+				     CENTI);
+
+	/*
+	 * The resolution for rsense is tenths of micro (eg: 62.5 uOhm) which
+	 * means we need nano in the bindings. However, to make things easier to
+	 * handle (with respect to overflows) we divide it by 100 as we don't
+	 * really need the last two digits.
+	 */
+	st->rsense /= CENTI;
+
+	val = vin_mode;
+	ret = device_property_read_u32(dev, "adi,vin-mode-microvolt", &val);
+	if (!ret) {
+		switch (val) {
+		case 3300000:
+			val = LTC4282_VIN_3_3V;
+			break;
+		case 5000000:
+			val = LTC4282_VIN_5V;
+			break;
+		case 12000000:
+			val = LTC4282_VIN_12V;
+			break;
+		case 24000000:
+			val = LTC4282_VIN_24V;
+			break;
+		default:
+			return dev_err_probe(dev, -EINVAL,
+					     "Invalid val(%u) for vin-mode-microvolt\n",
+					     val);
+		}
+
+		ret = regmap_update_bits(st->map, LTC4282_CTRL_MSB,
+					 LTC4282_CTRL_VIN_MODE_MASK,
+					 FIELD_PREP(LTC4282_CTRL_VIN_MODE_MASK, val));
+		if (ret)
+			return ret;
+
+		/* Foldback mode should also be set to the input voltage */
+		ret = regmap_update_bits(st->map, LTC4282_ILIM_ADJUST,
+					 LTC4282_FOLDBACK_MODE_MASK,
+					 FIELD_PREP(LTC4282_FOLDBACK_MODE_MASK, val));
+		if (ret)
+			return ret;
+	}
+
+	st->vfs_out = ltc4282_vfs_milli[val];
+	st->vdd = ltc4282_vdd_milli[val];
+
+	ret = device_property_read_u32(dev, "adi,current-limit-sense-microvolt",
+				       &st->vsense_max);
+	if (!ret) {
+		int reg_val;
+
+		switch (val) {
+		case 12500:
+			reg_val = 0;
+			break;
+		case 15625:
+			reg_val = 1;
+			break;
+		case 18750:
+			reg_val = 2;
+			break;
+		case 21875:
+			reg_val = 3;
+			break;
+		case 25000:
+			reg_val = 4;
+			break;
+		case 28125:
+			reg_val = 5;
+			break;
+		case 31250:
+			reg_val = 6;
+			break;
+		case 34375:
+			reg_val = 7;
+			break;
+		default:
+			return dev_err_probe(dev, -EINVAL,
+					     "Invalid val(%u) for adi,current-limit-microvolt\n",
+					     st->vsense_max);
+		}
+
+		ret = regmap_update_bits(st->map, LTC4282_ILIM_ADJUST,
+					 LTC4282_ILIM_ADJUST_MASK,
+					 FIELD_PREP(LTC4282_ILIM_ADJUST_MASK, reg_val));
+		if (ret)
+			return ret;
+	}
+
+	ret = ltc4282_set_max_limits(st);
+	if (ret)
+		return ret;
+
+	ret = device_property_read_string(dev, "adi,overvoltage-dividers",
+					  &divider);
+	if (!ret) {
+		int div = match_string(ltc4282_dividers,
+				       ARRAY_SIZE(ltc4282_dividers), divider);
+		if (div < 0)
+			return dev_err_probe(dev, -EINVAL,
+					     "Invalid val(%s) for adi,overvoltage-divider\n",
+					     divider);
+
+		ret = regmap_update_bits(st->map, LTC4282_CTRL_MSB,
+					 LTC4282_CTRL_OV_MODE_MASK,
+					 FIELD_PREP(LTC4282_CTRL_OV_MODE_MASK, div));
+	}
+
+	ret = device_property_read_string(dev, "adi,undervoltage-dividers",
+					  &divider);
+	if (!ret) {
+		int div = match_string(ltc4282_dividers,
+				       ARRAY_SIZE(ltc4282_dividers), divider);
+		if (div < 0)
+			return dev_err_probe(dev, -EINVAL,
+					     "Invalid val(%s) for adi,undervoltage-divider\n",
+					     divider);
+
+		ret = regmap_update_bits(st->map, LTC4282_CTRL_MSB,
+					 LTC4282_CTRL_UV_MODE_MASK,
+					 FIELD_PREP(LTC4282_CTRL_UV_MODE_MASK, div));
+	}
+
+	if (device_property_read_bool(dev, "adi,overcurrent-retry")) {
+		ret = regmap_set_bits(st->map, LTC4282_CTRL_LSB,
+				      LTC4282_CTRL_OC_RETRY_MASK);
+		if (ret)
+			return ret;
+	}
+
+	if (device_property_read_bool(dev, "adi,overvoltage-retry-disable")) {
+		ret = regmap_clear_bits(st->map, LTC4282_CTRL_LSB,
+					LTC4282_CTRL_OV_RETRY_MASK);
+		if (ret)
+			return ret;
+	}
+
+	if (device_property_read_bool(dev, "adi,undervoltage-retry-disable")) {
+		ret = regmap_clear_bits(st->map, LTC4282_CTRL_LSB,
+					LTC4282_CTRL_UV_RETRY_MASK);
+		if (ret)
+			return ret;
+	}
+
+	if (device_property_read_bool(dev, "adi,fault-log-enable")) {
+		ret = regmap_set_bits(st->map, LTC4282_ADC_CTRL,
+				      LTC4282_FAULT_LOG_EN_MASK);
+		if (ret)
+			return ret;
+	}
+
+	if (device_property_read_bool(dev, "adi,fault-log-enable")) {
+		ret = regmap_set_bits(st->map, LTC4282_ADC_CTRL, LTC4282_FAULT_LOG_EN_MASK);
+		if (ret)
+			return ret;
+	}
+
+	ret = device_property_read_u32(dev, "adi,fet-bad-timeout-ms", &val);
+	if (!ret) {
+		if (val > LTC4282_FET_BAD_MAX_TIMEOUT)
+			return dev_err_probe(dev, -EINVAL,
+					     "Invalid value(%u) for adi,fet-bad-timeout-ms",
+					     val);
+
+		ret = regmap_write(st->map, LTC4282_FET_BAD_FAULT_TIMEOUT, val);
+		if (ret)
+			return ret;
+	}
+
+	return ltc4282_gpio_setup(st, dev);
+}
+
+static bool ltc4282_readable_reg(struct device *dev, unsigned int reg)
+{
+	if (reg == LTC4282_RESERVED_1 || reg == LTC4282_RESERVED_2)
+		return false;
+
+	return true;
+}
+
+static bool ltc4282_writable_reg(struct device *dev, unsigned int reg)
+{
+	if (reg == LTC4282_STATUS_LSB || reg == LTC4282_STATUS_MSB)
+		return false;
+	if (reg == LTC4282_RESERVED_1 || reg == LTC4282_RESERVED_2)
+		return false;
+
+	return true;
+}
+
+static const struct regmap_config ltc4282_regmap_config = {
+	.reg_bits = 8,
+	.val_bits = 8,
+	.max_register = LTC4282_RESERVED_3,
+	.readable_reg = ltc4282_readable_reg,
+	.writeable_reg = ltc4282_writable_reg,
+};
+
+static const struct hwmon_channel_info * const ltc4282_info[] = {
+	HWMON_CHANNEL_INFO(in,
+			   HWMON_I_INPUT | HWMON_I_LOWEST | HWMON_I_HIGHEST |
+			   HWMON_I_MAX | HWMON_I_MIN | HWMON_I_MIN_ALARM |
+			   HWMON_I_MAX_ALARM | HWMON_I_ENABLE |
+			   HWMON_I_RESET_HISTORY | HWMON_I_FAULT |
+			   HWMON_I_LABEL,
+			   HWMON_I_INPUT | HWMON_I_LOWEST | HWMON_I_HIGHEST |
+			   HWMON_I_MAX | HWMON_I_MIN | HWMON_I_MIN_ALARM |
+			   HWMON_I_MAX_ALARM | HWMON_I_LCRIT_ALARM |
+			   HWMON_I_CRIT_ALARM | HWMON_I_ENABLE |
+			   HWMON_I_RESET_HISTORY | HWMON_I_LABEL,
+			   HWMON_I_INPUT | HWMON_I_LOWEST | HWMON_I_HIGHEST |
+			   HWMON_I_MAX | HWMON_I_MIN | HWMON_I_MIN_ALARM |
+			   HWMON_I_RESET_HISTORY | HWMON_I_MAX_ALARM |
+			   HWMON_I_LABEL),
+	HWMON_CHANNEL_INFO(curr,
+			   HWMON_C_INPUT | HWMON_C_LOWEST | HWMON_C_HIGHEST |
+			   HWMON_C_MAX | HWMON_C_MIN | HWMON_C_MIN_ALARM |
+			   HWMON_C_MAX_ALARM | HWMON_C_CRIT_ALARM |
+			   HWMON_C_RESET_HISTORY | HWMON_C_LABEL),
+	HWMON_CHANNEL_INFO(power,
+			   HWMON_P_INPUT | HWMON_P_INPUT_LOWEST |
+			   HWMON_P_INPUT_HIGHEST | HWMON_P_MAX | HWMON_P_MIN |
+			   HWMON_P_MAX_ALARM | HWMON_P_MIN_ALARM |
+			   HWMON_P_RESET_HISTORY | HWMON_P_LABEL),
+	HWMON_CHANNEL_INFO(energy,
+			   HWMON_E_ENABLE),
+	NULL
+};
+
+static const struct hwmon_ops ltc4282_hwmon_ops = {
+	.read = ltc4282_read,
+	.write = ltc4282_write,
+	.is_visible = ltc4282_is_visible,
+	.read_string = ltc4282_read_labels,
+};
+
+static const struct hwmon_chip_info ltc2947_chip_info = {
+	.ops = &ltc4282_hwmon_ops,
+	.info = ltc4282_info,
+};
+
+/* energy attributes are 6bytes wide so we need u64 */
+static SENSOR_DEVICE_ATTR_RO(energy1_input, ltc4282_energy, 0);
+
+static struct attribute *ltc4282_attrs[] = {
+	&sensor_dev_attr_energy1_input.dev_attr.attr,
+	NULL
+};
+ATTRIBUTE_GROUPS(ltc4282);
+
+static int ltc4282_show_fault_log(void *arg, u64 *val, u32 mask)
+{
+	struct ltc4282_state *st = arg;
+	long alarm;
+	int ret;
+
+	ret = ltc4282_read_alarm(st, LTC4282_FAULT_LOG,	mask, &alarm);
+	if (ret)
+		return ret;
+
+	*val = alarm;
+
+	return 0;
+}
+
+static int ltc4282_show_curr1_crit_fault_log(void *arg, u64 *val)
+{
+	return ltc4282_show_fault_log(arg, val, LTC4282_OC_FAULT_MASK);
+}
+DEFINE_DEBUGFS_ATTRIBUTE(ltc4282_curr1_crit_fault_log,
+			 ltc4282_show_curr1_crit_fault_log, NULL, "%llu\n");
+
+static int ltc4282_show_in1_lcrit_fault_log(void *arg, u64 *val)
+{
+	return ltc4282_show_fault_log(arg, val, LTC4282_UV_FAULT_MASK);
+}
+DEFINE_DEBUGFS_ATTRIBUTE(ltc4282_in1_lcrit_fault_log,
+			 ltc4282_show_in1_lcrit_fault_log, NULL, "%llu\n");
+
+static int ltc4282_show_in1_crit_fault_log(void *arg, u64 *val)
+{
+	return ltc4282_show_fault_log(arg, val, LTC4282_OV_FAULT_MASK);
+}
+DEFINE_DEBUGFS_ATTRIBUTE(ltc4282_in1_crit_fault_log,
+			 ltc4282_show_in1_crit_fault_log, NULL, "%llu\n");
+
+static int ltc4282_show_fet_bad_fault_log(void *arg, u64 *val)
+{
+	return ltc4282_show_fault_log(arg, val, LTC4282_FET_BAD_FAULT_MASK);
+}
+DEFINE_DEBUGFS_ATTRIBUTE(ltc4282_fet_bad_fault_log,
+			 ltc4282_show_fet_bad_fault_log, NULL, "%llu\n");
+
+static int ltc4282_show_fet_short_fault_log(void *arg, u64 *val)
+{
+	return ltc4282_show_fault_log(arg, val, LTC4282_FET_SHORT_FAULT_MASK);
+}
+DEFINE_DEBUGFS_ATTRIBUTE(ltc4282_fet_short_fault_log,
+			 ltc4282_show_fet_short_fault_log, NULL, "%llu\n");
+
+static int ltc4282_show_power1_bad_fault_log(void *arg, u64 *val)
+{
+	return ltc4282_show_fault_log(arg, val, LTC4282_POWER_BAD_FAULT_MASK);
+}
+DEFINE_DEBUGFS_ATTRIBUTE(ltc4282_power1_bad_fault_log,
+			 ltc4282_show_power1_bad_fault_log, NULL, "%llu\n");
+
+static void ltc4282_debugfs_remove(void *dir)
+{
+	debugfs_remove_recursive(dir);
+}
+
+static void ltc4282_debugfs_init(struct ltc4282_state *st,
+				 struct i2c_client *i2c,
+				 const struct device *hwmon)
+{
+	const char *debugfs_name;
+	struct dentry *dentry;
+	int ret;
+
+	if (!IS_ENABLED(CONFIG_DEBUG_FS))
+		return;
+
+	debugfs_name = devm_kasprintf(&i2c->dev, GFP_KERNEL, "ltc4282-%s",
+				      dev_name(hwmon));
+	if (!debugfs_name)
+		return;
+
+	dentry = debugfs_create_dir(debugfs_name, NULL);
+	if (IS_ERR(dentry))
+		return;
+
+	ret = devm_add_action_or_reset(&i2c->dev, ltc4282_debugfs_remove,
+				       dentry);
+	if (ret)
+		return;
+
+	debugfs_create_file_unsafe("power1_bad_fault_log", 0400, dentry, st,
+				   &ltc4282_power1_bad_fault_log);
+	debugfs_create_file_unsafe("in0_fet_short_fault_log", 0400, dentry, st,
+				   &ltc4282_fet_short_fault_log);
+	debugfs_create_file_unsafe("in0_fet_bad_fault_log", 0400, dentry, st,
+				   &ltc4282_fet_bad_fault_log);
+	debugfs_create_file_unsafe("in1_crit_fault_log", 0400, dentry, st,
+				   &ltc4282_in1_crit_fault_log);
+	debugfs_create_file_unsafe("in1_lcrit_fault_log", 0400, dentry, st,
+				   &ltc4282_in1_lcrit_fault_log);
+	debugfs_create_file_unsafe("curr1_crit_fault_log", 0400, dentry, st,
+				   &ltc4282_curr1_crit_fault_log);
+}
+
+static int ltc4282_probe(struct i2c_client *i2c)
+{
+	struct device *dev = &i2c->dev, *hwmon;
+	struct ltc4282_state *st;
+	int ret;
+
+	st = devm_kzalloc(dev, sizeof(*st), GFP_KERNEL);
+	if (!st)
+		return dev_err_probe(dev, -ENOMEM,
+				     "Failed to allocate memory\n");
+
+	st->map = devm_regmap_init_i2c(i2c, &ltc4282_regmap_config);
+	if (IS_ERR(st->map))
+		return dev_err_probe(dev, PTR_ERR(st->map),
+				     "failed regmap init\n");
+
+	/* Soft reset */
+	ret = regmap_set_bits(st->map, LTC4282_ADC_CTRL, LTC4282_RESET_MASK);
+	if (ret)
+		return ret;
+
+	/* Yes, it's big but it is as specified in the datasheet */
+	msleep(3200);
+
+	ret = ltc428_clks_setup(st, dev);
+	if (ret)
+		return ret;
+
+	ret = ltc4282_setup(st, dev);
+	if (ret)
+		return ret;
+
+	mutex_init(&st->lock);
+	hwmon = devm_hwmon_device_register_with_info(dev, "ltc4282", st,
+						     &ltc2947_chip_info,
+						     ltc4282_groups);
+	if (IS_ERR(hwmon))
+		return PTR_ERR(hwmon);
+
+	ltc4282_debugfs_init(st, i2c, hwmon);
+
+	return 0;
+}
+
+static const struct of_device_id ltc4282_of_match[] = {
+	{ .compatible = "adi,ltc4282" },
+	{}
+};
+MODULE_DEVICE_TABLE(of, ltc4282_of_match);
+
+static struct i2c_driver ltc4282_driver = {
+	.driver = {
+		.name = "ltc4282",
+		.of_match_table = ltc4282_of_match,
+	},
+	.probe = ltc4282_probe,
+};
+module_i2c_driver(ltc4282_driver);
+
+MODULE_AUTHOR("Nuno Sa <nuno.sa@analog.com>");
+MODULE_DESCRIPTION("LTC4282 I2C High Current Hot Swap Controller");
+MODULE_LICENSE("GPL");

From 7b7cd4c30033a19ea5dfaae7bfe92fe93d749ee1 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 10 Jan 2024 10:21:08 +0100
Subject: [PATCH 388/707] mm: add a mapping_clear_large_folios helper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "disable large folios for shmem file used by xfs xfile".

Darrick reported that the fairly new XFS xfile code blows up when force
enabling large folio for shmem.  This series fixes this quickly by
disabling large folios for this particular shmem file for now until it can
be fixed properly, which will be a lot more invasive.


This patch (of 2):

Users of shmem_kernel_file_setup might not be able to deal with large
folios (yet).  Give them a way to disable large folio support on their
mapping.

Link: https://lkml.kernel.org/r/20240110092109.1950011-1-hch@lst.de
Link: https://lkml.kernel.org/r/20240110092109.1950011-2-hch@lst.de
Fixes: 3934e8ebb7cc ("xfs: create a big array data structure")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: Chandan Babu R <chandan.babu@oracle.com>
Cc: Christian König <christian.koenig@amd.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Dave Airlie <airlied@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jarkko Sakkinen <jarkko@kernel.org>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Maxime Ripard <mripard@kernel.org>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Thomas Zimmermann <tzimmermann@suse.de>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pagemap.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 2df35e65557d27..431b12a23299d8 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -360,6 +360,20 @@ static inline void mapping_set_large_folios(struct address_space *mapping)
 	__set_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags);
 }
 
+/**
+ * mapping_clear_large_folios() - Disable large folio support for a mapping
+ * @mapping: The mapping.
+ *
+ * This can be called to undo the effect of mapping_set_large_folios().
+ *
+ * Context: This should not be called while the inode is active as it
+ * is non-atomic.
+ */
+static inline void mapping_clear_large_folios(struct address_space *mapping)
+{
+	__clear_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags);
+}
+
 /*
  * Large folio support currently depends on THP.  These dependencies are
  * being worked on but are not yet fixed.

From 21eccbf53359ab64b3808d819f00f2f34dcc5dfc Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 10 Jan 2024 10:21:09 +0100
Subject: [PATCH 389/707] xfs: disable large folio support in xfile_create
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The xfarray code will crash if large folios are force enabled using:

   echo force > /sys/kernel/mm/transparent_hugepage/shmem_enabled

Fixing this will require a bit of an API change, and prefeably sorting out
the hwpoison story for pages vs folio and where it is placed in the shmem
API.  For now use this one liner to disable large folios.

Link: https://lkml.kernel.org/r/20240110092109.1950011-3-hch@lst.de
Fixes: 3934e8ebb7cc ("xfs: create a big array data structure")
Reported-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Cc: Chandan Babu R <chandan.babu@oracle.com>
Cc: Christian König <christian.koenig@amd.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Dave Airlie <airlied@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jarkko Sakkinen <jarkko@kernel.org>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Maxime Ripard <mripard@kernel.org>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Thomas Zimmermann <tzimmermann@suse.de>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/xfs/scrub/xfile.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c
index 090c3ead43fdf1..1a8d1bedd0b0dc 100644
--- a/fs/xfs/scrub/xfile.c
+++ b/fs/xfs/scrub/xfile.c
@@ -94,6 +94,11 @@ xfile_create(
 
 	lockdep_set_class(&inode->i_rwsem, &xfile_i_mutex_key);
 
+	/*
+	 * We're not quite ready for large folios yet.
+	 */
+	mapping_clear_large_folios(inode->i_mapping);
+
 	trace_xfile_create(xf);
 
 	*xfilep = xf;

From 252ab4b7f958ad64d75661280b40529c1f18bc43 Mon Sep 17 00:00:00 2001
From: Prakash Sangappa <prakash.sangappa@oracle.com>
Date: Tue, 23 Jan 2024 12:04:42 -0800
Subject: [PATCH 390/707] mm: hugetlb pages should not be reserved by shmat()
 if SHM_NORESERVE

For shared memory of type SHM_HUGETLB, hugetlb pages are reserved in
shmget() call.  If SHM_NORESERVE flags is specified then the hugetlb pages
are not reserved.  However when the shared memory is attached with the
shmat() call the hugetlb pages are getting reserved incorrectly for
SHM_HUGETLB shared memory created with SHM_NORESERVE which is a bug.

-------------------------------
Following test shows the issue.

$cat shmhtb.c

int main()
{
	int shmflags = 0660 | IPC_CREAT | SHM_HUGETLB | SHM_NORESERVE;
	int shmid;

	shmid = shmget(SKEY, SHMSZ, shmflags);
	if (shmid < 0)
	{
		printf("shmat: shmget() failed, %d\n", errno);
		return 1;
	}
	printf("After shmget()\n");
	system("cat /proc/meminfo | grep -i hugepages_");

	shmat(shmid, NULL, 0);
	printf("\nAfter shmat()\n");
	system("cat /proc/meminfo | grep -i hugepages_");

	shmctl(shmid, IPC_RMID, NULL);
	return 0;
}

 #sysctl -w vm.nr_hugepages=20
 #./shmhtb

After shmget()
HugePages_Total:      20
HugePages_Free:       20
HugePages_Rsvd:        0
HugePages_Surp:        0

After shmat()
HugePages_Total:      20
HugePages_Free:       20
HugePages_Rsvd:        5 <--
HugePages_Surp:        0
--------------------------------

Fix is to ensure that hugetlb pages are not reserved for SHM_HUGETLB shared
memory in the shmat() call.

Link: https://lkml.kernel.org/r/1706040282-12388-1-git-send-email-prakash.sangappa@oracle.com
Signed-off-by: Prakash Sangappa <prakash.sangappa@oracle.com>
Acked-by: Muchun Song <muchun.song@linux.dev>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/hugetlbfs/inode.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 671664fed3077f..ee13c2ca8ad293 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -100,6 +100,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	loff_t len, vma_len;
 	int ret;
 	struct hstate *h = hstate_file(file);
+	vm_flags_t vm_flags;
 
 	/*
 	 * vma address alignment (but not the pgoff alignment) has
@@ -141,10 +142,20 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	file_accessed(file);
 
 	ret = -ENOMEM;
+
+	vm_flags = vma->vm_flags;
+	/*
+	 * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip
+	 * reserving here. Note: only for SHM hugetlbfs file, the inode
+	 * flag S_PRIVATE is set.
+	 */
+	if (inode->i_flags & S_PRIVATE)
+		vm_flags |= VM_NORESERVE;
+
 	if (!hugetlb_reserve_pages(inode,
 				vma->vm_pgoff >> huge_page_order(h),
 				len >> huge_page_shift(h), vma,
-				vma->vm_flags))
+				vm_flags))
 		goto out;
 
 	ret = 0;

From b0f130279ae18085fa6d156c4316fb2a4c1a6d87 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 22 Jan 2024 16:50:50 +0100
Subject: [PATCH 391/707] getrusage: move thread_group_cputime_adjusted()
 outside of lock_task_sighand()

Patch series "getrusage: use sig->stats_lock", v2.


This patch (of 2):

thread_group_cputime() does its own locking, we can safely shift
thread_group_cputime_adjusted() which does another for_each_thread loop
outside of ->siglock protected section.

This is also preparation for the next patch which changes getrusage() to
use stats_lock instead of siglock, thread_group_cputime() takes the same
lock.  With the current implementation recursive read_seqbegin_or_lock()
is fine, thread_group_cputime() can't enter the slow mode if the caller
holds stats_lock, yet this looks more safe and better performance-wise.

Link: https://lkml.kernel.org/r/20240122155023.GA26169@redhat.com
Link: https://lkml.kernel.org/r/20240122155050.GA26205@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reported-by: Dylan Hatch <dylanbhatch@google.com>
Tested-by: Dylan Hatch <dylanbhatch@google.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/sys.c | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/kernel/sys.c b/kernel/sys.c
index e219fcfa112d86..70ad06ad852e59 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1785,17 +1785,19 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
 	struct task_struct *t;
 	unsigned long flags;
 	u64 tgutime, tgstime, utime, stime;
-	unsigned long maxrss = 0;
+	unsigned long maxrss;
+	struct mm_struct *mm;
 	struct signal_struct *sig = p->signal;
 
-	memset((char *)r, 0, sizeof (*r));
+	memset(r, 0, sizeof(*r));
 	utime = stime = 0;
+	maxrss = 0;
 
 	if (who == RUSAGE_THREAD) {
 		task_cputime_adjusted(current, &utime, &stime);
 		accumulate_thread_rusage(p, r);
 		maxrss = sig->maxrss;
-		goto out;
+		goto out_thread;
 	}
 
 	if (!lock_task_sighand(p, &flags))
@@ -1819,9 +1821,6 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
 		fallthrough;
 
 	case RUSAGE_SELF:
-		thread_group_cputime_adjusted(p, &tgutime, &tgstime);
-		utime += tgutime;
-		stime += tgstime;
 		r->ru_nvcsw += sig->nvcsw;
 		r->ru_nivcsw += sig->nivcsw;
 		r->ru_minflt += sig->min_flt;
@@ -1839,19 +1838,24 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
 	}
 	unlock_task_sighand(p, &flags);
 
-out:
-	r->ru_utime = ns_to_kernel_old_timeval(utime);
-	r->ru_stime = ns_to_kernel_old_timeval(stime);
+	if (who == RUSAGE_CHILDREN)
+		goto out_children;
 
-	if (who != RUSAGE_CHILDREN) {
-		struct mm_struct *mm = get_task_mm(p);
+	thread_group_cputime_adjusted(p, &tgutime, &tgstime);
+	utime += tgutime;
+	stime += tgstime;
 
-		if (mm) {
-			setmax_mm_hiwater_rss(&maxrss, mm);
-			mmput(mm);
-		}
+out_thread:
+	mm = get_task_mm(p);
+	if (mm) {
+		setmax_mm_hiwater_rss(&maxrss, mm);
+		mmput(mm);
 	}
+
+out_children:
 	r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */
+	r->ru_utime = ns_to_kernel_old_timeval(utime);
+	r->ru_stime = ns_to_kernel_old_timeval(stime);
 }
 
 SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)

From f51687c6deb99c0ba0b8457b8be06312425bdd91 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 22 Jan 2024 16:50:53 +0100
Subject: [PATCH 392/707] getrusage: use sig->stats_lock rather than
 lock_task_sighand()

lock_task_sighand() can trigger a hard lockup. If NR_CPUS threads call
getrusage() at the same time and the process has NR_THREADS, spin_lock_irq
will spin with irqs disabled O(NR_CPUS * NR_THREADS) time.

Change getrusage() to use sig->stats_lock, it was specifically designed
for this type of use. This way it runs lockless in the likely case.

TODO:
	- Change do_task_stat() to use sig->stats_lock too, then we can
	  remove spin_lock_irq(siglock) in wait_task_zombie().

	- Turn sig->stats_lock into seqcount_rwlock_t, this way the
	  readers in the slow mode won't exclude each other. See
	  https://lore.kernel.org/all/20230913154907.GA26210@redhat.com/

	- stats_lock has to disable irqs because ->siglock can be taken
	  in irq context, it would be very nice to change __exit_signal()
	  to avoid the siglock->stats_lock dependency.

Link: https://lkml.kernel.org/r/20240122155053.GA26214@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reported-by: Dylan Hatch <dylanbhatch@google.com>
Tested-by: Dylan Hatch <dylanbhatch@google.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/sys.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/kernel/sys.c b/kernel/sys.c
index 70ad06ad852e59..f8e543f1e38a06 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1788,7 +1788,9 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
 	unsigned long maxrss;
 	struct mm_struct *mm;
 	struct signal_struct *sig = p->signal;
+	unsigned int seq = 0;
 
+retry:
 	memset(r, 0, sizeof(*r));
 	utime = stime = 0;
 	maxrss = 0;
@@ -1800,8 +1802,7 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
 		goto out_thread;
 	}
 
-	if (!lock_task_sighand(p, &flags))
-		return;
+	flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
 
 	switch (who) {
 	case RUSAGE_BOTH:
@@ -1829,14 +1830,23 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
 		r->ru_oublock += sig->oublock;
 		if (maxrss < sig->maxrss)
 			maxrss = sig->maxrss;
+
+		rcu_read_lock();
 		__for_each_thread(sig, t)
 			accumulate_thread_rusage(t, r);
+		rcu_read_unlock();
+
 		break;
 
 	default:
 		BUG();
 	}
-	unlock_task_sighand(p, &flags);
+
+	if (need_seqretry(&sig->stats_lock, seq)) {
+		seq = 1;
+		goto retry;
+	}
+	done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
 
 	if (who == RUSAGE_CHILDREN)
 		goto out_children;

From 41d1a01ebc166ba59fe5ec3e4eb32271f0267b31 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 23 Jan 2024 16:33:55 +0100
Subject: [PATCH 393/707] fs/proc: do_task_stat: move
 thread_group_cputime_adjusted() outside of lock_task_sighand()

Patch series "fs/proc: do_task_stat: use sig->stats_".

do_task_stat() has the same problem as getrusage() had before "getrusage:
use sig->stats_lock rather than lock_task_sighand()": a hard lockup.  If
NR_CPUS threads call lock_task_sighand() at the same time and the process
has NR_THREADS, spin_lock_irq will spin with irqs disabled O(NR_CPUS *
NR_THREADS) time.


This patch (of 3):

thread_group_cputime() does its own locking, we can safely shift
thread_group_cputime_adjusted() which does another for_each_thread loop
outside of ->siglock protected section.

Not only this removes for_each_thread() from the critical section with
irqs disabled, this removes another case when stats_lock is taken with
siglock held.  We want to remove this dependency, then we can change the
users of stats_lock to not disable irqs.

Link: https://lkml.kernel.org/r/20240123153313.GA21832@redhat.com
Link: https://lkml.kernel.org/r/20240123153355.GA21854@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Dylan Hatch <dylanbhatch@google.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/array.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index ff08a8957552ad..45ba9186380843 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -511,7 +511,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 
 	sigemptyset(&sigign);
 	sigemptyset(&sigcatch);
-	cutime = cstime = utime = stime = 0;
+	cutime = cstime = 0;
 	cgtime = gtime = 0;
 
 	if (lock_task_sighand(task, &flags)) {
@@ -546,7 +546,6 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 
 			min_flt += sig->min_flt;
 			maj_flt += sig->maj_flt;
-			thread_group_cputime_adjusted(task, &utime, &stime);
 			gtime += sig->gtime;
 
 			if (sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_STOP_STOPPED))
@@ -562,10 +561,13 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 
 	if (permitted && (!whole || num_threads < 2))
 		wchan = !task_is_running(task);
-	if (!whole) {
+
+	if (whole) {
+		thread_group_cputime_adjusted(task, &utime, &stime);
+	} else {
+		task_cputime_adjusted(task, &utime, &stime);
 		min_flt = task->min_flt;
 		maj_flt = task->maj_flt;
-		task_cputime_adjusted(task, &utime, &stime);
 		gtime = task_gtime(task);
 	}
 

From 48596ced7a2f19e820304fda65e1d819544fc256 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 23 Jan 2024 16:33:57 +0100
Subject: [PATCH 394/707] fs/proc: do_task_stat: use sig->stats_lock to gather
 the threads/children stats

lock_task_sighand() can trigger a hard lockup.  If NR_CPUS threads call
do_task_stat() at the same time and the process has NR_THREADS, it will
spin with irqs disabled O(NR_CPUS * NR_THREADS) time.

Change do_task_stat() to use sig->stats_lock to gather the statistics
outside of ->siglock protected section, in the likely case this code will
run lockless.

Link: https://lkml.kernel.org/r/20240123153357.GA21857@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Dylan Hatch <dylanbhatch@google.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/array.c | 58 +++++++++++++++++++++++++++----------------------
 1 file changed, 32 insertions(+), 26 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 45ba9186380843..34a47fb0c57f25 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -477,13 +477,13 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	int permitted;
 	struct mm_struct *mm;
 	unsigned long long start_time;
-	unsigned long cmin_flt = 0, cmaj_flt = 0;
-	unsigned long  min_flt = 0,  maj_flt = 0;
-	u64 cutime, cstime, utime, stime;
-	u64 cgtime, gtime;
+	unsigned long cmin_flt, cmaj_flt, min_flt, maj_flt;
+	u64 cutime, cstime, cgtime, utime, stime, gtime;
 	unsigned long rsslim = 0;
 	unsigned long flags;
 	int exit_code = task->exit_code;
+	struct signal_struct *sig = task->signal;
+	unsigned int seq = 1;
 
 	state = *get_task_state(task);
 	vsize = eip = esp = 0;
@@ -511,12 +511,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 
 	sigemptyset(&sigign);
 	sigemptyset(&sigcatch);
-	cutime = cstime = 0;
-	cgtime = gtime = 0;
 
 	if (lock_task_sighand(task, &flags)) {
-		struct signal_struct *sig = task->signal;
-
 		if (sig->tty) {
 			struct pid *pgrp = tty_get_pgrp(sig->tty);
 			tty_pgrp = pid_nr_ns(pgrp, ns);
@@ -527,27 +523,9 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 		num_threads = get_nr_threads(task);
 		collect_sigign_sigcatch(task, &sigign, &sigcatch);
 
-		cmin_flt = sig->cmin_flt;
-		cmaj_flt = sig->cmaj_flt;
-		cutime = sig->cutime;
-		cstime = sig->cstime;
-		cgtime = sig->cgtime;
 		rsslim = READ_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur);
 
-		/* add up live thread stats at the group level */
 		if (whole) {
-			struct task_struct *t;
-
-			__for_each_thread(sig, t) {
-				min_flt += t->min_flt;
-				maj_flt += t->maj_flt;
-				gtime += task_gtime(t);
-			}
-
-			min_flt += sig->min_flt;
-			maj_flt += sig->maj_flt;
-			gtime += sig->gtime;
-
 			if (sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_STOP_STOPPED))
 				exit_code = sig->group_exit_code;
 		}
@@ -562,6 +540,34 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	if (permitted && (!whole || num_threads < 2))
 		wchan = !task_is_running(task);
 
+	do {
+		seq++; /* 2 on the 1st/lockless path, otherwise odd */
+		flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
+
+		cmin_flt = sig->cmin_flt;
+		cmaj_flt = sig->cmaj_flt;
+		cutime = sig->cutime;
+		cstime = sig->cstime;
+		cgtime = sig->cgtime;
+
+		if (whole) {
+			struct task_struct *t;
+
+			min_flt = sig->min_flt;
+			maj_flt = sig->maj_flt;
+			gtime = sig->gtime;
+
+			rcu_read_lock();
+			__for_each_thread(sig, t) {
+				min_flt += t->min_flt;
+				maj_flt += t->maj_flt;
+				gtime += task_gtime(t);
+			}
+			rcu_read_unlock();
+		}
+	} while (need_seqretry(&sig->stats_lock, seq));
+	done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
+
 	if (whole) {
 		thread_group_cputime_adjusted(task, &utime, &stime);
 	} else {

From b608b1d1b11df82d2bb1ab7a071e3b213a5b0f9c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 23 Jan 2024 16:34:00 +0100
Subject: [PATCH 395/707] exit: wait_task_zombie: kill the no longer necessary
 spin_lock_irq(siglock)

After the recent changes nobody use siglock to read the values protected
by stats_lock, we can kill spin_lock_irq(&current->sighand->siglock) and
update the comment.

With this patch only __exit_signal() and thread_group_start_cputime() take
stats_lock under siglock.

Link: https://lkml.kernel.org/r/20240123153359.GA21866@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Dylan Hatch <dylanbhatch@google.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/exit.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index 3988a02efaef06..dfb963d2f862ad 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1127,17 +1127,14 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 		 * and nobody can change them.
 		 *
 		 * psig->stats_lock also protects us from our sub-threads
-		 * which can reap other children at the same time. Until
-		 * we change k_getrusage()-like users to rely on this lock
-		 * we have to take ->siglock as well.
+		 * which can reap other children at the same time.
 		 *
 		 * We use thread_group_cputime_adjusted() to get times for
 		 * the thread group, which consolidates times for all threads
 		 * in the group including the group leader.
 		 */
 		thread_group_cputime_adjusted(p, &tgutime, &tgstime);
-		spin_lock_irq(&current->sighand->siglock);
-		write_seqlock(&psig->stats_lock);
+		write_seqlock_irq(&psig->stats_lock);
 		psig->cutime += tgutime + sig->cutime;
 		psig->cstime += tgstime + sig->cstime;
 		psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
@@ -1160,8 +1157,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 			psig->cmaxrss = maxrss;
 		task_io_accounting_add(&psig->ioac, &p->ioac);
 		task_io_accounting_add(&psig->ioac, &sig->ioac);
-		write_sequnlock(&psig->stats_lock);
-		spin_unlock_irq(&current->sighand->siglock);
+		write_sequnlock_irq(&psig->stats_lock);
 	}
 
 	if (wo->wo_rusage)

From d5e91629e5c426a872a356bdbbdd29a50d91afef Mon Sep 17 00:00:00 2001
From: Ryan Roberts <ryan.roberts@arm.com>
Date: Tue, 23 Jan 2024 14:17:55 +0000
Subject: [PATCH 396/707] mm/userfaultfd: UFFDIO_MOVE implementation should use
 ptep_get()

Commit c33c794828f2 ("mm: ptep_get() conversion") converted all (non-arch)
call sites to use ptep_get() instead of doing a direct dereference of the
pte.  Full rationale can be found in that commit's log.

Since then, UFFDIO_MOVE has been implemented which does 7 direct pte
dereferences.  Let's fix those up to use ptep_get().

I've asserted in the past that there is no reliable automated mechanism to
catch these; I'm relying on a combination of Coccinelle (which throws up a
lot of false positives) and some compiler magic to force a compiler error
on dereference.  But given the frequency with which new issues are coming
up, I'll add it to my todo list to try to find an automated solution.

Link: https://lkml.kernel.org/r/20240123141755.3836179-1-ryan.roberts@arm.com
Fixes: adef440691ba ("userfaultfd: UFFDIO_MOVE uABI")
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/userfaultfd.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 75fcf1f783bc56..7cf7d43842590c 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -902,8 +902,8 @@ static int move_present_pte(struct mm_struct *mm,
 
 	double_pt_lock(dst_ptl, src_ptl);
 
-	if (!pte_same(*src_pte, orig_src_pte) ||
-	    !pte_same(*dst_pte, orig_dst_pte)) {
+	if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
+	    !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
 		err = -EAGAIN;
 		goto out;
 	}
@@ -946,8 +946,8 @@ static int move_swap_pte(struct mm_struct *mm,
 
 	double_pt_lock(dst_ptl, src_ptl);
 
-	if (!pte_same(*src_pte, orig_src_pte) ||
-	    !pte_same(*dst_pte, orig_dst_pte)) {
+	if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
+	    !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
 		double_pt_unlock(dst_ptl, src_ptl);
 		return -EAGAIN;
 	}
@@ -1016,7 +1016,7 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
 	}
 
 	spin_lock(dst_ptl);
-	orig_dst_pte = *dst_pte;
+	orig_dst_pte = ptep_get(dst_pte);
 	spin_unlock(dst_ptl);
 	if (!pte_none(orig_dst_pte)) {
 		err = -EEXIST;
@@ -1024,7 +1024,7 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
 	}
 
 	spin_lock(src_ptl);
-	orig_src_pte = *src_pte;
+	orig_src_pte = ptep_get(src_pte);
 	spin_unlock(src_ptl);
 	if (pte_none(orig_src_pte)) {
 		if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES))
@@ -1054,7 +1054,7 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
 			 * page isn't freed under us
 			 */
 			spin_lock(src_ptl);
-			if (!pte_same(orig_src_pte, *src_pte)) {
+			if (!pte_same(orig_src_pte, ptep_get(src_pte))) {
 				spin_unlock(src_ptl);
 				err = -EAGAIN;
 				goto out;

From 95bfb99de89e0636c52a00472352f7f8e93cfa41 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Date: Wed, 24 Jan 2024 21:19:36 +0900
Subject: [PATCH 397/707] nilfs2: fix data corruption in dsync block recovery
 for small block sizes

The helper function nilfs_recovery_copy_block() of
nilfs_recovery_dsync_blocks(), which recovers data from logs created by
data sync writes during a mount after an unclean shutdown, incorrectly
calculates the on-page offset when copying repair data to the file's page
cache.  In environments where the block size is smaller than the page
size, this flaw can cause data corruption and leak uninitialized memory
bytes during the recovery process.

Fix these issues by correcting this byte offset calculation on the page.

Link: https://lkml.kernel.org/r/20240124121936.10575-1-konishi.ryusuke@gmail.com
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Tested-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/nilfs2/recovery.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 0955b657938ff2..a9b8d77c8c1d55 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -472,9 +472,10 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
 
 static int nilfs_recovery_copy_block(struct the_nilfs *nilfs,
 				     struct nilfs_recovery_block *rb,
-				     struct page *page)
+				     loff_t pos, struct page *page)
 {
 	struct buffer_head *bh_org;
+	size_t from = pos & ~PAGE_MASK;
 	void *kaddr;
 
 	bh_org = __bread(nilfs->ns_bdev, rb->blocknr, nilfs->ns_blocksize);
@@ -482,7 +483,7 @@ static int nilfs_recovery_copy_block(struct the_nilfs *nilfs,
 		return -EIO;
 
 	kaddr = kmap_atomic(page);
-	memcpy(kaddr + bh_offset(bh_org), bh_org->b_data, bh_org->b_size);
+	memcpy(kaddr + from, bh_org->b_data, bh_org->b_size);
 	kunmap_atomic(kaddr);
 	brelse(bh_org);
 	return 0;
@@ -521,7 +522,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
 			goto failed_inode;
 		}
 
-		err = nilfs_recovery_copy_block(nilfs, rb, page);
+		err = nilfs_recovery_copy_block(nilfs, rb, pos, page);
 		if (unlikely(err))
 			goto failed_page;
 

From a06040dcc590b0e90be9c524ad05089cdd7858a4 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosryahmed@google.com>
Date: Wed, 24 Jan 2024 10:00:22 +0000
Subject: [PATCH 398/707] mm: memcg: optimize parent iteration in
 memcg_rstat_updated()

In memcg_rstat_updated(), we iterate the memcg being updated and its
parents to update memcg->vmstats_percpu->stats_updates in the fast path
(i.e. no atomic updates). According to my math, this is 3 memory loads
(and potentially 3 cache misses) per memcg:
- Load the address of memcg->vmstats_percpu.
- Load vmstats_percpu->stats_updates (based on some percpu calculation).
- Load the address of the parent memcg.

Avoid most of the cache misses by caching a pointer from each struct
memcg_vmstats_percpu to its parent on the corresponding CPU. In this
case, for the first memcg we have 2 memory loads (same as above):
- Load the address of memcg->vmstats_percpu.
- Load vmstats_percpu->stats_updates (based on some percpu calculation).

Then for each additional memcg, we need a single load to get the
parent's stats_updates directly. This reduces the number of loads from
O(3N) to O(2+N) -- where N is the number of memcgs we need to iterate.

Additionally, stash a pointer to memcg->vmstats in each struct
memcg_vmstats_percpu such that we can access the atomic counter that all
CPUs fold into, memcg->vmstats->stats_updates.
memcg_should_flush_stats() is changed to memcg_vmstats_needs_flush() to
accept a struct memcg_vmstats pointer accordingly.

In struct memcg_vmstats_percpu, make sure both pointers together with
stats_updates live on the same cacheline. Finally, update
mem_cgroup_alloc() to take in a parent pointer and initialize the new
cache pointers on each CPU. The percpu loop in mem_cgroup_alloc() may
look concerning, but there are multiple similar loops in the cgroup
creation path (e.g. cgroup_rstat_init()), most of which are hidden
within alloc_percpu().

According to Oliver's testing [1], this fixes multiple 30-38%
regressions in vm-scalability, will-it-scale-tlb_flush2, and
will-it-scale-fallocate1. This comes at a cost of 2 more pointers per
CPU (<2KB on a machine with 128 CPUs).

[1] https://lore.kernel.org/lkml/ZbDJsfsZt2ITyo61@xsang-OptiPlex-9020/

Link: https://lkml.kernel.org/r/20240124100023.660032-1-yosryahmed@google.com
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Fixes: 8d59d2214c23 ("mm: memcg: make stats flushing threshold per-memcg")
Tested-by: kernel test robot <oliver.sang@intel.com>
Reported-by: kernel test robot <oliver.sang@intel.com>
Closes: https://lore.kernel.org/oe-lkp/202401221624.cb53a8ca-oliver.sang@intel.com
Acked-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/memcontrol.c | 49 ++++++++++++++++++++++++++++++++-----------------
 1 file changed, 32 insertions(+), 17 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 46d8d02114cfee..d9ca0fdbe4ab04 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -633,8 +633,15 @@ struct memcg_vmstats_percpu {
 	unsigned long		nr_page_events;
 	unsigned long		targets[MEM_CGROUP_NTARGETS];
 
+	/* Fit members below in a single cacheline for memcg_rstat_updated() */
+	CACHELINE_PADDING(_pad1_);
+
 	/* Stats updates since the last flush */
 	unsigned int		stats_updates;
+
+	/* Cached pointers for fast iteration in memcg_rstat_updated() */
+	struct memcg_vmstats_percpu	*parent;
+	struct memcg_vmstats		*vmstats;
 };
 
 struct memcg_vmstats {
@@ -698,36 +705,35 @@ static void memcg_stats_unlock(void)
 }
 
 
-static bool memcg_should_flush_stats(struct mem_cgroup *memcg)
+static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats)
 {
-	return atomic64_read(&memcg->vmstats->stats_updates) >
+	return atomic64_read(&vmstats->stats_updates) >
 		MEMCG_CHARGE_BATCH * num_online_cpus();
 }
 
 static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
 {
+	struct memcg_vmstats_percpu *statc;
 	int cpu = smp_processor_id();
-	unsigned int x;
 
 	if (!val)
 		return;
 
 	cgroup_rstat_updated(memcg->css.cgroup, cpu);
-
-	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
-		x = __this_cpu_add_return(memcg->vmstats_percpu->stats_updates,
-					  abs(val));
-
-		if (x < MEMCG_CHARGE_BATCH)
+	statc = this_cpu_ptr(memcg->vmstats_percpu);
+	for (; statc; statc = statc->parent) {
+		statc->stats_updates += abs(val);
+		if (statc->stats_updates < MEMCG_CHARGE_BATCH)
 			continue;
 
 		/*
 		 * If @memcg is already flush-able, increasing stats_updates is
 		 * redundant. Avoid the overhead of the atomic update.
 		 */
-		if (!memcg_should_flush_stats(memcg))
-			atomic64_add(x, &memcg->vmstats->stats_updates);
-		__this_cpu_write(memcg->vmstats_percpu->stats_updates, 0);
+		if (!memcg_vmstats_needs_flush(statc->vmstats))
+			atomic64_add(statc->stats_updates,
+				     &statc->vmstats->stats_updates);
+		statc->stats_updates = 0;
 	}
 }
 
@@ -756,7 +762,7 @@ void mem_cgroup_flush_stats(struct mem_cgroup *memcg)
 	if (!memcg)
 		memcg = root_mem_cgroup;
 
-	if (memcg_should_flush_stats(memcg))
+	if (memcg_vmstats_needs_flush(memcg->vmstats))
 		do_flush_stats(memcg);
 }
 
@@ -770,7 +776,7 @@ void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
 static void flush_memcg_stats_dwork(struct work_struct *w)
 {
 	/*
-	 * Deliberately ignore memcg_should_flush_stats() here so that flushing
+	 * Deliberately ignore memcg_vmstats_needs_flush() here so that flushing
 	 * in latency-sensitive paths is as cheap as possible.
 	 */
 	do_flush_stats(root_mem_cgroup);
@@ -5477,10 +5483,11 @@ static void mem_cgroup_free(struct mem_cgroup *memcg)
 	__mem_cgroup_free(memcg);
 }
 
-static struct mem_cgroup *mem_cgroup_alloc(void)
+static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
 {
+	struct memcg_vmstats_percpu *statc, *pstatc;
 	struct mem_cgroup *memcg;
-	int node;
+	int node, cpu;
 	int __maybe_unused i;
 	long error = -ENOMEM;
 
@@ -5504,6 +5511,14 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 	if (!memcg->vmstats_percpu)
 		goto fail;
 
+	for_each_possible_cpu(cpu) {
+		if (parent)
+			pstatc = per_cpu_ptr(parent->vmstats_percpu, cpu);
+		statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
+		statc->parent = parent ? pstatc : NULL;
+		statc->vmstats = memcg->vmstats;
+	}
+
 	for_each_node(node)
 		if (alloc_mem_cgroup_per_node_info(memcg, node))
 			goto fail;
@@ -5549,7 +5564,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 	struct mem_cgroup *memcg, *old_memcg;
 
 	old_memcg = set_active_memcg(parent);
-	memcg = mem_cgroup_alloc();
+	memcg = mem_cgroup_alloc(parent);
 	set_active_memcg(old_memcg);
 	if (IS_ERR(memcg))
 		return ERR_CAST(memcg);

From 38312a7658804dd56d43f78d5081d9e742d9115d Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Wed, 24 Jan 2024 16:40:14 +0800
Subject: [PATCH 399/707] mm/memory-failure: fix crash in
 split_huge_page_to_list from soft_offline_page

When I did soft offline stress test, a machine was observed to crash with
the following message:

  kernel BUG at include/linux/memcontrol.h:554!
  invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
  CPU: 5 PID: 3837 Comm: hwpoison.sh Not tainted 6.7.0-next-20240112-00001-g8ecf3e7fb7c8-dirty #97
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
  RIP: 0010:folio_memcg+0xaf/0xd0
  Code: 10 5b 5d c3 cc cc cc cc 48 c7 c6 08 b1 f2 b2 48 89 ef e8 b4 c5 f8 ff 90 0f 0b 48 c7 c6 d0 b0 f2 b2 48 89 ef e8 a2 c5 f8 ff 90 <0f> 0b 48 c7 c6 08 b1 f2 b2 48 89 ef e8 90 c5 f8 ff 90 0f 0b 66 66
  RSP: 0018:ffffb6c043657c98 EFLAGS: 00000296
  RAX: 000000000000004b RBX: ffff932bc1d1e401 RCX: ffff933abfb5c908
  RDX: 0000000000000000 RSI: 0000000000000027 RDI: ffff933abfb5c900
  RBP: ffffea6f04019080 R08: ffffffffb3338ce8 R09: 0000000000009ffb
  R10: 00000000000004dd R11: ffffffffb3308d00 R12: ffffea6f04019080
  R13: ffffea6f04019080 R14: 0000000000000001 R15: ffffb6c043657da0
  FS:  00007f6c60f6b740(0000) GS:ffff933abfb40000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 0000559c3bc8b980 CR3: 0000000107f1c000 CR4: 00000000000006f0
  Call Trace:
   <TASK>
   split_huge_page_to_list+0x4d/0x1380
   try_to_split_thp_page+0x3a/0xf0
   soft_offline_page+0x1ea/0x8a0
   soft_offline_page_store+0x52/0x90
   kernfs_fop_write_iter+0x118/0x1b0
   vfs_write+0x30b/0x430
   ksys_write+0x5e/0xe0
   do_syscall_64+0xb0/0x1b0
   entry_SYSCALL_64_after_hwframe+0x6d/0x75
  RIP: 0033:0x7f6c60d14697
  Code: 10 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 48 89 54 24 18 48 89 74 24
  RSP: 002b:00007ffe9b72b8d8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
  RAX: ffffffffffffffda RBX: 000000000000000c RCX: 00007f6c60d14697
  RDX: 000000000000000c RSI: 0000559c3bc8b980 RDI: 0000000000000001
  RBP: 0000559c3bc8b980 R08: 00007f6c60dd1460 R09: 000000007fffffff
  R10: 0000000000000000 R11: 0000000000000246 R12: 000000000000000c
  R13: 00007f6c60e1a780 R14: 00007f6c60e16600 R15: 00007f6c60e15a00

The problem is that page->mapping is overloaded with slab->slab_list or
slabs fields now, so slab pages could be taken as non-LRU movable pages if
field slabs contains PAGE_MAPPING_MOVABLE or slab_list->prev is set to
LIST_POISON2.  These slab pages will be treated as thp later leading to
crash in split_huge_page_to_list().

Link: https://lkml.kernel.org/r/20240126065837.2100184-1-linmiaohe@huawei.com
Link: https://lkml.kernel.org/r/20240124084014.1772906-1-linmiaohe@huawei.com
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Fixes: 130d4df57390 ("mm/sl[au]b: rearrange struct slab fields to allow larger rcu_head")
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/memory-failure.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 636280d04008d8..9349948f1abfd1 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1377,6 +1377,9 @@ void ClearPageHWPoisonTakenOff(struct page *page)
  */
 static inline bool HWPoisonHandlable(struct page *page, unsigned long flags)
 {
+	if (PageSlab(page))
+		return false;
+
 	/* Soft offline could migrate non-LRU movable pages */
 	if ((flags & MF_SOFT_OFFLINE) && __PageMovable(page))
 		return true;

From cf5e3522d28ca30102d7d61eb504b4eaa338da33 Mon Sep 17 00:00:00 2001
From: Muhammad Usama Anjum <usama.anjum@collabora.com>
Date: Tue, 24 Oct 2023 20:51:25 +0500
Subject: [PATCH 400/707] selftests: core: include linux/close_range.h for
 CLOSE_RANGE_* macros

Correct header file is needed for getting CLOSE_RANGE_* macros.
Previously it was tested with newer glibc which didn't show the need to
include the header which was a mistake.

Link: https://lkml.kernel.org/r/20231024155137.219700-1-usama.anjum@collabora.com
Fixes: ec54424923cf ("selftests: core: remove duplicate defines")
Reported-by: Aishwarya TCV <aishwarya.tcv@arm.com>
Link: https://lore.kernel.org/all/7161219e-0223-d699-d6f3-81abd9abf13b@arm.com
Signed-off-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/core/close_range_test.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/core/close_range_test.c b/tools/testing/selftests/core/close_range_test.c
index 534576f06df1cc..c59e4adb905df6 100644
--- a/tools/testing/selftests/core/close_range_test.c
+++ b/tools/testing/selftests/core/close_range_test.c
@@ -12,6 +12,7 @@
 #include <syscall.h>
 #include <unistd.h>
 #include <sys/resource.h>
+#include <linux/close_range.h>
 
 #include "../kselftest_harness.h"
 #include "../clone3/clone3_selftests.h"

From 6c50951b21b502346557e08ca125ec1b372b92b1 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Mon, 22 Jan 2024 22:43:05 -0800
Subject: [PATCH 401/707] arch/arm/mm: fix major fault accounting when retrying
 under per-VMA lock

The change [1] missed ARM architecture when fixing major fault accounting
for page fault retry under per-VMA lock.

The user-visible effects is that it restores correct major fault
accounting that was broken after [2] was merged in 6.7 kernel. The
more detailed description is in [3] and this patch simply adds the
same fix to ARM architecture which I missed in [3].

Add missing code to fix ARM architecture fault accounting.

[1] 46e714c729c8 ("arch/mm/fault: fix major fault accounting when retrying under per-VMA lock")
[2] https://lore.kernel.org/all/20231006195318.4087158-6-willy@infradead.org/
[3] https://lore.kernel.org/all/20231226214610.109282-1-surenb@google.com/

Link: https://lkml.kernel.org/r/20240123064305.2829244-1-surenb@google.com
Fixes: 12214eba1992 ("mm: handle read faults under the VMA lock")
Reported-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm/mm/fault.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index e96fb40b9cc32a..07565b593ed681 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -298,6 +298,8 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 		goto done;
 	}
 	count_vm_vma_lock_event(VMA_LOCK_RETRY);
+	if (fault & VM_FAULT_MAJOR)
+		flags |= FAULT_FLAG_TRIED;
 
 	/* Quick path to respond to signals */
 	if (fault_signal_pending(fault, regs)) {

From 489bd091f9465e4b4b77c812255642d596d43e72 Mon Sep 17 00:00:00 2001
From: Sergey Senozhatsky <senozhatsky@chromium.org>
Date: Fri, 26 Jan 2024 12:25:48 +0900
Subject: [PATCH 402/707] mm/madvise: don't forget to leave lazy MMU mode in
 madvise_cold_or_pageout_pte_range()

We need to leave lazy MMU mode before unlocking.

Link: https://lkml.kernel.org/r/20240126032608.355899-1-senozhatsky@chromium.org
Fixes: b2f557a21bc8 ("mm/madvise: add cond_resched() in madvise_cold_or_pageout_pte_range()")
Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Jiexun Wang <wangjiexun@tinylab.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/madvise.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/madvise.c b/mm/madvise.c
index 912155a94ed587..cfa5e728826118 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -429,6 +429,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 		if (++batch_count == SWAP_CLUSTER_MAX) {
 			batch_count = 0;
 			if (need_resched()) {
+				arch_leave_lazy_mmu_mode();
 				pte_unmap_unlock(start_pte, ptl);
 				cond_resched();
 				goto restart;

From a48c290a3da03746500fdf1c566a40f62154df0b Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 29 Jan 2024 20:34:38 -0500
Subject: [PATCH 403/707] mm: zswap: fix objcg use-after-free in entry
 destruction

In the per-memcg LRU universe, LRU removal uses entry->objcg to determine
which list count needs to be decreased.  Drop the objcg reference after
updating the LRU, to fix a possible use-after-free.

Link: https://lkml.kernel.org/r/20240130013438.565167-1-hannes@cmpxchg.org
Fixes: a65b0e7607cc ("zswap: make shrinking memcg-aware")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Reviewed-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/zswap.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index ca25b676048ea6..0a94b197ed32e1 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -536,10 +536,6 @@ static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
  */
 static void zswap_free_entry(struct zswap_entry *entry)
 {
-	if (entry->objcg) {
-		obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
-		obj_cgroup_put(entry->objcg);
-	}
 	if (!entry->length)
 		atomic_dec(&zswap_same_filled_pages);
 	else {
@@ -548,6 +544,10 @@ static void zswap_free_entry(struct zswap_entry *entry)
 		atomic_dec(&entry->pool->nr_stored);
 		zswap_pool_put(entry->pool);
 	}
+	if (entry->objcg) {
+		obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
+		obj_cgroup_put(entry->objcg);
+	}
 	zswap_entry_cache_free(entry);
 	atomic_dec(&zswap_stored_pages);
 	zswap_update_total_size();

From 23469997cda3afa5153e3cd40de94680e67eb2b9 Mon Sep 17 00:00:00 2001
From: John Moon <john@jmoon.dev>
Date: Wed, 31 Jan 2024 03:43:18 +0000
Subject: [PATCH 404/707] mailmap: switch email address for John Moon

Add current email address as QUIC email is no longer active.

Link: https://lkml.kernel.org/r/20240131034311.46706-1-john@jmoon.dev
Signed-off-by: John Moon <john@jmoon.dev>
Acked-by: Trilok Soni <quic_tsoni@quicinc.com>
Cc: Elliot Berman <quic_eberman@quicinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 .mailmap | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.mailmap b/.mailmap
index 04998f7bda8181..8ae00bd3708a73 100644
--- a/.mailmap
+++ b/.mailmap
@@ -289,6 +289,7 @@ Johan Hovold <johan@kernel.org> <johan@hovoldconsulting.com>
 John Crispin <john@phrozen.org> <blogic@openwrt.org>
 John Fastabend <john.fastabend@gmail.com> <john.r.fastabend@intel.com>
 John Keeping <john@keeping.me.uk> <john@metanate.com>
+John Moon <john@jmoon.dev> <quic_johmoo@quicinc.com>
 John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
 John Stultz <johnstul@us.ibm.com>
 <jon.toppins+linux@gmail.com> <jtoppins@cumulusnetworks.com>

From 3cc3d209406993d20887a521c8a27f7388eee66c Mon Sep 17 00:00:00 2001
From: Oscar Salvador <osalvador@suse.de>
Date: Tue, 30 Jan 2024 22:04:18 +0100
Subject: [PATCH 405/707] fs,hugetlb: fix NULL pointer dereference in
 hugetlbs_fill_super

When configuring a hugetlb filesystem via the fsconfig() syscall, there is
a possible NULL dereference in hugetlbfs_fill_super() caused by assigning
NULL to ctx->hstate in hugetlbfs_parse_param() when the requested pagesize
is non valid.

E.g: Taking the following steps:

     fd = fsopen("hugetlbfs", FSOPEN_CLOEXEC);
     fsconfig(fd, FSCONFIG_SET_STRING, "pagesize", "1024", 0);
     fsconfig(fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);

Given that the requested "pagesize" is invalid, ctxt->hstate will be replaced
with NULL, losing its previous value, and we will print an error:

 ...
 ...
 case Opt_pagesize:
 ps = memparse(param->string, &rest);
 ctx->hstate = h;
 if (!ctx->hstate) {
         pr_err("Unsupported page size %lu MB\n", ps / SZ_1M);
         return -EINVAL;
 }
 return 0;
 ...
 ...

This is a problem because later on, we will dereference ctxt->hstate in
hugetlbfs_fill_super()

 ...
 ...
 sb->s_blocksize = huge_page_size(ctx->hstate);
 ...
 ...

Causing below Oops.

Fix this by replacing cxt->hstate value only when then pagesize is known
to be valid.

 kernel: hugetlbfs: Unsupported page size 0 MB
 kernel: BUG: kernel NULL pointer dereference, address: 0000000000000028
 kernel: #PF: supervisor read access in kernel mode
 kernel: #PF: error_code(0x0000) - not-present page
 kernel: PGD 800000010f66c067 P4D 800000010f66c067 PUD 1b22f8067 PMD 0
 kernel: Oops: 0000 [#1] PREEMPT SMP PTI
 kernel: CPU: 4 PID: 5659 Comm: syscall Tainted: G            E      6.8.0-rc2-default+ #22 5a47c3fef76212addcc6eb71344aabc35190ae8f
 kernel: Hardware name: Intel Corp. GROVEPORT/GROVEPORT, BIOS GVPRCRB1.86B.0016.D04.1705030402 05/03/2017
 kernel: RIP: 0010:hugetlbfs_fill_super+0xb4/0x1a0
 kernel: Code: 48 8b 3b e8 3e c6 ed ff 48 85 c0 48 89 45 20 0f 84 d6 00 00 00 48 b8 ff ff ff ff ff ff ff 7f 4c 89 e7 49 89 44 24 20 48 8b 03 <8b> 48 28 b8 00 10 00 00 48 d3 e0 49 89 44 24 18 48 8b 03 8b 40 28
 kernel: RSP: 0018:ffffbe9960fcbd48 EFLAGS: 00010246
 kernel: RAX: 0000000000000000 RBX: ffff9af5272ae780 RCX: 0000000000372004
 kernel: RDX: ffffffffffffffff RSI: ffffffffffffffff RDI: ffff9af555e9b000
 kernel: RBP: ffff9af52ee66b00 R08: 0000000000000040 R09: 0000000000370004
 kernel: R10: ffffbe9960fcbd48 R11: 0000000000000040 R12: ffff9af555e9b000
 kernel: R13: ffffffffa66b86c0 R14: ffff9af507d2f400 R15: ffff9af507d2f400
 kernel: FS:  00007ffbc0ba4740(0000) GS:ffff9b0bd7000000(0000) knlGS:0000000000000000
 kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 kernel: CR2: 0000000000000028 CR3: 00000001b1ee0000 CR4: 00000000001506f0
 kernel: Call Trace:
 kernel:  <TASK>
 kernel:  ? __die_body+0x1a/0x60
 kernel:  ? page_fault_oops+0x16f/0x4a0
 kernel:  ? search_bpf_extables+0x65/0x70
 kernel:  ? fixup_exception+0x22/0x310
 kernel:  ? exc_page_fault+0x69/0x150
 kernel:  ? asm_exc_page_fault+0x22/0x30
 kernel:  ? __pfx_hugetlbfs_fill_super+0x10/0x10
 kernel:  ? hugetlbfs_fill_super+0xb4/0x1a0
 kernel:  ? hugetlbfs_fill_super+0x28/0x1a0
 kernel:  ? __pfx_hugetlbfs_fill_super+0x10/0x10
 kernel:  vfs_get_super+0x40/0xa0
 kernel:  ? __pfx_bpf_lsm_capable+0x10/0x10
 kernel:  vfs_get_tree+0x25/0xd0
 kernel:  vfs_cmd_create+0x64/0xe0
 kernel:  __x64_sys_fsconfig+0x395/0x410
 kernel:  do_syscall_64+0x80/0x160
 kernel:  ? syscall_exit_to_user_mode+0x82/0x240
 kernel:  ? do_syscall_64+0x8d/0x160
 kernel:  ? syscall_exit_to_user_mode+0x82/0x240
 kernel:  ? do_syscall_64+0x8d/0x160
 kernel:  ? exc_page_fault+0x69/0x150
 kernel:  entry_SYSCALL_64_after_hwframe+0x6e/0x76
 kernel: RIP: 0033:0x7ffbc0cb87c9
 kernel: Code: 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 97 96 0d 00 f7 d8 64 89 01 48
 kernel: RSP: 002b:00007ffc29d2f388 EFLAGS: 00000206 ORIG_RAX: 00000000000001af
 kernel: RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007ffbc0cb87c9
 kernel: RDX: 0000000000000000 RSI: 0000000000000006 RDI: 0000000000000003
 kernel: RBP: 00007ffc29d2f3b0 R08: 0000000000000000 R09: 0000000000000000
 kernel: R10: 0000000000000000 R11: 0000000000000206 R12: 0000000000000000
 kernel: R13: 00007ffc29d2f4c0 R14: 0000000000000000 R15: 0000000000000000
 kernel:  </TASK>
 kernel: Modules linked in: rpcsec_gss_krb5(E) auth_rpcgss(E) nfsv4(E) dns_resolver(E) nfs(E) lockd(E) grace(E) sunrpc(E) netfs(E) af_packet(E) bridge(E) stp(E) llc(E) iscsi_ibft(E) iscsi_boot_sysfs(E) intel_rapl_msr(E) intel_rapl_common(E) iTCO_wdt(E) intel_pmc_bxt(E) sb_edac(E) iTCO_vendor_support(E) x86_pkg_temp_thermal(E) intel_powerclamp(E) coretemp(E) kvm_intel(E) rfkill(E) ipmi_ssif(E) kvm(E) acpi_ipmi(E) irqbypass(E) pcspkr(E) igb(E) ipmi_si(E) mei_me(E) i2c_i801(E) joydev(E) intel_pch_thermal(E) i2c_smbus(E) dca(E) lpc_ich(E) mei(E) ipmi_devintf(E) ipmi_msghandler(E) acpi_pad(E) tiny_power_button(E) button(E) fuse(E) efi_pstore(E) configfs(E) ip_tables(E) x_tables(E) ext4(E) mbcache(E) jbd2(E) hid_generic(E) usbhid(E) sd_mod(E) t10_pi(E) crct10dif_pclmul(E) crc32_pclmul(E) crc32c_intel(E) polyval_clmulni(E) ahci(E) xhci_pci(E) polyval_generic(E) gf128mul(E) ghash_clmulni_intel(E) sha512_ssse3(E) sha256_ssse3(E) xhci_pci_renesas(E) libahci(E) ehci_pci(E) sha1_ssse3(E) xhci_hcd(E) ehci_hcd(E) libata(E)
 kernel:  mgag200(E) i2c_algo_bit(E) usbcore(E) wmi(E) sg(E) dm_multipath(E) dm_mod(E) scsi_dh_rdac(E) scsi_dh_emc(E) scsi_dh_alua(E) scsi_mod(E) scsi_common(E) aesni_intel(E) crypto_simd(E) cryptd(E)
 kernel: Unloaded tainted modules: acpi_cpufreq(E):1 fjes(E):1
 kernel: CR2: 0000000000000028
 kernel: ---[ end trace 0000000000000000 ]---
 kernel: RIP: 0010:hugetlbfs_fill_super+0xb4/0x1a0
 kernel: Code: 48 8b 3b e8 3e c6 ed ff 48 85 c0 48 89 45 20 0f 84 d6 00 00 00 48 b8 ff ff ff ff ff ff ff 7f 4c 89 e7 49 89 44 24 20 48 8b 03 <8b> 48 28 b8 00 10 00 00 48 d3 e0 49 89 44 24 18 48 8b 03 8b 40 28
 kernel: RSP: 0018:ffffbe9960fcbd48 EFLAGS: 00010246
 kernel: RAX: 0000000000000000 RBX: ffff9af5272ae780 RCX: 0000000000372004
 kernel: RDX: ffffffffffffffff RSI: ffffffffffffffff RDI: ffff9af555e9b000
 kernel: RBP: ffff9af52ee66b00 R08: 0000000000000040 R09: 0000000000370004
 kernel: R10: ffffbe9960fcbd48 R11: 0000000000000040 R12: ffff9af555e9b000
 kernel: R13: ffffffffa66b86c0 R14: ffff9af507d2f400 R15: ffff9af507d2f400
 kernel: FS:  00007ffbc0ba4740(0000) GS:ffff9b0bd7000000(0000) knlGS:0000000000000000
 kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 kernel: CR2: 0000000000000028 CR3: 00000001b1ee0000 CR4: 00000000001506f0

Link: https://lkml.kernel.org/r/20240130210418.3771-1-osalvador@suse.de
Signed-off-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Oscar Salvador <osalvador@suse.de>
Acked-by: Muchun Song <muchun.song@linux.dev>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/hugetlbfs/inode.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index ee13c2ca8ad293..d746866ae3b6ba 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1365,6 +1365,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par
 {
 	struct hugetlbfs_fs_context *ctx = fc->fs_private;
 	struct fs_parse_result result;
+	struct hstate *h;
 	char *rest;
 	unsigned long ps;
 	int opt;
@@ -1409,11 +1410,12 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par
 
 	case Opt_pagesize:
 		ps = memparse(param->string, &rest);
-		ctx->hstate = size_to_hstate(ps);
-		if (!ctx->hstate) {
+		h = size_to_hstate(ps);
+		if (!h) {
 			pr_err("Unsupported page size %lu MB\n", ps / SZ_1M);
 			return -EINVAL;
 		}
+		ctx->hstate = h;
 		return 0;
 
 	case Opt_min_size:

From bbac2bacc15831e9f92dbf7deabe8192c2d8ea92 Mon Sep 17 00:00:00 2001
From: Chengming Zhou <zhouchengming@bytedance.com>
Date: Sun, 28 Jan 2024 13:28:49 +0000
Subject: [PATCH 406/707] mm/zswap: don't return LRU_SKIP if we have dropped
 lru lock

LRU_SKIP can only be returned if we don't ever dropped lru lock, or we
need to return LRU_RETRY to restart from the head of lru list.

Otherwise, the iteration might continue from a cursor position that was
freed while the locks were dropped.

Actually we may need to introduce another LRU_STOP to really terminate the
ongoing shrinking scan process, when we encounter a warm page already in
the swap cache.  The current list_lru implementation doesn't have this
function to early break from __list_lru_walk_one.

Link: https://lkml.kernel.org/r/20240126-zswap-writeback-race-v2-1-b10479847099@bytedance.com
Fixes: b5ba474f3f51 ("zswap: shrink zswap pool based on memory pressure")
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Cc: Chris Li <chriscli@google.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/zswap.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index 0a94b197ed32e1..350dd2fc815994 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -895,10 +895,8 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
 		 * into the warmer region. We should terminate shrinking (if we're in the dynamic
 		 * shrinker context).
 		 */
-		if (writeback_result == -EEXIST && encountered_page_in_swapcache) {
-			ret = LRU_SKIP;
+		if (writeback_result == -EEXIST && encountered_page_in_swapcache)
 			*encountered_page_in_swapcache = true;
-		}
 
 		goto put_unlock;
 	}

From a9243ad48dad5605fadeabdf5d146576e1297aa0 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosryahmed@google.com>
Date: Thu, 25 Jan 2024 08:51:27 +0000
Subject: [PATCH 407/707] mm: zswap: fix missing folio cleanup in writeback
 race path

In zswap_writeback_entry(), after we get a folio from
__read_swap_cache_async(), we grab the tree lock again to check that the
swap entry was not invalidated and recycled.  If it was, we delete the
folio we just added to the swap cache and exit.

However, __read_swap_cache_async() returns the folio locked when it is
newly allocated, which is always true for this path, and the folio is
ref'd.  Make sure to unlock and put the folio before returning.

This was discovered by code inspection, probably because this path handles
a race condition that should not happen often, and the bug would not crash
the system, it will only strand the folio indefinitely.

Link: https://lkml.kernel.org/r/20240125085127.1327013-1-yosryahmed@google.com
Fixes: 04fc7816089c ("mm: fix zswap writeback race condition")
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: Chengming Zhou <zhouchengming@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/zswap.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mm/zswap.c b/mm/zswap.c
index 350dd2fc815994..d2423247acfd64 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1440,6 +1440,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 	if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) {
 		spin_unlock(&tree->lock);
 		delete_from_swap_cache(folio);
+		folio_unlock(folio);
+		folio_put(folio);
 		return -ENOMEM;
 	}
 	spin_unlock(&tree->lock);

From 064fd31ffbe142d57dc698081f99de5b78979e68 Mon Sep 17 00:00:00 2001
From: Kalesh Singh <kaleshsingh@google.com>
Date: Tue, 9 Jan 2024 17:22:33 -0800
Subject: [PATCH 408/707] mm/cma: fix placement of trace_cma_alloc_start/finish

The current placement of trace_cma_alloc_start/finish misses the fail
cases: !cma || !cma->count || !cma->bitmap.

trace_cma_alloc_finish is also not emitted for the failure case
where bitmap_count > bitmap_maxno.

Fix these missed cases by moving the start event before the failure
checks and moving the finish event to the out label.

Link: https://lkml.kernel.org/r/20240110012234.3793639-1-kaleshsingh@google.com
Fixes: 7bc1aec5e287 ("mm: cma: add trace events for CMA alloc perf testing")
Signed-off-by: Kalesh Singh <kaleshsingh@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Liam Mark <lmark@codeaurora.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/cma.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mm/cma.c b/mm/cma.c
index 7c09c47e530bf6..e12cf41d83549a 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -436,6 +436,9 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
 	unsigned long i;
 	struct page *page = NULL;
 	int ret = -ENOMEM;
+	const char *name = cma ? cma->name : NULL;
+
+	trace_cma_alloc_start(name, count, align);
 
 	if (!cma || !cma->count || !cma->bitmap)
 		goto out;
@@ -446,8 +449,6 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
 	if (!count)
 		goto out;
 
-	trace_cma_alloc_start(cma->name, count, align);
-
 	mask = cma_bitmap_aligned_mask(cma, align);
 	offset = cma_bitmap_aligned_offset(cma, align);
 	bitmap_maxno = cma_bitmap_maxno(cma);
@@ -496,8 +497,6 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
 		start = bitmap_no + mask + 1;
 	}
 
-	trace_cma_alloc_finish(cma->name, pfn, page, count, align, ret);
-
 	/*
 	 * CMA can allocate multiple page blocks, which results in different
 	 * blocks being marked with different tags. Reset the tags to ignore
@@ -516,6 +515,7 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
 
 	pr_debug("%s(): returned %p\n", __func__, page);
 out:
+	trace_cma_alloc_finish(name, pfn, page, count, align, ret);
 	if (page) {
 		count_vm_event(CMA_ALLOC_SUCCESS);
 		cma_sysfs_account_success_pages(cma, count);

From c3f7c49c1fd3cea9b486dd38e20082dab4b3904a Mon Sep 17 00:00:00 2001
From: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Date: Tue, 9 Jan 2024 14:31:19 -0800
Subject: [PATCH 409/707] maple_tree: fix comment describing
 mas_node_count_gfp()

The function description comment for mas_node_count_gfp() mistakingly
refers to the function as mas_node_count().  Change it to refer to the
correct function.

Link: https://lkml.kernel.org/r/20240109223119.162357-1-sidhartha.kumar@oracle.com
Signed-off-by: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Peng Zhang <zhangpeng.00@bytedance.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 lib/maple_tree.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index 6f241bb3879920..7b161802860bdb 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -1307,8 +1307,8 @@ static inline void mas_free(struct ma_state *mas, struct maple_enode *used)
 }
 
 /*
- * mas_node_count() - Check if enough nodes are allocated and request more if
- * there is not enough nodes.
+ * mas_node_count_gfp() - Check if enough nodes are allocated and request more
+ * if there is not enough nodes.
  * @mas: The maple state
  * @count: The number of nodes needed
  * @gfp: the gfp flags

From 372abffc7919cffd734d993cc5cc33c0f60b9175 Mon Sep 17 00:00:00 2001
From: Sumanth Korikkar <sumanthk@linux.ibm.com>
Date: Mon, 8 Jan 2024 14:27:43 +0100
Subject: [PATCH 410/707] mm/memory_hotplug: introduce
 MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers

Patch series "implement "memmap on memory" feature on s390".

This series provides "memmap on memory" support on s390 platform.  "memmap
on memory" allows struct pages array to be allocated from the hotplugged
memory range instead of allocating it from main system memory.

s390 currently preallocates struct pages array for all potentially
possible memory, which ensures memory onlining always succeeds, but with
the cost of significant memory consumption from the available system
memory during boottime.  In certain extreme configuration, this could lead
to ipl failure.

"memmap on memory" ensures struct pages array are populated from self
contained hotplugged memory range instead of depleting the available
system memory and this could eliminate ipl failure on s390 platform.

On other platforms, system might go OOM when the physically hotplugged
memory depletes the available memory before it is onlined.  Hence, "memmap
on memory" feature was introduced as described in commit a08a2ae34613
("mm,memory_hotplug: allocate memmap from the added memory range").

Unlike other architectures, s390 memory blocks are not physically
accessible until it is online.  To make it physically accessible two new
memory notifiers MEM_PREPARE_ONLINE / MEM_FINISH_OFFLINE are added and
this notifier lets the hypervisor inform that the memory should be made
physically accessible.  This allows for "memmap on memory" initialization
during memory hotplug onlining phase, which is performed before calling
MEM_GOING_ONLINE notifier.

Patch 1 introduces MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE memory notifiers
to prepare the transition of memory to and from a physically accessible
state.  New mhp_flag MHP_OFFLINE_INACCESSIBLE is introduced to ensure
altmap cannot be written when adding memory - before it is set online.
This enhancement is crucial for implementing the "memmap on memory"
feature for s390 in a subsequent patch.

Patches 2 allocates vmemmap pages from self-contained memory range for
s390.  It allocates memory map (struct pages array) from the hotplugged
memory range, rather than using system memory by passing altmap to vmemmap
functions.

Patch 3 removes unhandled memory notifier types on s390.

Patch 4 implements MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE memory notifiers
on s390.  MEM_PREPARE_ONLINE memory notifier makes memory block physical
accessible via sclp assign command.  The notifier ensures self-contained
memory maps are accessible and hence enabling the "memmap on memory" on
s390.  MEM_FINISH_OFFLINE memory notifier shifts the memory block to an
inaccessible state via sclp unassign command.

Patch 5 finally enables MHP_MEMMAP_ON_MEMORY on s390.


This patch (of 5):

Introduce MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE memory notifiers to
prepare the transition of memory to and from a physically accessible
state.  This enhancement is crucial for implementing the "memmap on
memory" feature for s390 in a subsequent patch.

Platforms such as x86 can support physical memory hotplug via ACPI.  When
there is physical memory hotplug, ACPI event leads to the memory addition
with the following callchain:

acpi_memory_device_add()
  -> acpi_memory_enable_device()
     -> __add_memory()

After this, the hotplugged memory is physically accessible, and altmap
support prepared, before the "memmap on memory" initialization in
memory_block_online() is called.

On s390, memory hotplug works in a different way.  The available hotplug
memory has to be defined upfront in the hypervisor, but it is made
physically accessible only when the user sets it online via sysfs,
currently in the MEM_GOING_ONLINE notifier.  This is too late and "memmap
on memory" initialization is performed before calling MEM_GOING_ONLINE
notifier.

During the memory hotplug addition phase, altmap support is prepared and
during the memory onlining phase s390 requires memory to be physically
accessible and then subsequently initiate the "memmap on memory"
initialization process.

The memory provider will handle new MEM_PREPARE_ONLINE /
MEM_FINISH_OFFLINE notifications and make the memory accessible.

The mhp_flag MHP_OFFLINE_INACCESSIBLE is introduced and is relevant when
used along with MHP_MEMMAP_ON_MEMORY, because the altmap cannot be written
(e.g., poisoned) when adding memory -- before it is set online.  This
allows for adding memory with an altmap that is not currently made
available by a hypervisor.  When onlining that memory, the hypervisor can
be instructed to make that memory accessible via the new notifiers and the
onlining phase will not require any memory allocations, which is helpful
in low-memory situations.

All architectures ignore unknown memory notifiers.  Therefore, the
introduction of these new notifiers does not result in any functional
modifications across architectures.

Link: https://lkml.kernel.org/r/20240108132747.3238763-1-sumanthk@linux.ibm.com
Link: https://lkml.kernel.org/r/20240108132747.3238763-2-sumanthk@linux.ibm.com
Signed-off-by: Sumanth Korikkar <sumanthk@linux.ibm.com>
Suggested-by: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Suggested-by: David Hildenbrand <david@redhat.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/base/memory.c          | 23 ++++++++++++++++++++++-
 include/linux/memory.h         |  9 +++++++++
 include/linux/memory_hotplug.h | 18 +++++++++++++++++-
 include/linux/memremap.h       |  1 +
 mm/memory_hotplug.c            | 17 ++++++++++++++---
 mm/sparse.c                    |  3 ++-
 6 files changed, 65 insertions(+), 6 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 14f964a7719bd0..c0436f46cfb701 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -188,6 +188,7 @@ static int memory_block_online(struct memory_block *mem)
 	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
 	unsigned long nr_vmemmap_pages = 0;
+	struct memory_notify arg;
 	struct zone *zone;
 	int ret;
 
@@ -207,9 +208,19 @@ static int memory_block_online(struct memory_block *mem)
 	if (mem->altmap)
 		nr_vmemmap_pages = mem->altmap->free;
 
+	arg.altmap_start_pfn = start_pfn;
+	arg.altmap_nr_pages = nr_vmemmap_pages;
+	arg.start_pfn = start_pfn + nr_vmemmap_pages;
+	arg.nr_pages = nr_pages - nr_vmemmap_pages;
 	mem_hotplug_begin();
+	ret = memory_notify(MEM_PREPARE_ONLINE, &arg);
+	ret = notifier_to_errno(ret);
+	if (ret)
+		goto out_notifier;
+
 	if (nr_vmemmap_pages) {
-		ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone);
+		ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages,
+						zone, mem->altmap->inaccessible);
 		if (ret)
 			goto out;
 	}
@@ -231,7 +242,11 @@ static int memory_block_online(struct memory_block *mem)
 					  nr_vmemmap_pages);
 
 	mem->zone = zone;
+	mem_hotplug_done();
+	return ret;
 out:
+	memory_notify(MEM_FINISH_OFFLINE, &arg);
+out_notifier:
 	mem_hotplug_done();
 	return ret;
 }
@@ -244,6 +259,7 @@ static int memory_block_offline(struct memory_block *mem)
 	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
 	unsigned long nr_vmemmap_pages = 0;
+	struct memory_notify arg;
 	int ret;
 
 	if (!mem->zone)
@@ -275,6 +291,11 @@ static int memory_block_offline(struct memory_block *mem)
 		mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
 
 	mem->zone = NULL;
+	arg.altmap_start_pfn = start_pfn;
+	arg.altmap_nr_pages = nr_vmemmap_pages;
+	arg.start_pfn = start_pfn + nr_vmemmap_pages;
+	arg.nr_pages = nr_pages - nr_vmemmap_pages;
+	memory_notify(MEM_FINISH_OFFLINE, &arg);
 out:
 	mem_hotplug_done();
 	return ret;
diff --git a/include/linux/memory.h b/include/linux/memory.h
index f53cfdaaaa4166..939a16bd5cea15 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -96,8 +96,17 @@ int set_memory_block_size_order(unsigned int order);
 #define	MEM_GOING_ONLINE	(1<<3)
 #define	MEM_CANCEL_ONLINE	(1<<4)
 #define	MEM_CANCEL_OFFLINE	(1<<5)
+#define	MEM_PREPARE_ONLINE	(1<<6)
+#define	MEM_FINISH_OFFLINE	(1<<7)
 
 struct memory_notify {
+	/*
+	 * The altmap_start_pfn and altmap_nr_pages fields are designated for
+	 * specifying the altmap range and are exclusively intended for use in
+	 * MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers.
+	 */
+	unsigned long altmap_start_pfn;
+	unsigned long altmap_nr_pages;
 	unsigned long start_pfn;
 	unsigned long nr_pages;
 	int status_change_nid_normal;
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 7d207658349416..ee00015575aab3 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -106,6 +106,22 @@ typedef int __bitwise mhp_t;
  * implies the node id (nid).
  */
 #define MHP_NID_IS_MGID		((__force mhp_t)BIT(2))
+/*
+ * The hotplugged memory is completely inaccessible while the memory is
+ * offline. The memory provider will handle MEM_PREPARE_ONLINE /
+ * MEM_FINISH_OFFLINE notifications and make the memory accessible.
+ *
+ * This flag is only relevant when used along with MHP_MEMMAP_ON_MEMORY,
+ * because the altmap cannot be written (e.g., poisoned) when adding
+ * memory -- before it is set online.
+ *
+ * This allows for adding memory with an altmap that is not currently
+ * made available by a hypervisor. When onlining that memory, the
+ * hypervisor can be instructed to make that memory available, and
+ * the onlining phase will not require any memory allocations, which is
+ * helpful in low-memory situations.
+ */
+#define MHP_OFFLINE_INACCESSIBLE	((__force mhp_t)BIT(3))
 
 /*
  * Extended parameters for memory hotplug:
@@ -154,7 +170,7 @@ extern void adjust_present_page_count(struct page *page,
 				      long nr_pages);
 /* VM interface that may be used by firmware interface */
 extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
-				     struct zone *zone);
+				     struct zone *zone, bool mhp_off_inaccessible);
 extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages);
 extern int online_pages(unsigned long pfn, unsigned long nr_pages,
 			struct zone *zone, struct memory_group *group);
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 744c830f4b132c..9837f3e6fb9582 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -25,6 +25,7 @@ struct vmem_altmap {
 	unsigned long free;
 	unsigned long align;
 	unsigned long alloc;
+	bool inaccessible;
 };
 
 /*
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 21890994c1d3cc..707027f691503f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1087,7 +1087,7 @@ void adjust_present_page_count(struct page *page, struct memory_group *group,
 }
 
 int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
-			      struct zone *zone)
+			      struct zone *zone, bool mhp_off_inaccessible)
 {
 	unsigned long end_pfn = pfn + nr_pages;
 	int ret, i;
@@ -1096,6 +1096,15 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
 	if (ret)
 		return ret;
 
+	/*
+	 * Memory block is accessible at this stage and hence poison the struct
+	 * pages now.  If the memory block is accessible during memory hotplug
+	 * addition phase, then page poisining is already performed in
+	 * sparse_add_section().
+	 */
+	if (mhp_off_inaccessible)
+		page_init_poison(pfn_to_page(pfn), sizeof(struct page) * nr_pages);
+
 	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
 
 	for (i = 0; i < nr_pages; i++)
@@ -1415,7 +1424,7 @@ static void __ref remove_memory_blocks_and_altmaps(u64 start, u64 size)
 }
 
 static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group,
-					    u64 start, u64 size)
+					    u64 start, u64 size, mhp_t mhp_flags)
 {
 	unsigned long memblock_size = memory_block_size_bytes();
 	u64 cur_start;
@@ -1431,6 +1440,8 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group,
 		};
 
 		mhp_altmap.free = memory_block_memmap_on_memory_pages();
+		if (mhp_flags & MHP_OFFLINE_INACCESSIBLE)
+			mhp_altmap.inaccessible = true;
 		params.altmap = kmemdup(&mhp_altmap, sizeof(struct vmem_altmap),
 					GFP_KERNEL);
 		if (!params.altmap) {
@@ -1516,7 +1527,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 	 */
 	if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) &&
 	    mhp_supports_memmap_on_memory(memory_block_size_bytes())) {
-		ret = create_altmaps_and_memory_blocks(nid, group, start, size);
+		ret = create_altmaps_and_memory_blocks(nid, group, start, size, mhp_flags);
 		if (ret)
 			goto error;
 	} else {
diff --git a/mm/sparse.c b/mm/sparse.c
index 338cf946dee8de..aed0951b87fa04 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -908,7 +908,8 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
 	 * Poison uninitialized struct pages in order to catch invalid flags
 	 * combinations.
 	 */
-	page_init_poison(memmap, sizeof(struct page) * nr_pages);
+	if (!altmap || !altmap->inaccessible)
+		page_init_poison(memmap, sizeof(struct page) * nr_pages);
 
 	ms = __nr_to_section(section_nr);
 	set_section_nid(section_nr, nid);

From 18d6f1015a1913132d32f5cfa84aa301ad7e5f84 Mon Sep 17 00:00:00 2001
From: Sumanth Korikkar <sumanthk@linux.ibm.com>
Date: Mon, 8 Jan 2024 14:27:44 +0100
Subject: [PATCH 411/707] s390/mm: allocate vmemmap pages from self-contained
 memory range

Allocate memory map (struct pages array) from the hotplugged memory
range, rather than using system memory. The change addresses the issue
where standby memory, when configured to be much larger than online
memory, could potentially lead to ipl failure due to memory map
allocation from online memory. For example, 16MB of memory map
allocation is needed for a memory block size of 1GB and when standby
memory is configured much larger than online memory, this could lead to
ipl failure.

To address this issue, the solution involves introducing "memmap on
memory" using the vmem_altmap structure on s390.  Architectures that
want to implement it should pass the altmap to the vmemmap_populate()
function and its associated callchain. This enhancement is discussed in
commit 4b94ffdc4163 ("x86, mm: introduce vmem_altmap to augment
vmemmap_populate()")

Provide "memmap on memory" support for s390 by passing the altmap in
vmemmap_populate() and its callchain. The allocation path is described
as follows:
* When altmap is NULL in vmemmap_populate(), memory map allocation
  occurs using the existing vmemmap_alloc_block_buf().
* When altmap is not NULL in vmemmap_populate(), memory map allocation
  still uses vmemmap_alloc_block_buf(), but this function internally
  calls altmap_alloc_block_buf().

For deallocation, the process is outlined as follows:
* When altmap is NULL in vmemmap_free(), memory map deallocation happens
  through free_pages().
* When altmap is not NULL in vmemmap_free(), memory map deallocation
  occurs via vmem_altmap_free().

While memory map allocation is primarily handled through the
self-contained memory map range, there might still be a small amount of
system memory allocation required for vmemmap pagetables. To mitigate
this impact, this feature will be limited to machines with EDAT1
support.

Link: https://lkml.kernel.org/r/20240108132747.3238763-3-sumanthk@linux.ibm.com
Reviewed-by: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Signed-off-by: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/s390/mm/init.c |  3 ---
 arch/s390/mm/vmem.c | 62 +++++++++++++++++++++++++--------------------
 2 files changed, 35 insertions(+), 30 deletions(-)

diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 43e612bc2bcd34..8d9a60ccb7771a 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -281,9 +281,6 @@ int arch_add_memory(int nid, u64 start, u64 size,
 	unsigned long size_pages = PFN_DOWN(size);
 	int rc;
 
-	if (WARN_ON_ONCE(params->altmap))
-		return -EINVAL;
-
 	if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot))
 		return -EINVAL;
 
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index 186a020857cf6a..eb100479f7bec4 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -33,8 +33,12 @@ static void __ref *vmem_alloc_pages(unsigned int order)
 	return memblock_alloc(size, size);
 }
 
-static void vmem_free_pages(unsigned long addr, int order)
+static void vmem_free_pages(unsigned long addr, int order, struct vmem_altmap *altmap)
 {
+	if (altmap) {
+		vmem_altmap_free(altmap, 1 << order);
+		return;
+	}
 	/* We don't expect boot memory to be removed ever. */
 	if (!slab_is_available() ||
 	    WARN_ON_ONCE(PageReserved(virt_to_page((void *)addr))))
@@ -156,7 +160,8 @@ static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end)
 
 /* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
 static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr,
-				  unsigned long end, bool add, bool direct)
+				  unsigned long end, bool add, bool direct,
+				  struct vmem_altmap *altmap)
 {
 	unsigned long prot, pages = 0;
 	int ret = -ENOMEM;
@@ -172,11 +177,11 @@ static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr,
 			if (pte_none(*pte))
 				continue;
 			if (!direct)
-				vmem_free_pages((unsigned long) pfn_to_virt(pte_pfn(*pte)), 0);
+				vmem_free_pages((unsigned long)pfn_to_virt(pte_pfn(*pte)), get_order(PAGE_SIZE), altmap);
 			pte_clear(&init_mm, addr, pte);
 		} else if (pte_none(*pte)) {
 			if (!direct) {
-				void *new_page = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);
+				void *new_page = vmemmap_alloc_block_buf(PAGE_SIZE, NUMA_NO_NODE, altmap);
 
 				if (!new_page)
 					goto out;
@@ -213,7 +218,8 @@ static void try_free_pte_table(pmd_t *pmd, unsigned long start)
 
 /* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
 static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
-				  unsigned long end, bool add, bool direct)
+				  unsigned long end, bool add, bool direct,
+				  struct vmem_altmap *altmap)
 {
 	unsigned long next, prot, pages = 0;
 	int ret = -ENOMEM;
@@ -234,11 +240,11 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
 				if (IS_ALIGNED(addr, PMD_SIZE) &&
 				    IS_ALIGNED(next, PMD_SIZE)) {
 					if (!direct)
-						vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE));
+						vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE), altmap);
 					pmd_clear(pmd);
 					pages++;
 				} else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) {
-					vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE));
+					vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE), altmap);
 					pmd_clear(pmd);
 				}
 				continue;
@@ -261,7 +267,7 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
 				 * page tables since vmemmap_populate gets
 				 * called for each section separately.
 				 */
-				new_page = vmemmap_alloc_block(PMD_SIZE, NUMA_NO_NODE);
+				new_page = vmemmap_alloc_block_buf(PMD_SIZE, NUMA_NO_NODE, altmap);
 				if (new_page) {
 					set_pmd(pmd, __pmd(__pa(new_page) | prot));
 					if (!IS_ALIGNED(addr, PMD_SIZE) ||
@@ -280,7 +286,7 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
 				vmemmap_use_sub_pmd(addr, next);
 			continue;
 		}
-		ret = modify_pte_table(pmd, addr, next, add, direct);
+		ret = modify_pte_table(pmd, addr, next, add, direct, altmap);
 		if (ret)
 			goto out;
 		if (!add)
@@ -302,12 +308,12 @@ static void try_free_pmd_table(pud_t *pud, unsigned long start)
 	for (i = 0; i < PTRS_PER_PMD; i++, pmd++)
 		if (!pmd_none(*pmd))
 			return;
-	vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER);
+	vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER, NULL);
 	pud_clear(pud);
 }
 
 static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end,
-			    bool add, bool direct)
+			    bool add, bool direct, struct vmem_altmap *altmap)
 {
 	unsigned long next, prot, pages = 0;
 	int ret = -ENOMEM;
@@ -347,7 +353,7 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end,
 		} else if (pud_large(*pud)) {
 			continue;
 		}
-		ret = modify_pmd_table(pud, addr, next, add, direct);
+		ret = modify_pmd_table(pud, addr, next, add, direct, altmap);
 		if (ret)
 			goto out;
 		if (!add)
@@ -370,12 +376,12 @@ static void try_free_pud_table(p4d_t *p4d, unsigned long start)
 		if (!pud_none(*pud))
 			return;
 	}
-	vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER);
+	vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER, NULL);
 	p4d_clear(p4d);
 }
 
 static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end,
-			    bool add, bool direct)
+			    bool add, bool direct, struct vmem_altmap *altmap)
 {
 	unsigned long next;
 	int ret = -ENOMEM;
@@ -394,7 +400,7 @@ static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end,
 				goto out;
 			p4d_populate(&init_mm, p4d, pud);
 		}
-		ret = modify_pud_table(p4d, addr, next, add, direct);
+		ret = modify_pud_table(p4d, addr, next, add, direct, altmap);
 		if (ret)
 			goto out;
 		if (!add)
@@ -415,12 +421,12 @@ static void try_free_p4d_table(pgd_t *pgd, unsigned long start)
 		if (!p4d_none(*p4d))
 			return;
 	}
-	vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER);
+	vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER, NULL);
 	pgd_clear(pgd);
 }
 
 static int modify_pagetable(unsigned long start, unsigned long end, bool add,
-			    bool direct)
+			    bool direct, struct vmem_altmap *altmap)
 {
 	unsigned long addr, next;
 	int ret = -ENOMEM;
@@ -445,7 +451,7 @@ static int modify_pagetable(unsigned long start, unsigned long end, bool add,
 				goto out;
 			pgd_populate(&init_mm, pgd, p4d);
 		}
-		ret = modify_p4d_table(pgd, addr, next, add, direct);
+		ret = modify_p4d_table(pgd, addr, next, add, direct, altmap);
 		if (ret)
 			goto out;
 		if (!add)
@@ -458,14 +464,16 @@ static int modify_pagetable(unsigned long start, unsigned long end, bool add,
 	return ret;
 }
 
-static int add_pagetable(unsigned long start, unsigned long end, bool direct)
+static int add_pagetable(unsigned long start, unsigned long end, bool direct,
+			 struct vmem_altmap *altmap)
 {
-	return modify_pagetable(start, end, true, direct);
+	return modify_pagetable(start, end, true, direct, altmap);
 }
 
-static int remove_pagetable(unsigned long start, unsigned long end, bool direct)
+static int remove_pagetable(unsigned long start, unsigned long end, bool direct,
+			    struct vmem_altmap *altmap)
 {
-	return modify_pagetable(start, end, false, direct);
+	return modify_pagetable(start, end, false, direct, altmap);
 }
 
 /*
@@ -474,7 +482,7 @@ static int remove_pagetable(unsigned long start, unsigned long end, bool direct)
 static int vmem_add_range(unsigned long start, unsigned long size)
 {
 	start = (unsigned long)__va(start);
-	return add_pagetable(start, start + size, true);
+	return add_pagetable(start, start + size, true, NULL);
 }
 
 /*
@@ -483,7 +491,7 @@ static int vmem_add_range(unsigned long start, unsigned long size)
 static void vmem_remove_range(unsigned long start, unsigned long size)
 {
 	start = (unsigned long)__va(start);
-	remove_pagetable(start, start + size, true);
+	remove_pagetable(start, start + size, true, NULL);
 }
 
 /*
@@ -496,9 +504,9 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 
 	mutex_lock(&vmem_mutex);
 	/* We don't care about the node, just use NUMA_NO_NODE on allocations */
-	ret = add_pagetable(start, end, false);
+	ret = add_pagetable(start, end, false, altmap);
 	if (ret)
-		remove_pagetable(start, end, false);
+		remove_pagetable(start, end, false, altmap);
 	mutex_unlock(&vmem_mutex);
 	return ret;
 }
@@ -509,7 +517,7 @@ void vmemmap_free(unsigned long start, unsigned long end,
 		  struct vmem_altmap *altmap)
 {
 	mutex_lock(&vmem_mutex);
-	remove_pagetable(start, end, false);
+	remove_pagetable(start, end, false, altmap);
 	mutex_unlock(&vmem_mutex);
 }
 

From 0315083eda6e6cb355b825588525a6482d266891 Mon Sep 17 00:00:00 2001
From: Sumanth Korikkar <sumanthk@linux.ibm.com>
Date: Mon, 8 Jan 2024 14:27:45 +0100
Subject: [PATCH 412/707] s390/sclp: remove unhandled memory notifier type

Remove memory notifier types which are unhandled by s390.  Unhandled
memory notifier types are covered by default case.

Link: https://lkml.kernel.org/r/20240108132747.3238763-4-sumanthk@linux.ibm.com
Suggested-by: Alexander Gordeev <agordeev@linux.ibm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/s390/char/sclp_cmd.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/s390/char/sclp_cmd.c b/drivers/s390/char/sclp_cmd.c
index 11c428f4c7cf9c..355e63e44e9546 100644
--- a/drivers/s390/char/sclp_cmd.c
+++ b/drivers/s390/char/sclp_cmd.c
@@ -340,9 +340,6 @@ static int sclp_mem_notifier(struct notifier_block *nb,
 		if (contains_standby_increment(start, start + size))
 			rc = -EPERM;
 		break;
-	case MEM_ONLINE:
-	case MEM_CANCEL_OFFLINE:
-		break;
 	case MEM_GOING_ONLINE:
 		rc = sclp_mem_change_state(start, size, 1);
 		break;

From 5aab491ca6980dc38668befba71764afc3b7c2b3 Mon Sep 17 00:00:00 2001
From: Sumanth Korikkar <sumanthk@linux.ibm.com>
Date: Mon, 8 Jan 2024 14:27:46 +0100
Subject: [PATCH 413/707] s390/mm: implement
 MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers

MEM_PREPARE_ONLINE memory notifier makes memory block physical
accessible via sclp assign command. The notifier ensures self-contained
memory maps are accessible and hence enabling the "memmap on memory" on
s390.

MEM_FINISH_OFFLINE memory notifier shifts the memory block to an
inaccessible state via sclp unassign command.

Implementation considerations:
* When MHP_MEMMAP_ON_MEMORY is disabled, the system retains the old
  behavior. This means the memory map is allocated from default memory.
* If MACHINE_HAS_EDAT1 is unavailable, MHP_MEMMAP_ON_MEMORY is
  automatically disabled. This ensures that vmemmap pagetables do not
  consume additional memory from the default memory allocator.
* The MEM_GOING_ONLINE notifier has been modified to perform no
  operation, as MEM_PREPARE_ONLINE already executes the sclp assign
  command.
* The MEM_CANCEL_ONLINE/MEM_OFFLINE notifier now performs no operation, as
  MEM_FINISH_OFFLINE already executes the sclp unassign command.

Link: https://lkml.kernel.org/r/20240108132747.3238763-5-sumanthk@linux.ibm.com
Reviewed-by: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/s390/char/sclp_cmd.c | 41 ++++++++++++++++++++++++++++++------
 1 file changed, 35 insertions(+), 6 deletions(-)

diff --git a/drivers/s390/char/sclp_cmd.c b/drivers/s390/char/sclp_cmd.c
index 355e63e44e9546..7815e9bea69a13 100644
--- a/drivers/s390/char/sclp_cmd.c
+++ b/drivers/s390/char/sclp_cmd.c
@@ -18,6 +18,7 @@
 #include <linux/mm.h>
 #include <linux/mmzone.h>
 #include <linux/memory.h>
+#include <linux/memory_hotplug.h>
 #include <linux/module.h>
 #include <asm/ctlreg.h>
 #include <asm/chpid.h>
@@ -26,6 +27,7 @@
 #include <asm/sclp.h>
 #include <asm/numa.h>
 #include <asm/facility.h>
+#include <asm/page-states.h>
 
 #include "sclp.h"
 
@@ -340,13 +342,38 @@ static int sclp_mem_notifier(struct notifier_block *nb,
 		if (contains_standby_increment(start, start + size))
 			rc = -EPERM;
 		break;
-	case MEM_GOING_ONLINE:
+	case MEM_PREPARE_ONLINE:
+		/*
+		 * Access the altmap_start_pfn and altmap_nr_pages fields
+		 * within the struct memory_notify specifically when dealing
+		 * with only MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers.
+		 *
+		 * When altmap is in use, take the specified memory range
+		 * online, which includes the altmap.
+		 */
+		if (arg->altmap_nr_pages) {
+			start = PFN_PHYS(arg->altmap_start_pfn);
+			size += PFN_PHYS(arg->altmap_nr_pages);
+		}
 		rc = sclp_mem_change_state(start, size, 1);
+		if (rc || !arg->altmap_nr_pages)
+			break;
+		/*
+		 * Set CMMA state to nodat here, since the struct page memory
+		 * at the beginning of the memory block will not go through the
+		 * buddy allocator later.
+		 */
+		__arch_set_page_nodat((void *)__va(start), arg->altmap_nr_pages);
 		break;
-	case MEM_CANCEL_ONLINE:
-		sclp_mem_change_state(start, size, 0);
-		break;
-	case MEM_OFFLINE:
+	case MEM_FINISH_OFFLINE:
+		/*
+		 * When altmap is in use, take the specified memory range
+		 * offline, which includes the altmap.
+		 */
+		if (arg->altmap_nr_pages) {
+			start = PFN_PHYS(arg->altmap_start_pfn);
+			size += PFN_PHYS(arg->altmap_nr_pages);
+		}
 		sclp_mem_change_state(start, size, 0);
 		break;
 	default:
@@ -397,7 +424,9 @@ static void __init add_memory_merged(u16 rn)
 	if (!size)
 		goto skip_add;
 	for (addr = start; addr < start + size; addr += block_size)
-		add_memory(0, addr, block_size, MHP_NONE);
+		add_memory(0, addr, block_size,
+			   MACHINE_HAS_EDAT1 ?
+			   MHP_MEMMAP_ON_MEMORY | MHP_OFFLINE_INACCESSIBLE : MHP_NONE);
 skip_add:
 	first_rn = rn;
 	num = 1;

From 1b08541d5ddbfd13d3a38c0eee3804fd7e06ecff Mon Sep 17 00:00:00 2001
From: Sumanth Korikkar <sumanthk@linux.ibm.com>
Date: Mon, 8 Jan 2024 14:27:47 +0100
Subject: [PATCH 414/707] s390: enable MHP_MEMMAP_ON_MEMORY

Enable MHP_MEMMAP_ON_MEMORY to support "memmap on memory".
memory_hotplug.memmap_on_memory=true kernel parameter should be set in
kernel boot option to enable the feature.

Link: https://lkml.kernel.org/r/20240108132747.3238763-6-sumanthk@linux.ibm.com
Reviewed-by: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Signed-off-by: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/s390/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index fe565f3a3a917d..a1d6dcbc89654c 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -113,6 +113,7 @@ config S390
 	select ARCH_INLINE_WRITE_UNLOCK_BH
 	select ARCH_INLINE_WRITE_UNLOCK_IRQ
 	select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
+	select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
 	select ARCH_STACKWALK
 	select ARCH_SUPPORTS_ATOMIC_RMW
 	select ARCH_SUPPORTS_DEBUG_PAGEALLOC

From f1528ed4b3d39f947278be070bd94865c47df157 Mon Sep 17 00:00:00 2001
From: Hongbo Li <lihongbo22@huawei.com>
Date: Mon, 8 Jan 2024 12:48:15 +0800
Subject: [PATCH 415/707] mm/filemap: avoid type conversion

The return type of function folio_test_hugetlb is bool type, there is no
need to assign it to an integer type.

Link: https://lkml.kernel.org/r/20240108044815.3291487-1-lihongbo22@huawei.com
Signed-off-by: Hongbo Li <lihongbo22@huawei.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/filemap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 750e779c23db74..0d7e20edf46f59 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -843,7 +843,7 @@ noinline int __filemap_add_folio(struct address_space *mapping,
 		struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
 {
 	XA_STATE(xas, &mapping->i_pages, index);
-	int huge = folio_test_hugetlb(folio);
+	bool huge = folio_test_hugetlb(folio);
 	bool charged = false;
 	long nr = 1;
 

From d969a80a29f23e42bdf3e0f6493bec63fe2e542c Mon Sep 17 00:00:00 2001
From: JP Kobryn <inwardvessel@gmail.com>
Date: Fri, 5 Jan 2024 12:24:01 -0800
Subject: [PATCH 416/707] selftests/mm/ksm_functional: prevent unmapping
 undefined address

Replace some goto statements with return statements so that unmap() is not
called on an undefined address.  This change is made so that unmap() can
only be reached after mmap() is called (and the address mentioned is
defined).  Returning MAP_FAILED seems acceptable since client code checks
for this value.

Link: https://lkml.kernel.org/r/20240105202401.28851-1-inwardvessel@gmail.com
Fixes: 42096aa24b82 ("selftest/mm: ksm_functional_tests: test in mmap_and_merge_range() if anything got merged")
Signed-off-by: JP Kobryn <inwardvessel@gmail.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/ksm_functional_tests.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c
index fbff0dd09191f1..d615767e396bec 100644
--- a/tools/testing/selftests/mm/ksm_functional_tests.c
+++ b/tools/testing/selftests/mm/ksm_functional_tests.c
@@ -155,12 +155,12 @@ static char *mmap_and_merge_range(char val, unsigned long size, int prot,
 	/* Stabilize accounting by disabling KSM completely. */
 	if (ksm_unmerge()) {
 		ksft_test_result_fail("Disabling (unmerging) KSM failed\n");
-		goto unmap;
+		return MAP_FAILED;
 	}
 
 	if (get_my_merging_pages() > 0) {
 		ksft_test_result_fail("Still pages merged\n");
-		goto unmap;
+		return MAP_FAILED;
 	}
 
 	map = mmap(NULL, size, PROT_READ|PROT_WRITE,

From a87529d95e3a0372f04c5c8281fb6a81b358a7f3 Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Fri, 5 Jan 2024 07:54:19 -0800
Subject: [PATCH 417/707] selftests/mm: new test that steals pages

This test stresses the race between of madvise(DONTNEED), a page fault
and a parallel huge page mmap, which should fail due to lack of
available page available for mapping.

This test case must run on a system with one and only one huge page
available.

	# echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages

During setup, the test allocates the only available page, and starts
three threads:

  - thread 1:
      * madvise(MADV_DONTNEED) on the allocated huge page
  - thread 2:
      * Write to the allocated huge page
  - thread 3:
      * Tries to allocated (steal) an extra huge page (which is not
        available)

thread 3 should never succeed in the allocation, since the only huge
page was never unmapped, and should be reserved.

Touching the old page after thread3 allocation will raise a SIGBUS.

Link: https://lkml.kernel.org/r/20240105155419.1939484-2-leitao@debian.org
Signed-off-by: Breno Leitao <leitao@debian.org>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Vegard Nossum <vegard.nossum@oracle.com>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/.gitignore         |   1 +
 tools/testing/selftests/mm/Makefile           |   1 +
 .../selftests/mm/hugetlb_madv_vs_map.c        | 124 ++++++++++++++++++
 3 files changed, 126 insertions(+)
 create mode 100644 tools/testing/selftests/mm/hugetlb_madv_vs_map.c

diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore
index 4ff10ea6146179..d26e962f2ac490 100644
--- a/tools/testing/selftests/mm/.gitignore
+++ b/tools/testing/selftests/mm/.gitignore
@@ -46,3 +46,4 @@ gup_longterm
 mkdirty
 va_high_addr_switch
 hugetlb_fault_after_madv
+hugetlb_madv_vs_map
diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index 2453add65d12f8..990e9bb112c507 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -70,6 +70,7 @@ TEST_GEN_FILES += ksm_tests
 TEST_GEN_FILES += ksm_functional_tests
 TEST_GEN_FILES += mdwe_test
 TEST_GEN_FILES += hugetlb_fault_after_madv
+TEST_GEN_FILES += hugetlb_madv_vs_map
 
 ifneq ($(ARCH),arm64)
 TEST_GEN_FILES += soft-dirty
diff --git a/tools/testing/selftests/mm/hugetlb_madv_vs_map.c b/tools/testing/selftests/mm/hugetlb_madv_vs_map.c
new file mode 100644
index 00000000000000..d01e8d4901d0b5
--- /dev/null
+++ b/tools/testing/selftests/mm/hugetlb_madv_vs_map.c
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A test case that must run on a system with one and only one huge page available.
+ *	# echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
+ *
+ * During setup, the test allocates the only available page, and starts three threads:
+ *  - thread1:
+ *	* madvise(MADV_DONTNEED) on the allocated huge page
+ *  - thread 2:
+ *	* Write to the allocated huge page
+ *  - thread 3:
+ *	* Try to allocated an extra huge page (which must not available)
+ *
+ *  The test fails if thread3 is able to allocate a page.
+ *
+ *  Touching the first page after thread3's allocation will raise a SIGBUS
+ *
+ *  Author: Breno Leitao <leitao@debian.org>
+ */
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "vm_util.h"
+#include "../kselftest.h"
+
+#define MMAP_SIZE (1 << 21)
+#define INLOOP_ITER 100
+
+char *huge_ptr;
+
+/* Touch the memory while it is being madvised() */
+void *touch(void *unused)
+{
+	for (int i = 0; i < INLOOP_ITER; i++)
+		huge_ptr[0] = '.';
+
+	return NULL;
+}
+
+void *madv(void *unused)
+{
+	for (int i = 0; i < INLOOP_ITER; i++)
+		madvise(huge_ptr, MMAP_SIZE, MADV_DONTNEED);
+
+	return NULL;
+}
+
+/*
+ * We got here, and there must be no huge page available for mapping
+ * The other hugepage should be flipping from used <-> reserved, because
+ * of madvise(DONTNEED).
+ */
+void *map_extra(void *unused)
+{
+	void *ptr;
+
+	for (int i = 0; i < INLOOP_ITER; i++) {
+		ptr = mmap(NULL, MMAP_SIZE, PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+			   -1, 0);
+
+		if ((long)ptr != -1) {
+			/* Touching the other page now will cause a SIGBUG
+			 * huge_ptr[0] = '1';
+			 */
+			return ptr;
+		}
+	}
+
+	return NULL;
+}
+
+int main(void)
+{
+	pthread_t thread1, thread2, thread3;
+	unsigned long free_hugepages;
+	void *ret;
+
+	/*
+	 * On kernel 6.7, we are able to reproduce the problem with ~10
+	 * interactions
+	 */
+	int max = 10;
+
+	free_hugepages = get_free_hugepages();
+
+	if (free_hugepages != 1) {
+		ksft_exit_skip("This test needs one and only one page to execute. Got %lu\n",
+			       free_hugepages);
+	}
+
+	while (max--) {
+		huge_ptr = mmap(NULL, MMAP_SIZE, PROT_READ | PROT_WRITE,
+				MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+				-1, 0);
+
+		if ((unsigned long)huge_ptr == -1) {
+			ksft_exit_skip("Failed to allocated huge page\n");
+			return KSFT_SKIP;
+		}
+
+		pthread_create(&thread1, NULL, madv, NULL);
+		pthread_create(&thread2, NULL, touch, NULL);
+		pthread_create(&thread3, NULL, map_extra, NULL);
+
+		pthread_join(thread1, NULL);
+		pthread_join(thread2, NULL);
+		pthread_join(thread3, &ret);
+
+		if (ret) {
+			ksft_test_result_fail("Unexpected huge page allocation\n");
+			return KSFT_FAIL;
+		}
+
+		/* Unmap and restart */
+		munmap(huge_ptr, MMAP_SIZE);
+	}
+
+	return KSFT_PASS;
+}

From 52729cfd76d397b50105e89d9cfa7a6e1e9ff2cd Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Tue, 2 Jan 2024 19:46:23 +0100
Subject: [PATCH 418/707] mm: vmalloc: add va_alloc() helper

Patch series "Mitigate a vmap lock contention", v3.

1. Motivation

- Offload global vmap locks making it scaled to number of CPUS;

- If possible and there is an agreement, we can remove the "Per cpu kva
  allocator" to make the vmap code to be more simple;

- There were complaints from XFS folk that a vmalloc might be contented
  on their workloads.

2. Design(high level overview)

We introduce an effective vmap node logic.  A node behaves as independent
entity to serve an allocation request directly(if possible) from its pool.
That way it bypasses a global vmap space that is protected by its own
lock.

An access to pools are serialized by CPUs.  Number of nodes are equal to
number of CPUs in a system.  Please note the high threshold is bound to
128 nodes.

Pools are size segregated and populated based on system demand.  The
maximum alloc request that can be stored into a segregated storage is 256
pages.  The lazily drain path decays a pool by 25% as a first step and as
second populates it by fresh freed VAs for reuse instead of returning them
into a global space.

When a VA is obtained(alloc path), it is stored in separate nodes.  A
va->va_start address is converted into a correct node where it should be
placed and resided.  Doing so we balance VAs across the nodes as a result
an access becomes scalable.  The addr_to_node() function does a proper
address conversion to a correct node.

A vmap space is divided on segments with fixed size, it is 16 pages.  That
way any address can be associated with a segment number.  Number of
segments are equal to num_possible_cpus() but not grater then 128.  The
numeration starts from 0.  See below how it is converted:

static inline unsigned int
addr_to_node_id(unsigned long addr)
{
	return (addr / zone_size) % nr_nodes;
}

On a free path, a VA can be easily found by converting its "va_start"
address to a certain node it resides.  It is moved from "busy" data to
"lazy" data structure.  Later on, as noted earlier, the lazy kworker
decays each node pool and populates it by fresh incoming VAs.  Please
note, a VA is returned to a node that did an alloc request.

3. Test on AMD Ryzen Threadripper 3970X 32-Core Processor

sudo ./test_vmalloc.sh run_test_mask=7 nr_threads=64

<default perf>
 94.41%     0.89%  [kernel]        [k] _raw_spin_lock
 93.35%    93.07%  [kernel]        [k] native_queued_spin_lock_slowpath
 76.13%     0.28%  [kernel]        [k] __vmalloc_node_range
 72.96%     0.81%  [kernel]        [k] alloc_vmap_area
 56.94%     0.00%  [kernel]        [k] __get_vm_area_node
 41.95%     0.00%  [kernel]        [k] vmalloc
 37.15%     0.01%  [test_vmalloc]  [k] full_fit_alloc_test
 35.17%     0.00%  [kernel]        [k] ret_from_fork_asm
 35.17%     0.00%  [kernel]        [k] ret_from_fork
 35.17%     0.00%  [kernel]        [k] kthread
 35.08%     0.00%  [test_vmalloc]  [k] test_func
 34.45%     0.00%  [test_vmalloc]  [k] fix_size_alloc_test
 28.09%     0.01%  [test_vmalloc]  [k] long_busy_list_alloc_test
 23.53%     0.25%  [kernel]        [k] vfree.part.0
 21.72%     0.00%  [kernel]        [k] remove_vm_area
 20.08%     0.21%  [kernel]        [k] find_unlink_vmap_area
  2.34%     0.61%  [kernel]        [k] free_vmap_area_noflush
<default perf>
   vs
<patch-series perf>
 82.32%     0.22%  [test_vmalloc]  [k] long_busy_list_alloc_test
 63.36%     0.02%  [kernel]        [k] vmalloc
 63.34%     2.64%  [kernel]        [k] __vmalloc_node_range
 30.42%     4.46%  [kernel]        [k] vfree.part.0
 28.98%     2.51%  [kernel]        [k] __alloc_pages_bulk
 27.28%     0.19%  [kernel]        [k] __get_vm_area_node
 26.13%     1.50%  [kernel]        [k] alloc_vmap_area
 21.72%    21.67%  [kernel]        [k] clear_page_rep
 19.51%     2.43%  [kernel]        [k] _raw_spin_lock
 16.61%    16.51%  [kernel]        [k] native_queued_spin_lock_slowpath
 13.40%     2.07%  [kernel]        [k] free_unref_page
 10.62%     0.01%  [kernel]        [k] remove_vm_area
  9.02%     8.73%  [kernel]        [k] insert_vmap_area
  8.94%     0.00%  [kernel]        [k] ret_from_fork_asm
  8.94%     0.00%  [kernel]        [k] ret_from_fork
  8.94%     0.00%  [kernel]        [k] kthread
  8.29%     0.00%  [test_vmalloc]  [k] test_func
  7.81%     0.05%  [test_vmalloc]  [k] full_fit_alloc_test
  5.30%     4.73%  [kernel]        [k] purge_vmap_node
  4.47%     2.65%  [kernel]        [k] free_vmap_area_noflush
<patch-series perf>

confirms that a native_queued_spin_lock_slowpath goes down to
16.51% percent from 93.07%.

The throughput is ~12x higher:

urezki@pc638:~$ time sudo ./test_vmalloc.sh run_test_mask=7 nr_threads=64
Run the test with following parameters: run_test_mask=7 nr_threads=64
Done.
Check the kernel ring buffer to see the summary.

real    10m51.271s
user    0m0.013s
sys     0m0.187s
urezki@pc638:~$

urezki@pc638:~$ time sudo ./test_vmalloc.sh run_test_mask=7 nr_threads=64
Run the test with following parameters: run_test_mask=7 nr_threads=64
Done.
Check the kernel ring buffer to see the summary.

real    0m51.301s
user    0m0.015s
sys     0m0.040s
urezki@pc638:~$


This patch (of 11):

Currently __alloc_vmap_area() function contains an open codded logic that
finds and adjusts a VA based on allocation request.

Introduce a va_alloc() helper that adjusts found VA only.  There is no a
functional change as a result of this patch.

Link: https://lkml.kernel.org/r/20240102184633.748113-1-urezki@gmail.com
Link: https://lkml.kernel.org/r/20240102184633.748113-2-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Reviewed-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sony.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Kazuhito Hagio <k-hagio-ab@nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/vmalloc.c | 41 ++++++++++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 13 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d12a17fc0c171c..739401a9eafcfe 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1481,6 +1481,32 @@ adjust_va_to_fit_type(struct rb_root *root, struct list_head *head,
 	return 0;
 }
 
+static unsigned long
+va_alloc(struct vmap_area *va,
+		struct rb_root *root, struct list_head *head,
+		unsigned long size, unsigned long align,
+		unsigned long vstart, unsigned long vend)
+{
+	unsigned long nva_start_addr;
+	int ret;
+
+	if (va->va_start > vstart)
+		nva_start_addr = ALIGN(va->va_start, align);
+	else
+		nva_start_addr = ALIGN(vstart, align);
+
+	/* Check the "vend" restriction. */
+	if (nva_start_addr + size > vend)
+		return vend;
+
+	/* Update the free vmap_area. */
+	ret = adjust_va_to_fit_type(root, head, va, nva_start_addr, size);
+	if (WARN_ON_ONCE(ret))
+		return vend;
+
+	return nva_start_addr;
+}
+
 /*
  * Returns a start address of the newly allocated area, if success.
  * Otherwise a vend is returned that indicates failure.
@@ -1493,7 +1519,6 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head,
 	bool adjust_search_size = true;
 	unsigned long nva_start_addr;
 	struct vmap_area *va;
-	int ret;
 
 	/*
 	 * Do not adjust when:
@@ -1511,18 +1536,8 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head,
 	if (unlikely(!va))
 		return vend;
 
-	if (va->va_start > vstart)
-		nva_start_addr = ALIGN(va->va_start, align);
-	else
-		nva_start_addr = ALIGN(vstart, align);
-
-	/* Check the "vend" restriction. */
-	if (nva_start_addr + size > vend)
-		return vend;
-
-	/* Update the free vmap_area. */
-	ret = adjust_va_to_fit_type(root, head, va, nva_start_addr, size);
-	if (WARN_ON_ONCE(ret))
+	nva_start_addr = va_alloc(va, root, head, size, align, vstart, vend);
+	if (nva_start_addr == vend)
 		return vend;
 
 #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK

From 169a5a6ddde9a36782c5738e108c858b54f5511b Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Tue, 2 Jan 2024 19:46:24 +0100
Subject: [PATCH 419/707] mm: vmalloc: rename adjust_va_to_fit_type() function

This patch renames the adjust_va_to_fit_type() function to va_clip() which
is shorter and more expressive.

There is no a functional change as a result of this patch.

Link: https://lkml.kernel.org/r/20240102184633.748113-3-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Reviewed-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Kazuhito Hagio <k-hagio-ab@nec.com>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sony.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/vmalloc.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 739401a9eafcfe..10f289e865122a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1382,9 +1382,9 @@ classify_va_fit_type(struct vmap_area *va,
 }
 
 static __always_inline int
-adjust_va_to_fit_type(struct rb_root *root, struct list_head *head,
-		      struct vmap_area *va, unsigned long nva_start_addr,
-		      unsigned long size)
+va_clip(struct rb_root *root, struct list_head *head,
+		struct vmap_area *va, unsigned long nva_start_addr,
+		unsigned long size)
 {
 	struct vmap_area *lva = NULL;
 	enum fit_type type = classify_va_fit_type(va, nva_start_addr, size);
@@ -1500,7 +1500,7 @@ va_alloc(struct vmap_area *va,
 		return vend;
 
 	/* Update the free vmap_area. */
-	ret = adjust_va_to_fit_type(root, head, va, nva_start_addr, size);
+	ret = va_clip(root, head, va, nva_start_addr, size);
 	if (WARN_ON_ONCE(ret))
 		return vend;
 
@@ -4155,9 +4155,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 			/* It is a BUG(), but trigger recovery instead. */
 			goto recovery;
 
-		ret = adjust_va_to_fit_type(&free_vmap_area_root,
-					    &free_vmap_area_list,
-					    va, start, size);
+		ret = va_clip(&free_vmap_area_root,
+			&free_vmap_area_list, va, start, size);
 		if (WARN_ON_ONCE(unlikely(ret)))
 			/* It is a BUG(), but trigger recovery instead. */
 			goto recovery;

From 51d4a5c59481a6edb5caa52953f38f686e8f893c Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Tue, 2 Jan 2024 19:46:25 +0100
Subject: [PATCH 420/707] mm: vmalloc: move vmap_init_free_space() down in
 vmalloc.c

A vmap_init_free_space() is a function that setups a vmap space and is
considered as part of initialization phase.  Since a main entry which is
vmalloc_init(), has been moved down in vmalloc.c it makes sense to follow
the pattern.

There is no a functional change as a result of this patch.

Link: https://lkml.kernel.org/r/20240102184633.748113-4-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Reviewed-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Kazuhito Hagio <k-hagio-ab@nec.com>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sony.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/vmalloc.c | 82 ++++++++++++++++++++++++++--------------------------
 1 file changed, 41 insertions(+), 41 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 10f289e865122a..06bd843d18ae99 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2512,47 +2512,6 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align)
 	kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
 }
 
-static void vmap_init_free_space(void)
-{
-	unsigned long vmap_start = 1;
-	const unsigned long vmap_end = ULONG_MAX;
-	struct vmap_area *busy, *free;
-
-	/*
-	 *     B     F     B     B     B     F
-	 * -|-----|.....|-----|-----|-----|.....|-
-	 *  |           The KVA space           |
-	 *  |<--------------------------------->|
-	 */
-	list_for_each_entry(busy, &vmap_area_list, list) {
-		if (busy->va_start - vmap_start > 0) {
-			free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
-			if (!WARN_ON_ONCE(!free)) {
-				free->va_start = vmap_start;
-				free->va_end = busy->va_start;
-
-				insert_vmap_area_augment(free, NULL,
-					&free_vmap_area_root,
-						&free_vmap_area_list);
-			}
-		}
-
-		vmap_start = busy->va_end;
-	}
-
-	if (vmap_end - vmap_start > 0) {
-		free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
-		if (!WARN_ON_ONCE(!free)) {
-			free->va_start = vmap_start;
-			free->va_end = vmap_end;
-
-			insert_vmap_area_augment(free, NULL,
-				&free_vmap_area_root,
-					&free_vmap_area_list);
-		}
-	}
-}
-
 static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
 	struct vmap_area *va, unsigned long flags, const void *caller)
 {
@@ -4465,6 +4424,47 @@ module_init(proc_vmalloc_init);
 
 #endif
 
+static void vmap_init_free_space(void)
+{
+	unsigned long vmap_start = 1;
+	const unsigned long vmap_end = ULONG_MAX;
+	struct vmap_area *busy, *free;
+
+	/*
+	 *     B     F     B     B     B     F
+	 * -|-----|.....|-----|-----|-----|.....|-
+	 *  |           The KVA space           |
+	 *  |<--------------------------------->|
+	 */
+	list_for_each_entry(busy, &vmap_area_list, list) {
+		if (busy->va_start - vmap_start > 0) {
+			free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
+			if (!WARN_ON_ONCE(!free)) {
+				free->va_start = vmap_start;
+				free->va_end = busy->va_start;
+
+				insert_vmap_area_augment(free, NULL,
+					&free_vmap_area_root,
+						&free_vmap_area_list);
+			}
+		}
+
+		vmap_start = busy->va_end;
+	}
+
+	if (vmap_end - vmap_start > 0) {
+		free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
+		if (!WARN_ON_ONCE(!free)) {
+			free->va_start = vmap_start;
+			free->va_end = vmap_end;
+
+			insert_vmap_area_augment(free, NULL,
+				&free_vmap_area_root,
+					&free_vmap_area_list);
+		}
+	}
+}
+
 void __init vmalloc_init(void)
 {
 	struct vmap_area *va;

From aac1abf328a2616ad925eeb326d2c6691c628102 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Tue, 2 Jan 2024 19:46:26 +0100
Subject: [PATCH 421/707] mm: vmalloc: remove global vmap_area_root rb-tree

Store allocated objects in a separate nodes.  A va->va_start address is
converted into a correct node where it should be placed and resided.  An
addr_to_node() function is used to do a proper address conversion to
determine a node that contains a VA.

Such approach balances VAs across nodes as a result an access becomes
scalable.  Number of nodes in a system depends on number of CPUs.

Please note:

1. As of now allocated VAs are bound to a node-0. It means the
   patch does not give any difference comparing with a current
   behavior;

2. The global vmap_area_lock, vmap_area_root are removed as there
   is no need in it anymore. The vmap_area_list is still kept and
   is _empty_. It is exported for a kexec only;

3. The vmallocinfo and vread() have to be reworked to be able to
   handle multiple nodes.

Link: https://lkml.kernel.org/r/20240102184633.748113-5-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Reviewed-by: Baoquan He <bhe@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Kazuhito Hagio <k-hagio-ab@nec.com>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sony.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/vmalloc.c | 240 +++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 173 insertions(+), 67 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 06bd843d18ae99..786ecb18ae228b 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -728,11 +728,9 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
 #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
 
 
-static DEFINE_SPINLOCK(vmap_area_lock);
 static DEFINE_SPINLOCK(free_vmap_area_lock);
 /* Export for kexec only */
 LIST_HEAD(vmap_area_list);
-static struct rb_root vmap_area_root = RB_ROOT;
 static bool vmap_initialized __read_mostly;
 
 static struct rb_root purge_vmap_area_root = RB_ROOT;
@@ -772,6 +770,38 @@ static struct rb_root free_vmap_area_root = RB_ROOT;
  */
 static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
 
+/*
+ * An effective vmap-node logic. Users make use of nodes instead
+ * of a global heap. It allows to balance an access and mitigate
+ * contention.
+ */
+struct rb_list {
+	struct rb_root root;
+	struct list_head head;
+	spinlock_t lock;
+};
+
+static struct vmap_node {
+	/* Bookkeeping data of this node. */
+	struct rb_list busy;
+} single;
+
+static struct vmap_node *vmap_nodes = &single;
+static __read_mostly unsigned int nr_vmap_nodes = 1;
+static __read_mostly unsigned int vmap_zone_size = 1;
+
+static inline unsigned int
+addr_to_node_id(unsigned long addr)
+{
+	return (addr / vmap_zone_size) % nr_vmap_nodes;
+}
+
+static inline struct vmap_node *
+addr_to_node(unsigned long addr)
+{
+	return &vmap_nodes[addr_to_node_id(addr)];
+}
+
 static __always_inline unsigned long
 va_size(struct vmap_area *va)
 {
@@ -803,10 +833,11 @@ unsigned long vmalloc_nr_pages(void)
 }
 
 /* Look up the first VA which satisfies addr < va_end, NULL if none. */
-static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
+static struct vmap_area *
+find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root)
 {
 	struct vmap_area *va = NULL;
-	struct rb_node *n = vmap_area_root.rb_node;
+	struct rb_node *n = root->rb_node;
 
 	addr = (unsigned long)kasan_reset_tag((void *)addr);
 
@@ -1552,12 +1583,14 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head,
  */
 static void free_vmap_area(struct vmap_area *va)
 {
+	struct vmap_node *vn = addr_to_node(va->va_start);
+
 	/*
 	 * Remove from the busy tree/list.
 	 */
-	spin_lock(&vmap_area_lock);
-	unlink_va(va, &vmap_area_root);
-	spin_unlock(&vmap_area_lock);
+	spin_lock(&vn->busy.lock);
+	unlink_va(va, &vn->busy.root);
+	spin_unlock(&vn->busy.lock);
 
 	/*
 	 * Insert/Merge it back to the free tree/list.
@@ -1600,6 +1633,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 				int node, gfp_t gfp_mask,
 				unsigned long va_flags)
 {
+	struct vmap_node *vn;
 	struct vmap_area *va;
 	unsigned long freed;
 	unsigned long addr;
@@ -1645,9 +1679,11 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 	va->vm = NULL;
 	va->flags = va_flags;
 
-	spin_lock(&vmap_area_lock);
-	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
-	spin_unlock(&vmap_area_lock);
+	vn = addr_to_node(va->va_start);
+
+	spin_lock(&vn->busy.lock);
+	insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
+	spin_unlock(&vn->busy.lock);
 
 	BUG_ON(!IS_ALIGNED(va->va_start, align));
 	BUG_ON(va->va_start < vstart);
@@ -1871,26 +1907,61 @@ static void free_unmap_vmap_area(struct vmap_area *va)
 
 struct vmap_area *find_vmap_area(unsigned long addr)
 {
+	struct vmap_node *vn;
 	struct vmap_area *va;
+	int i, j;
 
-	spin_lock(&vmap_area_lock);
-	va = __find_vmap_area(addr, &vmap_area_root);
-	spin_unlock(&vmap_area_lock);
+	/*
+	 * An addr_to_node_id(addr) converts an address to a node index
+	 * where a VA is located. If VA spans several zones and passed
+	 * addr is not the same as va->va_start, what is not common, we
+	 * may need to scan an extra nodes. See an example:
+	 *
+	 *      <--va-->
+	 * -|-----|-----|-----|-----|-
+	 *     1     2     0     1
+	 *
+	 * VA resides in node 1 whereas it spans 1 and 2. If passed
+	 * addr is within a second node we should do extra work. We
+	 * should mention that it is rare and is a corner case from
+	 * the other hand it has to be covered.
+	 */
+	i = j = addr_to_node_id(addr);
+	do {
+		vn = &vmap_nodes[i];
 
-	return va;
+		spin_lock(&vn->busy.lock);
+		va = __find_vmap_area(addr, &vn->busy.root);
+		spin_unlock(&vn->busy.lock);
+
+		if (va)
+			return va;
+	} while ((i = (i + 1) % nr_vmap_nodes) != j);
+
+	return NULL;
 }
 
 static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
 {
+	struct vmap_node *vn;
 	struct vmap_area *va;
+	int i, j;
 
-	spin_lock(&vmap_area_lock);
-	va = __find_vmap_area(addr, &vmap_area_root);
-	if (va)
-		unlink_va(va, &vmap_area_root);
-	spin_unlock(&vmap_area_lock);
+	i = j = addr_to_node_id(addr);
+	do {
+		vn = &vmap_nodes[i];
 
-	return va;
+		spin_lock(&vn->busy.lock);
+		va = __find_vmap_area(addr, &vn->busy.root);
+		if (va)
+			unlink_va(va, &vn->busy.root);
+		spin_unlock(&vn->busy.lock);
+
+		if (va)
+			return va;
+	} while ((i = (i + 1) % nr_vmap_nodes) != j);
+
+	return NULL;
 }
 
 /*** Per cpu kva allocator ***/
@@ -2092,6 +2163,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
 
 static void free_vmap_block(struct vmap_block *vb)
 {
+	struct vmap_node *vn;
 	struct vmap_block *tmp;
 	struct xarray *xa;
 
@@ -2099,9 +2171,10 @@ static void free_vmap_block(struct vmap_block *vb)
 	tmp = xa_erase(xa, addr_to_vb_idx(vb->va->va_start));
 	BUG_ON(tmp != vb);
 
-	spin_lock(&vmap_area_lock);
-	unlink_va(vb->va, &vmap_area_root);
-	spin_unlock(&vmap_area_lock);
+	vn = addr_to_node(vb->va->va_start);
+	spin_lock(&vn->busy.lock);
+	unlink_va(vb->va, &vn->busy.root);
+	spin_unlock(&vn->busy.lock);
 
 	free_vmap_area_noflush(vb->va);
 	kfree_rcu(vb, rcu_head);
@@ -2525,9 +2598,11 @@ static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
 static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
 			      unsigned long flags, const void *caller)
 {
-	spin_lock(&vmap_area_lock);
+	struct vmap_node *vn = addr_to_node(va->va_start);
+
+	spin_lock(&vn->busy.lock);
 	setup_vmalloc_vm_locked(vm, va, flags, caller);
-	spin_unlock(&vmap_area_lock);
+	spin_unlock(&vn->busy.lock);
 }
 
 static void clear_vm_uninitialized_flag(struct vm_struct *vm)
@@ -3715,6 +3790,7 @@ static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr,
  */
 long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
 {
+	struct vmap_node *vn;
 	struct vmap_area *va;
 	struct vm_struct *vm;
 	char *vaddr;
@@ -3728,8 +3804,11 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
 
 	remains = count;
 
-	spin_lock(&vmap_area_lock);
-	va = find_vmap_area_exceed_addr((unsigned long)addr);
+	/* Hooked to node_0 so far. */
+	vn = addr_to_node(0);
+	spin_lock(&vn->busy.lock);
+
+	va = find_vmap_area_exceed_addr((unsigned long)addr, &vn->busy.root);
 	if (!va)
 		goto finished_zero;
 
@@ -3737,7 +3816,7 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
 	if ((unsigned long)addr + remains <= va->va_start)
 		goto finished_zero;
 
-	list_for_each_entry_from(va, &vmap_area_list, list) {
+	list_for_each_entry_from(va, &vn->busy.head, list) {
 		size_t copied;
 
 		if (remains == 0)
@@ -3796,12 +3875,12 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
 	}
 
 finished_zero:
-	spin_unlock(&vmap_area_lock);
+	spin_unlock(&vn->busy.lock);
 	/* zero-fill memory holes */
 	return count - remains + zero_iter(iter, remains);
 finished:
 	/* Nothing remains, or We couldn't copy/zero everything. */
-	spin_unlock(&vmap_area_lock);
+	spin_unlock(&vn->busy.lock);
 
 	return count - remains;
 }
@@ -4135,14 +4214,15 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 	}
 
 	/* insert all vm's */
-	spin_lock(&vmap_area_lock);
 	for (area = 0; area < nr_vms; area++) {
-		insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);
+		struct vmap_node *vn = addr_to_node(vas[area]->va_start);
 
+		spin_lock(&vn->busy.lock);
+		insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head);
 		setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
 				 pcpu_get_vm_areas);
+		spin_unlock(&vn->busy.lock);
 	}
-	spin_unlock(&vmap_area_lock);
 
 	/*
 	 * Mark allocated areas as accessible. Do it now as a best-effort
@@ -4253,55 +4333,57 @@ bool vmalloc_dump_obj(void *object)
 {
 	void *objp = (void *)PAGE_ALIGN((unsigned long)object);
 	const void *caller;
-	struct vm_struct *vm;
 	struct vmap_area *va;
+	struct vmap_node *vn;
 	unsigned long addr;
 	unsigned int nr_pages;
+	bool success = false;
 
-	if (!spin_trylock(&vmap_area_lock))
-		return false;
-	va = __find_vmap_area((unsigned long)objp, &vmap_area_root);
-	if (!va) {
-		spin_unlock(&vmap_area_lock);
-		return false;
-	}
+	vn = addr_to_node((unsigned long)objp);
 
-	vm = va->vm;
-	if (!vm) {
-		spin_unlock(&vmap_area_lock);
-		return false;
+	if (spin_trylock(&vn->busy.lock)) {
+		va = __find_vmap_area(addr, &vn->busy.root);
+
+		if (va && va->vm) {
+			addr = (unsigned long)va->vm->addr;
+			caller = va->vm->caller;
+			nr_pages = va->vm->nr_pages;
+			success = true;
+		}
+
+		spin_unlock(&vn->busy.lock);
 	}
-	addr = (unsigned long)vm->addr;
-	caller = vm->caller;
-	nr_pages = vm->nr_pages;
-	spin_unlock(&vmap_area_lock);
-	pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
-		nr_pages, addr, caller);
-	return true;
+
+	if (success)
+		pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
+			nr_pages, addr, caller);
+
+	return success;
 }
 #endif
 
 #ifdef CONFIG_PROC_FS
 static void *s_start(struct seq_file *m, loff_t *pos)
-	__acquires(&vmap_purge_lock)
-	__acquires(&vmap_area_lock)
 {
+	struct vmap_node *vn = addr_to_node(0);
+
 	mutex_lock(&vmap_purge_lock);
-	spin_lock(&vmap_area_lock);
+	spin_lock(&vn->busy.lock);
 
-	return seq_list_start(&vmap_area_list, *pos);
+	return seq_list_start(&vn->busy.head, *pos);
 }
 
 static void *s_next(struct seq_file *m, void *p, loff_t *pos)
 {
-	return seq_list_next(p, &vmap_area_list, pos);
+	struct vmap_node *vn = addr_to_node(0);
+	return seq_list_next(p, &vn->busy.head, pos);
 }
 
 static void s_stop(struct seq_file *m, void *p)
-	__releases(&vmap_area_lock)
-	__releases(&vmap_purge_lock)
 {
-	spin_unlock(&vmap_area_lock);
+	struct vmap_node *vn = addr_to_node(0);
+
+	spin_unlock(&vn->busy.lock);
 	mutex_unlock(&vmap_purge_lock);
 }
 
@@ -4344,9 +4426,11 @@ static void show_purge_info(struct seq_file *m)
 
 static int s_show(struct seq_file *m, void *p)
 {
+	struct vmap_node *vn;
 	struct vmap_area *va;
 	struct vm_struct *v;
 
+	vn = addr_to_node(0);
 	va = list_entry(p, struct vmap_area, list);
 
 	if (!va->vm) {
@@ -4397,7 +4481,7 @@ static int s_show(struct seq_file *m, void *p)
 	 * As a final step, dump "unpurged" areas.
 	 */
 final:
-	if (list_is_last(&va->list, &vmap_area_list))
+	if (list_is_last(&va->list, &vn->busy.head))
 		show_purge_info(m);
 
 	return 0;
@@ -4428,7 +4512,8 @@ static void vmap_init_free_space(void)
 {
 	unsigned long vmap_start = 1;
 	const unsigned long vmap_end = ULONG_MAX;
-	struct vmap_area *busy, *free;
+	struct vmap_area *free;
+	struct vm_struct *busy;
 
 	/*
 	 *     B     F     B     B     B     F
@@ -4436,12 +4521,12 @@ static void vmap_init_free_space(void)
 	 *  |           The KVA space           |
 	 *  |<--------------------------------->|
 	 */
-	list_for_each_entry(busy, &vmap_area_list, list) {
-		if (busy->va_start - vmap_start > 0) {
+	for (busy = vmlist; busy; busy = busy->next) {
+		if ((unsigned long) busy->addr - vmap_start > 0) {
 			free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
 			if (!WARN_ON_ONCE(!free)) {
 				free->va_start = vmap_start;
-				free->va_end = busy->va_start;
+				free->va_end = (unsigned long) busy->addr;
 
 				insert_vmap_area_augment(free, NULL,
 					&free_vmap_area_root,
@@ -4449,7 +4534,7 @@ static void vmap_init_free_space(void)
 			}
 		}
 
-		vmap_start = busy->va_end;
+		vmap_start = (unsigned long) busy->addr + busy->size;
 	}
 
 	if (vmap_end - vmap_start > 0) {
@@ -4465,9 +4550,23 @@ static void vmap_init_free_space(void)
 	}
 }
 
+static void vmap_init_nodes(void)
+{
+	struct vmap_node *vn;
+	int i;
+
+	for (i = 0; i < nr_vmap_nodes; i++) {
+		vn = &vmap_nodes[i];
+		vn->busy.root = RB_ROOT;
+		INIT_LIST_HEAD(&vn->busy.head);
+		spin_lock_init(&vn->busy.lock);
+	}
+}
+
 void __init vmalloc_init(void)
 {
 	struct vmap_area *va;
+	struct vmap_node *vn;
 	struct vm_struct *tmp;
 	int i;
 
@@ -4489,6 +4588,11 @@ void __init vmalloc_init(void)
 		xa_init(&vbq->vmap_blocks);
 	}
 
+	/*
+	 * Setup nodes before importing vmlist.
+	 */
+	vmap_init_nodes();
+
 	/* Import existing vmlist entries. */
 	for (tmp = vmlist; tmp; tmp = tmp->next) {
 		va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
@@ -4498,7 +4602,9 @@ void __init vmalloc_init(void)
 		va->va_start = (unsigned long)tmp->addr;
 		va->va_end = va->va_start + tmp->size;
 		va->vm = tmp;
-		insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+
+		vn = addr_to_node(va->va_start);
+		insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
 	}
 
 	/*

From 2d6b6cc380b9a922b651e5c93a98ef4f68c0b429 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Thu, 11 Jan 2024 14:26:28 +0100
Subject: [PATCH 422/707] mm: vmalloc: mark vmap_init_free_space() with __init
 tag

vmap_init_free_space() is called only once therefore tag it with __init.
Apart of that it access the "vmlist" variable that is located in
".init.data" section.

Link: https://lkml.kernel.org/r/20240111132628.299644-1-urezki@gmail.com
Fixes: 86817057732a ("mm: vmalloc: remove global vmap_area_root rb-tree")
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202401112056.I41bELL4-lkp@intel.com/
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Lorenzo Stoakes <lstoakes@gmail.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sony.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/vmalloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 786ecb18ae228b..666ea8a379f6bb 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -4508,7 +4508,7 @@ module_init(proc_vmalloc_init);
 
 #endif
 
-static void vmap_init_free_space(void)
+static void __init vmap_init_free_space(void)
 {
 	unsigned long vmap_start = 1;
 	const unsigned long vmap_end = ULONG_MAX;

From 60eb705ddffe5673b9fb819338aa50ee779b91c4 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Thu, 11 Jan 2024 13:11:04 +0100
Subject: [PATCH 423/707] fix a wrong value passed to __find_vmap_area()

There was a typo in the vmalloc_dump_obj() function.  Instead of passing a
real address which is "objp" an "addr" was used what is wrong and not
initialized.

Link: https://lkml.kernel.org/r/20240111121104.180993-1-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Reported-by: kernel test robot <lkp@intel.com>
Fixes: 86817057732a ("mm: vmalloc: remove global vmap_area_root rb-tree")
Closes: https://lore.kernel.org/oe-kbuild-all/202401111810.TKPIXLCs-lkp@intel.com/
Cc: Baoquan He <bhe@redhat.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sony.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/vmalloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 666ea8a379f6bb..86efebf0e0c8a7 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -4342,7 +4342,7 @@ bool vmalloc_dump_obj(void *object)
 	vn = addr_to_node((unsigned long)objp);
 
 	if (spin_trylock(&vn->busy.lock)) {
-		va = __find_vmap_area(addr, &vn->busy.root);
+		va = __find_vmap_area((unsigned long)objp, &vn->busy.root);
 
 		if (va && va->vm) {
 			addr = (unsigned long)va->vm->addr;

From 697948bd0325e89b3d31d2074bcdcd3868c137a1 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Tue, 2 Jan 2024 19:46:27 +0100
Subject: [PATCH 424/707] mm/vmalloc: remove vmap_area_list

Earlier, vmap_area_list is exported to vmcoreinfo so that makedumpfile get
the base address of vmalloc area.  Now, vmap_area_list is empty, so export
VMALLOC_START to vmcoreinfo instead, and remove vmap_area_list.

Link: https://lkml.kernel.org/r/20240102184633.748113-6-urezki@gmail.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Acked-by: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Kazuhito Hagio <k-hagio-ab@nec.com>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sony.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/kdump/vmcoreinfo.rst | 8 ++++----
 arch/arm64/kernel/crash_core.c                 | 1 -
 arch/riscv/kernel/crash_core.c                 | 1 -
 include/linux/vmalloc.h                        | 1 -
 kernel/crash_core.c                            | 4 +---
 kernel/kallsyms_selftest.c                     | 1 -
 mm/nommu.c                                     | 2 --
 mm/vmalloc.c                                   | 2 --
 8 files changed, 5 insertions(+), 15 deletions(-)

diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst
index bced9e4b6e0899..0f714fc945acf4 100644
--- a/Documentation/admin-guide/kdump/vmcoreinfo.rst
+++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst
@@ -65,11 +65,11 @@ Defines the beginning of the text section. In general, _stext indicates
 the kernel start address. Used to convert a virtual address from the
 direct kernel map to a physical address.
 
-vmap_area_list
---------------
+VMALLOC_START
+-------------
 
-Stores the virtual area list. makedumpfile gets the vmalloc start value
-from this variable and its value is necessary for vmalloc translation.
+Stores the base address of vmalloc area. makedumpfile gets this value
+since is necessary for vmalloc translation.
 
 mem_map
 -------
diff --git a/arch/arm64/kernel/crash_core.c b/arch/arm64/kernel/crash_core.c
index 66cde752cd7409..2a24199a9b81e0 100644
--- a/arch/arm64/kernel/crash_core.c
+++ b/arch/arm64/kernel/crash_core.c
@@ -23,7 +23,6 @@ void arch_crash_save_vmcoreinfo(void)
 	/* Please note VMCOREINFO_NUMBER() uses "%d", not "%x" */
 	vmcoreinfo_append_str("NUMBER(MODULES_VADDR)=0x%lx\n", MODULES_VADDR);
 	vmcoreinfo_append_str("NUMBER(MODULES_END)=0x%lx\n", MODULES_END);
-	vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START);
 	vmcoreinfo_append_str("NUMBER(VMALLOC_END)=0x%lx\n", VMALLOC_END);
 	vmcoreinfo_append_str("NUMBER(VMEMMAP_START)=0x%lx\n", VMEMMAP_START);
 	vmcoreinfo_append_str("NUMBER(VMEMMAP_END)=0x%lx\n", VMEMMAP_END);
diff --git a/arch/riscv/kernel/crash_core.c b/arch/riscv/kernel/crash_core.c
index 8706736fd4e2dc..d18d529fd9b984 100644
--- a/arch/riscv/kernel/crash_core.c
+++ b/arch/riscv/kernel/crash_core.c
@@ -8,7 +8,6 @@ void arch_crash_save_vmcoreinfo(void)
 	VMCOREINFO_NUMBER(phys_ram_base);
 
 	vmcoreinfo_append_str("NUMBER(PAGE_OFFSET)=0x%lx\n", PAGE_OFFSET);
-	vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START);
 	vmcoreinfo_append_str("NUMBER(VMALLOC_END)=0x%lx\n", VMALLOC_END);
 #ifdef CONFIG_MMU
 	VMCOREINFO_NUMBER(VA_BITS);
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index c720be70c8ddde..91810b4e95107b 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -253,7 +253,6 @@ extern long vread_iter(struct iov_iter *iter, const char *addr, size_t count);
 /*
  *	Internals.  Don't use..
  */
-extern struct list_head vmap_area_list;
 extern __init void vm_area_add_early(struct vm_struct *vm);
 extern __init void vm_area_register_early(struct vm_struct *vm, size_t align);
 
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 75cd6a736d0306..b60de490c1fccb 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -748,7 +748,7 @@ static int __init crash_save_vmcoreinfo_init(void)
 	VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir);
 #endif
 	VMCOREINFO_SYMBOL(_stext);
-	VMCOREINFO_SYMBOL(vmap_area_list);
+	vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START);
 
 #ifndef CONFIG_NUMA
 	VMCOREINFO_SYMBOL(mem_map);
@@ -789,8 +789,6 @@ static int __init crash_save_vmcoreinfo_init(void)
 	VMCOREINFO_OFFSET(free_area, free_list);
 	VMCOREINFO_OFFSET(list_head, next);
 	VMCOREINFO_OFFSET(list_head, prev);
-	VMCOREINFO_OFFSET(vmap_area, va_start);
-	VMCOREINFO_OFFSET(vmap_area, list);
 	VMCOREINFO_LENGTH(zone.free_area, NR_PAGE_ORDERS);
 	log_buf_vmcoreinfo_setup();
 	VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c
index b4cac76ea5e989..8a689b4ff4f982 100644
--- a/kernel/kallsyms_selftest.c
+++ b/kernel/kallsyms_selftest.c
@@ -89,7 +89,6 @@ static struct test_item test_items[] = {
 	ITEM_DATA(kallsyms_test_var_data_static),
 	ITEM_DATA(kallsyms_test_var_bss),
 	ITEM_DATA(kallsyms_test_var_data),
-	ITEM_DATA(vmap_area_list),
 #endif
 };
 
diff --git a/mm/nommu.c b/mm/nommu.c
index b6dc558d314408..5ec8f44e7ce976 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -131,8 +131,6 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address,
 }
 EXPORT_SYMBOL(follow_pfn);
 
-LIST_HEAD(vmap_area_list);
-
 void vfree(const void *addr)
 {
 	kfree(addr);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 86efebf0e0c8a7..b5882790da0088 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -729,8 +729,6 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
 
 
 static DEFINE_SPINLOCK(free_vmap_area_lock);
-/* Export for kexec only */
-LIST_HEAD(vmap_area_list);
 static bool vmap_initialized __read_mostly;
 
 static struct rb_root purge_vmap_area_root = RB_ROOT;

From 5c39e51e42185e7f4a5bf2c65ad5886202fafb30 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Thu, 11 Jan 2024 20:23:29 +0100
Subject: [PATCH 425/707] mm: vmalloc: Fix a warning in the
 crash_save_vmcoreinfo_init()

The vmcoreinfo_append_str() function expects "long unsigned int" type as a
second argument(0x%lx) to print a beginning of vmalloc start address which
is defined as a VMALLOC_START macro.

For some architectures it can be considered as "int" type, for example m68
generates a compile warning message.  To fix it cast a second argument to
"unsigned long".

Link: https://lkml.kernel.org/r/20240111192329.449189-1-urezki@gmail.com
Fixes: 9bdb180b2db6 ("mm/vmalloc: remove vmap_area_list")
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202401120218.y469Puyf-lkp@intel.com/
Acked-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Lorenzo Stoakes <lstoakes@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/crash_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index b60de490c1fccb..49b31e59d3ccd1 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -748,7 +748,7 @@ static int __init crash_save_vmcoreinfo_init(void)
 	VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir);
 #endif
 	VMCOREINFO_SYMBOL(_stext);
-	vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START);
+	vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", (unsigned long) VMALLOC_START);
 
 #ifndef CONFIG_NUMA
 	VMCOREINFO_SYMBOL(mem_map);

From 431c0b3265072b7c8258abb137a154a15d5c1aab Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Tue, 2 Jan 2024 19:46:28 +0100
Subject: [PATCH 426/707] mm: vmalloc: remove global purge_vmap_area_root
 rb-tree

Similar to busy VA, lazily-freed area is stored to a node it belongs to.
Such approach does not require any global locking primitive, instead an
access becomes scalable what mitigates a contention.

This patch removes a global purge-lock, global purge-tree and global purge
list.

Link: https://lkml.kernel.org/r/20240102184633.748113-7-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Reviewed-by: Baoquan He <bhe@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Kazuhito Hagio <k-hagio-ab@nec.com>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sony.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/vmalloc.c | 135 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 82 insertions(+), 53 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index b5882790da0088..72822aeff55c22 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -731,10 +731,6 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
 static DEFINE_SPINLOCK(free_vmap_area_lock);
 static bool vmap_initialized __read_mostly;
 
-static struct rb_root purge_vmap_area_root = RB_ROOT;
-static LIST_HEAD(purge_vmap_area_list);
-static DEFINE_SPINLOCK(purge_vmap_area_lock);
-
 /*
  * This kmem_cache is used for vmap_area objects. Instead of
  * allocating from slab we reuse an object from this cache to
@@ -782,6 +778,12 @@ struct rb_list {
 static struct vmap_node {
 	/* Bookkeeping data of this node. */
 	struct rb_list busy;
+	struct rb_list lazy;
+
+	/*
+	 * Ready-to-free areas.
+	 */
+	struct list_head purge_list;
 } single;
 
 static struct vmap_node *vmap_nodes = &single;
@@ -1766,40 +1768,22 @@ static DEFINE_MUTEX(vmap_purge_lock);
 
 /* for per-CPU blocks */
 static void purge_fragmented_blocks_allcpus(void);
+static cpumask_t purge_nodes;
 
 /*
  * Purges all lazily-freed vmap areas.
  */
-static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
+static unsigned long
+purge_vmap_node(struct vmap_node *vn)
 {
-	unsigned long resched_threshold;
-	unsigned int num_purged_areas = 0;
-	struct list_head local_purge_list;
+	unsigned long num_purged_areas = 0;
 	struct vmap_area *va, *n_va;
 
-	lockdep_assert_held(&vmap_purge_lock);
-
-	spin_lock(&purge_vmap_area_lock);
-	purge_vmap_area_root = RB_ROOT;
-	list_replace_init(&purge_vmap_area_list, &local_purge_list);
-	spin_unlock(&purge_vmap_area_lock);
-
-	if (unlikely(list_empty(&local_purge_list)))
-		goto out;
-
-	start = min(start,
-		list_first_entry(&local_purge_list,
-			struct vmap_area, list)->va_start);
-
-	end = max(end,
-		list_last_entry(&local_purge_list,
-			struct vmap_area, list)->va_end);
-
-	flush_tlb_kernel_range(start, end);
-	resched_threshold = lazy_max_pages() << 1;
+	if (list_empty(&vn->purge_list))
+		return 0;
 
 	spin_lock(&free_vmap_area_lock);
-	list_for_each_entry_safe(va, n_va, &local_purge_list, list) {
+	list_for_each_entry_safe(va, n_va, &vn->purge_list, list) {
 		unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
 		unsigned long orig_start = va->va_start;
 		unsigned long orig_end = va->va_end;
@@ -1821,13 +1805,55 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
 
 		atomic_long_sub(nr, &vmap_lazy_nr);
 		num_purged_areas++;
-
-		if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
-			cond_resched_lock(&free_vmap_area_lock);
 	}
 	spin_unlock(&free_vmap_area_lock);
 
-out:
+	return num_purged_areas;
+}
+
+/*
+ * Purges all lazily-freed vmap areas.
+ */
+static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
+{
+	unsigned long num_purged_areas = 0;
+	struct vmap_node *vn;
+	int i;
+
+	lockdep_assert_held(&vmap_purge_lock);
+	purge_nodes = CPU_MASK_NONE;
+
+	for (i = 0; i < nr_vmap_nodes; i++) {
+		vn = &vmap_nodes[i];
+
+		INIT_LIST_HEAD(&vn->purge_list);
+
+		if (RB_EMPTY_ROOT(&vn->lazy.root))
+			continue;
+
+		spin_lock(&vn->lazy.lock);
+		WRITE_ONCE(vn->lazy.root.rb_node, NULL);
+		list_replace_init(&vn->lazy.head, &vn->purge_list);
+		spin_unlock(&vn->lazy.lock);
+
+		start = min(start, list_first_entry(&vn->purge_list,
+			struct vmap_area, list)->va_start);
+
+		end = max(end, list_last_entry(&vn->purge_list,
+			struct vmap_area, list)->va_end);
+
+		cpumask_set_cpu(i, &purge_nodes);
+	}
+
+	if (cpumask_weight(&purge_nodes) > 0) {
+		flush_tlb_kernel_range(start, end);
+
+		for_each_cpu(i, &purge_nodes) {
+			vn = &nodes[i];
+			num_purged_areas += purge_vmap_node(vn);
+		}
+	}
+
 	trace_purge_vmap_area_lazy(start, end, num_purged_areas);
 	return num_purged_areas > 0;
 }
@@ -1846,16 +1872,9 @@ static void reclaim_and_purge_vmap_areas(void)
 
 static void drain_vmap_area_work(struct work_struct *work)
 {
-	unsigned long nr_lazy;
-
-	do {
-		mutex_lock(&vmap_purge_lock);
-		__purge_vmap_area_lazy(ULONG_MAX, 0);
-		mutex_unlock(&vmap_purge_lock);
-
-		/* Recheck if further work is required. */
-		nr_lazy = atomic_long_read(&vmap_lazy_nr);
-	} while (nr_lazy > lazy_max_pages());
+	mutex_lock(&vmap_purge_lock);
+	__purge_vmap_area_lazy(ULONG_MAX, 0);
+	mutex_unlock(&vmap_purge_lock);
 }
 
 /*
@@ -1865,6 +1884,7 @@ static void drain_vmap_area_work(struct work_struct *work)
  */
 static void free_vmap_area_noflush(struct vmap_area *va)
 {
+	struct vmap_node *vn = addr_to_node(va->va_start);
 	unsigned long nr_lazy_max = lazy_max_pages();
 	unsigned long va_start = va->va_start;
 	unsigned long nr_lazy;
@@ -1878,10 +1898,9 @@ static void free_vmap_area_noflush(struct vmap_area *va)
 	/*
 	 * Merge or place it to the purge tree/list.
 	 */
-	spin_lock(&purge_vmap_area_lock);
-	merge_or_add_vmap_area(va,
-		&purge_vmap_area_root, &purge_vmap_area_list);
-	spin_unlock(&purge_vmap_area_lock);
+	spin_lock(&vn->lazy.lock);
+	merge_or_add_vmap_area(va, &vn->lazy.root, &vn->lazy.head);
+	spin_unlock(&vn->lazy.lock);
 
 	trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max);
 
@@ -4411,15 +4430,21 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
 
 static void show_purge_info(struct seq_file *m)
 {
+	struct vmap_node *vn;
 	struct vmap_area *va;
+	int i;
 
-	spin_lock(&purge_vmap_area_lock);
-	list_for_each_entry(va, &purge_vmap_area_list, list) {
-		seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
-			(void *)va->va_start, (void *)va->va_end,
-			va->va_end - va->va_start);
+	for (i = 0; i < nr_vmap_nodes; i++) {
+		vn = &vmap_nodes[i];
+
+		spin_lock(&vn->lazy.lock);
+		list_for_each_entry(va, &vn->lazy.head, list) {
+			seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
+				(void *)va->va_start, (void *)va->va_end,
+				va->va_end - va->va_start);
+		}
+		spin_unlock(&vn->lazy.lock);
 	}
-	spin_unlock(&purge_vmap_area_lock);
 }
 
 static int s_show(struct seq_file *m, void *p)
@@ -4558,6 +4583,10 @@ static void vmap_init_nodes(void)
 		vn->busy.root = RB_ROOT;
 		INIT_LIST_HEAD(&vn->busy.head);
 		spin_lock_init(&vn->busy.lock);
+
+		vn->lazy.root = RB_ROOT;
+		INIT_LIST_HEAD(&vn->lazy.head);
+		spin_lock_init(&vn->lazy.lock);
 	}
 }
 

From 7db166b4aa0d930d041ae9ff1ad1f706c608449e Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Tue, 2 Jan 2024 19:46:29 +0100
Subject: [PATCH 427/707] mm: vmalloc: offload free_vmap_area_lock lock

Concurrent access to a global vmap space is a bottle-neck.  We can
simulate a high contention by running a vmalloc test suite.

To address it, introduce an effective vmap node logic.  Each node behaves
as independent entity.  When a node is accessed it serves a request
directly(if possible) from its pool.

This model has a size based pool for requests, i.e.  pools are serialized
and populated based on object size and real demand.  A maximum object size
that pool can handle is set to 256 pages.

This technique reduces a pressure on the global vmap lock.

Link: https://lkml.kernel.org/r/20240102184633.748113-8-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Kazuhito Hagio <k-hagio-ab@nec.com>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sony.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/vmalloc.c | 387 +++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 342 insertions(+), 45 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 72822aeff55c22..e8b9621ea02b46 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -775,7 +775,22 @@ struct rb_list {
 	spinlock_t lock;
 };
 
+struct vmap_pool {
+	struct list_head head;
+	unsigned long len;
+};
+
+/*
+ * A fast size storage contains VAs up to 1M size.
+ */
+#define MAX_VA_SIZE_PAGES 256
+
 static struct vmap_node {
+	/* Simple size segregated storage. */
+	struct vmap_pool pool[MAX_VA_SIZE_PAGES];
+	spinlock_t pool_lock;
+	bool skip_populate;
+
 	/* Bookkeeping data of this node. */
 	struct rb_list busy;
 	struct rb_list lazy;
@@ -784,6 +799,8 @@ static struct vmap_node {
 	 * Ready-to-free areas.
 	 */
 	struct list_head purge_list;
+	struct work_struct purge_work;
+	unsigned long nr_purged;
 } single;
 
 static struct vmap_node *vmap_nodes = &single;
@@ -802,6 +819,61 @@ addr_to_node(unsigned long addr)
 	return &vmap_nodes[addr_to_node_id(addr)];
 }
 
+static inline struct vmap_node *
+id_to_node(unsigned int id)
+{
+	return &vmap_nodes[id % nr_vmap_nodes];
+}
+
+/*
+ * We use the value 0 to represent "no node", that is why
+ * an encoded value will be the node-id incremented by 1.
+ * It is always greater then 0. A valid node_id which can
+ * be encoded is [0:nr_vmap_nodes - 1]. If a passed node_id
+ * is not valid 0 is returned.
+ */
+static unsigned int
+encode_vn_id(unsigned int node_id)
+{
+	/* Can store U8_MAX [0:254] nodes. */
+	if (node_id < nr_vmap_nodes)
+		return (node_id + 1) << BITS_PER_BYTE;
+
+	/* Warn and no node encoded. */
+	WARN_ONCE(1, "Encode wrong node id (%u)\n", node_id);
+	return 0;
+}
+
+/*
+ * Returns an encoded node-id, the valid range is within
+ * [0:nr_vmap_nodes-1] values. Otherwise nr_vmap_nodes is
+ * returned if extracted data is wrong.
+ */
+static unsigned int
+decode_vn_id(unsigned int val)
+{
+	unsigned int node_id = (val >> BITS_PER_BYTE) - 1;
+
+	/* Can store U8_MAX [0:254] nodes. */
+	if (node_id < nr_vmap_nodes)
+		return node_id;
+
+	/* If it was _not_ zero, warn. */
+	WARN_ONCE(node_id != UINT_MAX,
+		"Decode wrong node id (%d)\n", node_id);
+
+	return nr_vmap_nodes;
+}
+
+static bool
+is_vn_id_valid(unsigned int node_id)
+{
+	if (node_id < nr_vmap_nodes)
+		return true;
+
+	return false;
+}
+
 static __always_inline unsigned long
 va_size(struct vmap_area *va)
 {
@@ -1623,6 +1695,104 @@ preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
 		kmem_cache_free(vmap_area_cachep, va);
 }
 
+static struct vmap_pool *
+size_to_va_pool(struct vmap_node *vn, unsigned long size)
+{
+	unsigned int idx = (size - 1) / PAGE_SIZE;
+
+	if (idx < MAX_VA_SIZE_PAGES)
+		return &vn->pool[idx];
+
+	return NULL;
+}
+
+static bool
+node_pool_add_va(struct vmap_node *n, struct vmap_area *va)
+{
+	struct vmap_pool *vp;
+
+	vp = size_to_va_pool(n, va_size(va));
+	if (!vp)
+		return false;
+
+	spin_lock(&n->pool_lock);
+	list_add(&va->list, &vp->head);
+	WRITE_ONCE(vp->len, vp->len + 1);
+	spin_unlock(&n->pool_lock);
+
+	return true;
+}
+
+static struct vmap_area *
+node_pool_del_va(struct vmap_node *vn, unsigned long size,
+		unsigned long align, unsigned long vstart,
+		unsigned long vend)
+{
+	struct vmap_area *va = NULL;
+	struct vmap_pool *vp;
+	int err = 0;
+
+	vp = size_to_va_pool(vn, size);
+	if (!vp || list_empty(&vp->head))
+		return NULL;
+
+	spin_lock(&vn->pool_lock);
+	if (!list_empty(&vp->head)) {
+		va = list_first_entry(&vp->head, struct vmap_area, list);
+
+		if (IS_ALIGNED(va->va_start, align)) {
+			/*
+			 * Do some sanity check and emit a warning
+			 * if one of below checks detects an error.
+			 */
+			err |= (va_size(va) != size);
+			err |= (va->va_start < vstart);
+			err |= (va->va_end > vend);
+
+			if (!WARN_ON_ONCE(err)) {
+				list_del_init(&va->list);
+				WRITE_ONCE(vp->len, vp->len - 1);
+			} else {
+				va = NULL;
+			}
+		} else {
+			list_move_tail(&va->list, &vp->head);
+			va = NULL;
+		}
+	}
+	spin_unlock(&vn->pool_lock);
+
+	return va;
+}
+
+static struct vmap_area *
+node_alloc(unsigned long size, unsigned long align,
+		unsigned long vstart, unsigned long vend,
+		unsigned long *addr, unsigned int *vn_id)
+{
+	struct vmap_area *va;
+
+	*vn_id = 0;
+	*addr = vend;
+
+	/*
+	 * Fallback to a global heap if not vmalloc or there
+	 * is only one node.
+	 */
+	if (vstart != VMALLOC_START || vend != VMALLOC_END ||
+			nr_vmap_nodes == 1)
+		return NULL;
+
+	*vn_id = raw_smp_processor_id() % nr_vmap_nodes;
+	va = node_pool_del_va(id_to_node(*vn_id), size, align, vstart, vend);
+	*vn_id = encode_vn_id(*vn_id);
+
+	if (va)
+		*addr = va->va_start;
+
+	return va;
+}
+
 /*
  * Allocate a region of KVA of the specified size and alignment, within the
  * vstart and vend.
@@ -1637,6 +1807,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 	struct vmap_area *va;
 	unsigned long freed;
 	unsigned long addr;
+	unsigned int vn_id;
 	int purged = 0;
 	int ret;
 
@@ -1647,11 +1818,23 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 		return ERR_PTR(-EBUSY);
 
 	might_sleep();
-	gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
 
-	va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
-	if (unlikely(!va))
-		return ERR_PTR(-ENOMEM);
+	/*
+	 * If a VA is obtained from a global heap(if it fails here)
+	 * it is anyway marked with this "vn_id" so it is returned
+	 * to this pool's node later. Such way gives a possibility
+	 * to populate pools based on users demand.
+	 *
+	 * On success a ready to go VA is returned.
+	 */
+	va = node_alloc(size, align, vstart, vend, &addr, &vn_id);
+	if (!va) {
+		gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
+
+		va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
+		if (unlikely(!va))
+			return ERR_PTR(-ENOMEM);
+	}
 
 	/*
 	 * Only scan the relevant parts containing pointers to other objects
@@ -1660,10 +1843,12 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 	kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
 
 retry:
-	preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
-	addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list,
-		size, align, vstart, vend);
-	spin_unlock(&free_vmap_area_lock);
+	if (addr == vend) {
+		preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
+		addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list,
+			size, align, vstart, vend);
+		spin_unlock(&free_vmap_area_lock);
+	}
 
 	trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend);
 
@@ -1677,7 +1862,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 	va->va_start = addr;
 	va->va_end = addr + size;
 	va->vm = NULL;
-	va->flags = va_flags;
+	va->flags = (va_flags | vn_id);
 
 	vn = addr_to_node(va->va_start);
 
@@ -1770,63 +1955,135 @@ static DEFINE_MUTEX(vmap_purge_lock);
 static void purge_fragmented_blocks_allcpus(void);
 static cpumask_t purge_nodes;
 
-/*
- * Purges all lazily-freed vmap areas.
- */
-static unsigned long
-purge_vmap_node(struct vmap_node *vn)
+static void
+reclaim_list_global(struct list_head *head)
 {
-	unsigned long num_purged_areas = 0;
-	struct vmap_area *va, *n_va;
+	struct vmap_area *va, *n;
 
-	if (list_empty(&vn->purge_list))
-		return 0;
+	if (list_empty(head))
+		return;
 
 	spin_lock(&free_vmap_area_lock);
+	list_for_each_entry_safe(va, n, head, list)
+		merge_or_add_vmap_area_augment(va,
+			&free_vmap_area_root, &free_vmap_area_list);
+	spin_unlock(&free_vmap_area_lock);
+}
+
+static void
+decay_va_pool_node(struct vmap_node *vn, bool full_decay)
+{
+	struct vmap_area *va, *nva;
+	struct list_head decay_list;
+	struct rb_root decay_root;
+	unsigned long n_decay;
+	int i;
+
+	decay_root = RB_ROOT;
+	INIT_LIST_HEAD(&decay_list);
+
+	for (i = 0; i < MAX_VA_SIZE_PAGES; i++) {
+		struct list_head tmp_list;
+
+		if (list_empty(&vn->pool[i].head))
+			continue;
+
+		INIT_LIST_HEAD(&tmp_list);
+
+		/* Detach the pool, so no-one can access it. */
+		spin_lock(&vn->pool_lock);
+		list_replace_init(&vn->pool[i].head, &tmp_list);
+		spin_unlock(&vn->pool_lock);
+
+		if (full_decay)
+			WRITE_ONCE(vn->pool[i].len, 0);
+
+		/* Decay a pool by ~25% out of left objects. */
+		n_decay = vn->pool[i].len >> 2;
+
+		list_for_each_entry_safe(va, nva, &tmp_list, list) {
+			list_del_init(&va->list);
+			merge_or_add_vmap_area(va, &decay_root, &decay_list);
+
+			if (!full_decay) {
+				WRITE_ONCE(vn->pool[i].len, vn->pool[i].len - 1);
+
+				if (!--n_decay)
+					break;
+			}
+		}
+
+		/* Attach the pool back if it has been partly decayed. */
+		if (!full_decay && !list_empty(&tmp_list)) {
+			spin_lock(&vn->pool_lock);
+			list_replace_init(&tmp_list, &vn->pool[i].head);
+			spin_unlock(&vn->pool_lock);
+		}
+	}
+
+	reclaim_list_global(&decay_list);
+}
+
+static void purge_vmap_node(struct work_struct *work)
+{
+	struct vmap_node *vn = container_of(work,
+		struct vmap_node, purge_work);
+	struct vmap_area *va, *n_va;
+	LIST_HEAD(local_list);
+
+	vn->nr_purged = 0;
+
 	list_for_each_entry_safe(va, n_va, &vn->purge_list, list) {
 		unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
 		unsigned long orig_start = va->va_start;
 		unsigned long orig_end = va->va_end;
+		unsigned int vn_id = decode_vn_id(va->flags);
 
-		/*
-		 * Finally insert or merge lazily-freed area. It is
-		 * detached and there is no need to "unlink" it from
-		 * anything.
-		 */
-		va = merge_or_add_vmap_area_augment(va, &free_vmap_area_root,
-				&free_vmap_area_list);
-
-		if (!va)
-			continue;
+		list_del_init(&va->list);
 
 		if (is_vmalloc_or_module_addr((void *)orig_start))
 			kasan_release_vmalloc(orig_start, orig_end,
 					      va->va_start, va->va_end);
 
 		atomic_long_sub(nr, &vmap_lazy_nr);
-		num_purged_areas++;
+		vn->nr_purged++;
+
+		if (is_vn_id_valid(vn_id) && !vn->skip_populate)
+			if (node_pool_add_va(vn, va))
+				continue;
+
+		/* Go back to global. */
+		list_add(&va->list, &local_list);
 	}
-	spin_unlock(&free_vmap_area_lock);
 
-	return num_purged_areas;
+	reclaim_list_global(&local_list);
 }
 
 /*
  * Purges all lazily-freed vmap areas.
  */
-static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
+static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
+		bool full_pool_decay)
 {
-	unsigned long num_purged_areas = 0;
+	unsigned long nr_purged_areas = 0;
+	unsigned int nr_purge_helpers;
+	unsigned int nr_purge_nodes;
 	struct vmap_node *vn;
 	int i;
 
 	lockdep_assert_held(&vmap_purge_lock);
+
+	/*
+	 * Use cpumask to mark which node has to be processed.
+	 */
 	purge_nodes = CPU_MASK_NONE;
 
 	for (i = 0; i < nr_vmap_nodes; i++) {
 		vn = &vmap_nodes[i];
 
 		INIT_LIST_HEAD(&vn->purge_list);
+		vn->skip_populate = full_pool_decay;
+		decay_va_pool_node(vn, full_pool_decay);
 
 		if (RB_EMPTY_ROOT(&vn->lazy.root))
 			continue;
@@ -1845,17 +2102,45 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
 		cpumask_set_cpu(i, &purge_nodes);
 	}
 
-	if (cpumask_weight(&purge_nodes) > 0) {
+	nr_purge_nodes = cpumask_weight(&purge_nodes);
+	if (nr_purge_nodes > 0) {
 		flush_tlb_kernel_range(start, end);
 
+		/* One extra worker is per a lazy_max_pages() full set minus one. */
+		nr_purge_helpers = atomic_long_read(&vmap_lazy_nr) / lazy_max_pages();
+		nr_purge_helpers = clamp(nr_purge_helpers, 1U, nr_purge_nodes) - 1;
+
 		for_each_cpu(i, &purge_nodes) {
-			vn = &nodes[i];
-			num_purged_areas += purge_vmap_node(vn);
+			vn = &vmap_nodes[i];
+
+			if (nr_purge_helpers > 0) {
+				INIT_WORK(&vn->purge_work, purge_vmap_node);
+
+				if (cpumask_test_cpu(i, cpu_online_mask))
+					schedule_work_on(i, &vn->purge_work);
+				else
+					schedule_work(&vn->purge_work);
+
+				nr_purge_helpers--;
+			} else {
+				vn->purge_work.func = NULL;
+				purge_vmap_node(&vn->purge_work);
+				nr_purged_areas += vn->nr_purged;
+			}
+		}
+
+		for_each_cpu(i, &purge_nodes) {
+			vn = &vmap_nodes[i];
+
+			if (vn->purge_work.func) {
+				flush_work(&vn->purge_work);
+				nr_purged_areas += vn->nr_purged;
+			}
 		}
 	}
 
-	trace_purge_vmap_area_lazy(start, end, num_purged_areas);
-	return num_purged_areas > 0;
+	trace_purge_vmap_area_lazy(start, end, nr_purged_areas);
+	return nr_purged_areas > 0;
 }
 
 /*
@@ -1866,14 +2151,14 @@ static void reclaim_and_purge_vmap_areas(void)
 {
 	mutex_lock(&vmap_purge_lock);
 	purge_fragmented_blocks_allcpus();
-	__purge_vmap_area_lazy(ULONG_MAX, 0);
+	__purge_vmap_area_lazy(ULONG_MAX, 0, true);
 	mutex_unlock(&vmap_purge_lock);
 }
 
 static void drain_vmap_area_work(struct work_struct *work)
 {
 	mutex_lock(&vmap_purge_lock);
-	__purge_vmap_area_lazy(ULONG_MAX, 0);
+	__purge_vmap_area_lazy(ULONG_MAX, 0, false);
 	mutex_unlock(&vmap_purge_lock);
 }
 
@@ -1884,9 +2169,10 @@ static void drain_vmap_area_work(struct work_struct *work)
  */
 static void free_vmap_area_noflush(struct vmap_area *va)
 {
-	struct vmap_node *vn = addr_to_node(va->va_start);
 	unsigned long nr_lazy_max = lazy_max_pages();
 	unsigned long va_start = va->va_start;
+	unsigned int vn_id = decode_vn_id(va->flags);
+	struct vmap_node *vn;
 	unsigned long nr_lazy;
 
 	if (WARN_ON_ONCE(!list_empty(&va->list)))
@@ -1896,10 +2182,14 @@ static void free_vmap_area_noflush(struct vmap_area *va)
 				PAGE_SHIFT, &vmap_lazy_nr);
 
 	/*
-	 * Merge or place it to the purge tree/list.
+	 * If it was request by a certain node we would like to
+	 * return it to that node, i.e. its pool for later reuse.
 	 */
+	vn = is_vn_id_valid(vn_id) ?
+		id_to_node(vn_id):addr_to_node(va->va_start);
+
 	spin_lock(&vn->lazy.lock);
-	merge_or_add_vmap_area(va, &vn->lazy.root, &vn->lazy.head);
+	insert_vmap_area(va, &vn->lazy.root, &vn->lazy.head);
 	spin_unlock(&vn->lazy.lock);
 
 	trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max);
@@ -2408,7 +2698,7 @@ static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
 	}
 	free_purged_blocks(&purge_list);
 
-	if (!__purge_vmap_area_lazy(start, end) && flush)
+	if (!__purge_vmap_area_lazy(start, end, false) && flush)
 		flush_tlb_kernel_range(start, end);
 	mutex_unlock(&vmap_purge_lock);
 }
@@ -4576,7 +4866,7 @@ static void __init vmap_init_free_space(void)
 static void vmap_init_nodes(void)
 {
 	struct vmap_node *vn;
-	int i;
+	int i, j;
 
 	for (i = 0; i < nr_vmap_nodes; i++) {
 		vn = &vmap_nodes[i];
@@ -4587,6 +4877,13 @@ static void vmap_init_nodes(void)
 		vn->lazy.root = RB_ROOT;
 		INIT_LIST_HEAD(&vn->lazy.head);
 		spin_lock_init(&vn->lazy.lock);
+
+		for (j = 0; j < MAX_VA_SIZE_PAGES; j++) {
+			INIT_LIST_HEAD(&vn->pool[j].head);
+			WRITE_ONCE(vn->pool[j].len, 0);
+		}
+
+		spin_lock_init(&vn->pool_lock);
 	}
 }
 

From d77b8e651af5694bfbcc8c9a10b24f1bbb1410ed Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Tue, 2 Jan 2024 19:46:30 +0100
Subject: [PATCH 428/707] mm: vmalloc: support multiple nodes in vread_iter

Extend the vread_iter() to be able to perform a sequential reading of VAs
which are spread among multiple nodes.  So a data read over the /dev/kmem
correctly reflects a vmalloc memory layout.

Link: https://lkml.kernel.org/r/20240102184633.748113-9-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Reviewed-by: Baoquan He <bhe@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Kazuhito Hagio <k-hagio-ab@nec.com>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sony.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/vmalloc.c | 67 +++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 53 insertions(+), 14 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e8b9621ea02b46..8b0cad0e2aefa7 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -906,7 +906,7 @@ unsigned long vmalloc_nr_pages(void)
 
 /* Look up the first VA which satisfies addr < va_end, NULL if none. */
 static struct vmap_area *
-find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root)
+__find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root)
 {
 	struct vmap_area *va = NULL;
 	struct rb_node *n = root->rb_node;
@@ -930,6 +930,41 @@ find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root)
 	return va;
 }
 
+/*
+ * Returns a node where a first VA, that satisfies addr < va_end, resides.
+ * If success, a node is locked. A user is responsible to unlock it when a
+ * VA is no longer needed to be accessed.
+ *
+ * Returns NULL if nothing found.
+ */
+static struct vmap_node *
+find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va)
+{
+	struct vmap_node *vn, *va_node = NULL;
+	struct vmap_area *va_lowest;
+	int i;
+
+	for (i = 0; i < nr_vmap_nodes; i++) {
+		vn = &vmap_nodes[i];
+
+		spin_lock(&vn->busy.lock);
+		va_lowest = __find_vmap_area_exceed_addr(addr, &vn->busy.root);
+		if (va_lowest) {
+			if (!va_node || va_lowest->va_start < (*va)->va_start) {
+				if (va_node)
+					spin_unlock(&va_node->busy.lock);
+
+				*va = va_lowest;
+				va_node = vn;
+				continue;
+			}
+		}
+		spin_unlock(&vn->busy.lock);
+	}
+
+	return va_node;
+}
+
 static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root)
 {
 	struct rb_node *n = root->rb_node;
@@ -4102,6 +4137,7 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
 	struct vm_struct *vm;
 	char *vaddr;
 	size_t n, size, flags, remains;
+	unsigned long next;
 
 	addr = kasan_reset_tag(addr);
 
@@ -4111,19 +4147,15 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
 
 	remains = count;
 
-	/* Hooked to node_0 so far. */
-	vn = addr_to_node(0);
-	spin_lock(&vn->busy.lock);
-
-	va = find_vmap_area_exceed_addr((unsigned long)addr, &vn->busy.root);
-	if (!va)
+	vn = find_vmap_area_exceed_addr_lock((unsigned long) addr, &va);
+	if (!vn)
 		goto finished_zero;
 
 	/* no intersects with alive vmap_area */
 	if ((unsigned long)addr + remains <= va->va_start)
 		goto finished_zero;
 
-	list_for_each_entry_from(va, &vn->busy.head, list) {
+	do {
 		size_t copied;
 
 		if (remains == 0)
@@ -4138,10 +4170,10 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
 		WARN_ON(flags == VMAP_BLOCK);
 
 		if (!vm && !flags)
-			continue;
+			goto next_va;
 
 		if (vm && (vm->flags & VM_UNINITIALIZED))
-			continue;
+			goto next_va;
 
 		/* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
 		smp_rmb();
@@ -4150,7 +4182,7 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
 		size = vm ? get_vm_area_size(vm) : va_size(va);
 
 		if (addr >= vaddr + size)
-			continue;
+			goto next_va;
 
 		if (addr < vaddr) {
 			size_t to_zero = min_t(size_t, vaddr - addr, remains);
@@ -4179,15 +4211,22 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
 
 		if (copied != n)
 			goto finished;
-	}
+
+	next_va:
+		next = va->va_end;
+		spin_unlock(&vn->busy.lock);
+	} while ((vn = find_vmap_area_exceed_addr_lock(next, &va)));
 
 finished_zero:
-	spin_unlock(&vn->busy.lock);
+	if (vn)
+		spin_unlock(&vn->busy.lock);
+
 	/* zero-fill memory holes */
 	return count - remains + zero_iter(iter, remains);
 finished:
 	/* Nothing remains, or We couldn't copy/zero everything. */
-	spin_unlock(&vn->busy.lock);
+	if (vn)
+		spin_unlock(&vn->busy.lock);
 
 	return count - remains;
 }

From 478163dd5cac116b724b5ceb7c18c19c843a77ec Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Tue, 2 Jan 2024 19:46:31 +0100
Subject: [PATCH 429/707] mm: vmalloc: support multiple nodes in vmallocinfo

Allocated areas are spread among nodes, it implies that the scanning has
to be performed individually of each node in order to dump all existing
VAs.

Link: https://lkml.kernel.org/r/20240102184633.748113-10-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Kazuhito Hagio <k-hagio-ab@nec.com>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sony.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/vmalloc.c | 120 ++++++++++++++++++++-------------------------------
 1 file changed, 47 insertions(+), 73 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 8b0cad0e2aefa7..f0aaf926e3ccd2 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -4709,30 +4709,6 @@ bool vmalloc_dump_obj(void *object)
 #endif
 
 #ifdef CONFIG_PROC_FS
-static void *s_start(struct seq_file *m, loff_t *pos)
-{
-	struct vmap_node *vn = addr_to_node(0);
-
-	mutex_lock(&vmap_purge_lock);
-	spin_lock(&vn->busy.lock);
-
-	return seq_list_start(&vn->busy.head, *pos);
-}
-
-static void *s_next(struct seq_file *m, void *p, loff_t *pos)
-{
-	struct vmap_node *vn = addr_to_node(0);
-	return seq_list_next(p, &vn->busy.head, pos);
-}
-
-static void s_stop(struct seq_file *m, void *p)
-{
-	struct vmap_node *vn = addr_to_node(0);
-
-	spin_unlock(&vn->busy.lock);
-	mutex_unlock(&vmap_purge_lock);
-}
-
 static void show_numa_info(struct seq_file *m, struct vm_struct *v)
 {
 	if (IS_ENABLED(CONFIG_NUMA)) {
@@ -4776,84 +4752,82 @@ static void show_purge_info(struct seq_file *m)
 	}
 }
 
-static int s_show(struct seq_file *m, void *p)
+static int vmalloc_info_show(struct seq_file *m, void *p)
 {
 	struct vmap_node *vn;
 	struct vmap_area *va;
 	struct vm_struct *v;
+	int i;
 
-	vn = addr_to_node(0);
-	va = list_entry(p, struct vmap_area, list);
+	for (i = 0; i < nr_vmap_nodes; i++) {
+		vn = &vmap_nodes[i];
 
-	if (!va->vm) {
-		if (va->flags & VMAP_RAM)
-			seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
-				(void *)va->va_start, (void *)va->va_end,
-				va->va_end - va->va_start);
+		spin_lock(&vn->busy.lock);
+		list_for_each_entry(va, &vn->busy.head, list) {
+			if (!va->vm) {
+				if (va->flags & VMAP_RAM)
+					seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
+						(void *)va->va_start, (void *)va->va_end,
+						va->va_end - va->va_start);
 
-		goto final;
-	}
+				continue;
+			}
 
-	v = va->vm;
+			v = va->vm;
 
-	seq_printf(m, "0x%pK-0x%pK %7ld",
-		v->addr, v->addr + v->size, v->size);
+			seq_printf(m, "0x%pK-0x%pK %7ld",
+				v->addr, v->addr + v->size, v->size);
 
-	if (v->caller)
-		seq_printf(m, " %pS", v->caller);
+			if (v->caller)
+				seq_printf(m, " %pS", v->caller);
 
-	if (v->nr_pages)
-		seq_printf(m, " pages=%d", v->nr_pages);
+			if (v->nr_pages)
+				seq_printf(m, " pages=%d", v->nr_pages);
 
-	if (v->phys_addr)
-		seq_printf(m, " phys=%pa", &v->phys_addr);
+			if (v->phys_addr)
+				seq_printf(m, " phys=%pa", &v->phys_addr);
 
-	if (v->flags & VM_IOREMAP)
-		seq_puts(m, " ioremap");
+			if (v->flags & VM_IOREMAP)
+				seq_puts(m, " ioremap");
 
-	if (v->flags & VM_ALLOC)
-		seq_puts(m, " vmalloc");
+			if (v->flags & VM_ALLOC)
+				seq_puts(m, " vmalloc");
 
-	if (v->flags & VM_MAP)
-		seq_puts(m, " vmap");
+			if (v->flags & VM_MAP)
+				seq_puts(m, " vmap");
 
-	if (v->flags & VM_USERMAP)
-		seq_puts(m, " user");
+			if (v->flags & VM_USERMAP)
+				seq_puts(m, " user");
 
-	if (v->flags & VM_DMA_COHERENT)
-		seq_puts(m, " dma-coherent");
+			if (v->flags & VM_DMA_COHERENT)
+				seq_puts(m, " dma-coherent");
 
-	if (is_vmalloc_addr(v->pages))
-		seq_puts(m, " vpages");
+			if (is_vmalloc_addr(v->pages))
+				seq_puts(m, " vpages");
 
-	show_numa_info(m, v);
-	seq_putc(m, '\n');
+			show_numa_info(m, v);
+			seq_putc(m, '\n');
+		}
+		spin_unlock(&vn->busy.lock);
+	}
 
 	/*
 	 * As a final step, dump "unpurged" areas.
 	 */
-final:
-	if (list_is_last(&va->list, &vn->busy.head))
-		show_purge_info(m);
-
+	show_purge_info(m);
 	return 0;
 }
 
-static const struct seq_operations vmalloc_op = {
-	.start = s_start,
-	.next = s_next,
-	.stop = s_stop,
-	.show = s_show,
-};
-
 static int __init proc_vmalloc_init(void)
 {
+	void *priv_data = NULL;
+
 	if (IS_ENABLED(CONFIG_NUMA))
-		proc_create_seq_private("vmallocinfo", 0400, NULL,
-				&vmalloc_op,
-				nr_node_ids * sizeof(unsigned int), NULL);
-	else
-		proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op);
+		priv_data = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
+
+	proc_create_single_data("vmallocinfo",
+		0400, NULL, vmalloc_info_show, priv_data);
+
 	return 0;
 }
 module_init(proc_vmalloc_init);

From a3822923af51de08df5dce8c32ce7c194b7016f4 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Tue, 2 Jan 2024 19:46:32 +0100
Subject: [PATCH 430/707] mm: vmalloc: set nr_nodes based on CPUs in a system

A number of nodes which are used in the alloc/free paths is set based on
num_possible_cpus() in a system.  Please note a high limit threshold
though is fixed and corresponds to 128 nodes.

For 32-bit or single core systems an access to a global vmap heap is not
balanced.  Such small systems do not suffer from lock contentions due to
low number of CPUs.  In such case the nr_nodes is equal to 1.

Test on AMD Ryzen Threadripper 3970X 32-Core Processor: sudo
./test_vmalloc.sh run_test_mask=7 nr_threads=64

<default perf>
 94.41%     0.89%  [kernel]        [k] _raw_spin_lock
 93.35%    93.07%  [kernel]        [k] native_queued_spin_lock_slowpath
 76.13%     0.28%  [kernel]        [k] __vmalloc_node_range
 72.96%     0.81%  [kernel]        [k] alloc_vmap_area
 56.94%     0.00%  [kernel]        [k] __get_vm_area_node
 41.95%     0.00%  [kernel]        [k] vmalloc
 37.15%     0.01%  [test_vmalloc]  [k] full_fit_alloc_test
 35.17%     0.00%  [kernel]        [k] ret_from_fork_asm
 35.17%     0.00%  [kernel]        [k] ret_from_fork
 35.17%     0.00%  [kernel]        [k] kthread
 35.08%     0.00%  [test_vmalloc]  [k] test_func
 34.45%     0.00%  [test_vmalloc]  [k] fix_size_alloc_test
 28.09%     0.01%  [test_vmalloc]  [k] long_busy_list_alloc_test
 23.53%     0.25%  [kernel]        [k] vfree.part.0
 21.72%     0.00%  [kernel]        [k] remove_vm_area
 20.08%     0.21%  [kernel]        [k] find_unlink_vmap_area
  2.34%     0.61%  [kernel]        [k] free_vmap_area_noflush
<default perf>
   vs
<patch-series perf>
 82.32%     0.22%  [test_vmalloc]  [k] long_busy_list_alloc_test
 63.36%     0.02%  [kernel]        [k] vmalloc
 63.34%     2.64%  [kernel]        [k] __vmalloc_node_range
 30.42%     4.46%  [kernel]        [k] vfree.part.0
 28.98%     2.51%  [kernel]        [k] __alloc_pages_bulk
 27.28%     0.19%  [kernel]        [k] __get_vm_area_node
 26.13%     1.50%  [kernel]        [k] alloc_vmap_area
 21.72%    21.67%  [kernel]        [k] clear_page_rep
 19.51%     2.43%  [kernel]        [k] _raw_spin_lock
 16.61%    16.51%  [kernel]        [k] native_queued_spin_lock_slowpath
 13.40%     2.07%  [kernel]        [k] free_unref_page
 10.62%     0.01%  [kernel]        [k] remove_vm_area
  9.02%     8.73%  [kernel]        [k] insert_vmap_area
  8.94%     0.00%  [kernel]        [k] ret_from_fork_asm
  8.94%     0.00%  [kernel]        [k] ret_from_fork
  8.94%     0.00%  [kernel]        [k] kthread
  8.29%     0.00%  [test_vmalloc]  [k] test_func
  7.81%     0.05%  [test_vmalloc]  [k] full_fit_alloc_test
  5.30%     4.73%  [kernel]        [k] purge_vmap_node
  4.47%     2.65%  [kernel]        [k] free_vmap_area_noflush
<patch-series perf>

confirms that a native_queued_spin_lock_slowpath goes down to
16.51% percent from 93.07%.

The throughput is ~12x higher:

urezki@pc638:~$ time sudo ./test_vmalloc.sh run_test_mask=7 nr_threads=64
Run the test with following parameters: run_test_mask=7 nr_threads=64
Done.
Check the kernel ring buffer to see the summary.

real    10m51.271s
user    0m0.013s
sys     0m0.187s
urezki@pc638:~$

urezki@pc638:~$ time sudo ./test_vmalloc.sh run_test_mask=7 nr_threads=64
Run the test with following parameters: run_test_mask=7 nr_threads=64
Done.
Check the kernel ring buffer to see the summary.

real    0m51.301s
user    0m0.015s
sys     0m0.040s
urezki@pc638:~$

Link: https://lkml.kernel.org/r/20240102184633.748113-11-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Kazuhito Hagio <k-hagio-ab@nec.com>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sony.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/vmalloc.c | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index f0aaf926e3ccd2..af986953d2d7ed 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -4879,10 +4879,27 @@ static void __init vmap_init_free_space(void)
 static void vmap_init_nodes(void)
 {
 	struct vmap_node *vn;
-	int i, j;
+	int i, n;
+
+#if BITS_PER_LONG == 64
+	/* A high threshold of max nodes is fixed and bound to 128. */
+	n = clamp_t(unsigned int, num_possible_cpus(), 1, 128);
+
+	if (n > 1) {
+		vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT | __GFP_NOWARN);
+		if (vn) {
+			/* Node partition is 16 pages. */
+			vmap_zone_size = (1 << 4) * PAGE_SIZE;
+			nr_vmap_nodes = n;
+			vmap_nodes = vn;
+		} else {
+			pr_err("Failed to allocate an array. Disable a node layer\n");
+		}
+	}
+#endif
 
-	for (i = 0; i < nr_vmap_nodes; i++) {
-		vn = &vmap_nodes[i];
+	for (n = 0; n < nr_vmap_nodes; n++) {
+		vn = &vmap_nodes[n];
 		vn->busy.root = RB_ROOT;
 		INIT_LIST_HEAD(&vn->busy.head);
 		spin_lock_init(&vn->busy.lock);
@@ -4891,9 +4908,9 @@ static void vmap_init_nodes(void)
 		INIT_LIST_HEAD(&vn->lazy.head);
 		spin_lock_init(&vn->lazy.lock);
 
-		for (j = 0; j < MAX_VA_SIZE_PAGES; j++) {
-			INIT_LIST_HEAD(&vn->pool[j].head);
-			WRITE_ONCE(vn->pool[j].len, 0);
+		for (i = 0; i < MAX_VA_SIZE_PAGES; i++) {
+			INIT_LIST_HEAD(&vn->pool[i].head);
+			WRITE_ONCE(vn->pool[i].len, 0);
 		}
 
 		spin_lock_init(&vn->pool_lock);

From 84f8958cd667d1c192bd69a0504e59e21dc75680 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Tue, 2 Jan 2024 19:46:33 +0100
Subject: [PATCH 431/707] mm: vmalloc: add a shrinker to drain vmap pools

The added shrinker is used to return back current cached VAs into a global
vmap space, when a system enters into a low memory mode.

Link: https://lkml.kernel.org/r/20240102184633.748113-12-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: Kazuhito Hagio <k-hagio-ab@nec.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sony.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/vmalloc.c | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index af986953d2d7ed..257981e37936cd 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -4917,8 +4917,37 @@ static void vmap_init_nodes(void)
 	}
 }
 
+static unsigned long
+vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+	unsigned long count;
+	struct vmap_node *vn;
+	int i, j;
+
+	for (count = 0, i = 0; i < nr_vmap_nodes; i++) {
+		vn = &vmap_nodes[i];
+
+		for (j = 0; j < MAX_VA_SIZE_PAGES; j++)
+			count += READ_ONCE(vn->pool[j].len);
+	}
+
+	return count ? count : SHRINK_EMPTY;
+}
+
+static unsigned long
+vmap_node_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+	int i;
+
+	for (i = 0; i < nr_vmap_nodes; i++)
+		decay_va_pool_node(&vmap_nodes[i], true);
+
+	return SHRINK_STOP;
+}
+
 void __init vmalloc_init(void)
 {
+	struct shrinker *vmap_node_shrinker;
 	struct vmap_area *va;
 	struct vmap_node *vn;
 	struct vm_struct *tmp;
@@ -4966,4 +4995,14 @@ void __init vmalloc_init(void)
 	 */
 	vmap_init_free_space();
 	vmap_initialized = true;
+
+	vmap_node_shrinker = shrinker_alloc(0, "vmap-node");
+	if (!vmap_node_shrinker) {
+		pr_err("Failed to allocate vmap-node shrinker!\n");
+		return;
+	}
+
+	vmap_node_shrinker->count_objects = vmap_node_shrink_count;
+	vmap_node_shrinker->scan_objects = vmap_node_shrink_scan;
+	shrinker_register(vmap_node_shrinker);
 }

From 525847c5902cfc4a5620018351cf00fc2fa6e157 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Wed, 24 Jan 2024 19:09:19 +0100
Subject: [PATCH 432/707] mm: vmalloc: improve description of vmap node layer

This patch adds extra explanation of recently added vmap node layer based
on community feedback.  No functional change.

Link: https://lkml.kernel.org/r/20240124180920.50725-1-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Reviewed-by: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sony.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/vmalloc.c | 60 ++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 46 insertions(+), 14 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 257981e37936cd..b8be601b056d8f 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -765,9 +765,10 @@ static struct rb_root free_vmap_area_root = RB_ROOT;
 static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
 
 /*
- * An effective vmap-node logic. Users make use of nodes instead
- * of a global heap. It allows to balance an access and mitigate
- * contention.
+ * This structure defines a single, solid model where a list and
+ * rb-tree are part of one entity protected by the lock. Nodes are
+ * sorted in ascending order, thus for O(1) access to left/right
+ * neighbors a list is used as well as for sequential traversal.
  */
 struct rb_list {
 	struct rb_root root;
@@ -775,16 +776,23 @@ struct rb_list {
 	spinlock_t lock;
 };
 
+/*
+ * A fast size storage contains VAs up to 1M size. A pool consists
+ * of linked between each other ready to go VAs of certain sizes.
+ * An index in the pool-array corresponds to number of pages + 1.
+ */
+#define MAX_VA_SIZE_PAGES 256
+
 struct vmap_pool {
 	struct list_head head;
 	unsigned long len;
 };
 
 /*
- * A fast size storage contains VAs up to 1M size.
+ * An effective vmap-node logic. Users make use of nodes instead
+ * of a global heap. It allows to balance an access and mitigate
+ * contention.
  */
-#define MAX_VA_SIZE_PAGES 256
-
 static struct vmap_node {
 	/* Simple size segregated storage. */
 	struct vmap_pool pool[MAX_VA_SIZE_PAGES];
@@ -803,6 +811,11 @@ static struct vmap_node {
 	unsigned long nr_purged;
 } single;
 
+/*
+ * Initial setup consists of one single node, i.e. a balancing
+ * is fully disabled. Later on, after vmap is initialized these
+ * parameters are updated based on a system capacity.
+ */
 static struct vmap_node *vmap_nodes = &single;
 static __read_mostly unsigned int nr_vmap_nodes = 1;
 static __read_mostly unsigned int vmap_zone_size = 1;
@@ -2048,7 +2061,12 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay)
 			}
 		}
 
-		/* Attach the pool back if it has been partly decayed. */
+		/*
+		 * Attach the pool back if it has been partly decayed.
+		 * Please note, it is supposed that nobody(other contexts)
+		 * can populate the pool therefore a simple list replace
+		 * operation takes place here.
+		 */
 		if (!full_decay && !list_empty(&tmp_list)) {
 			spin_lock(&vn->pool_lock);
 			list_replace_init(&tmp_list, &vn->pool[i].head);
@@ -2257,16 +2275,14 @@ struct vmap_area *find_vmap_area(unsigned long addr)
 	 * An addr_to_node_id(addr) converts an address to a node index
 	 * where a VA is located. If VA spans several zones and passed
 	 * addr is not the same as va->va_start, what is not common, we
-	 * may need to scan an extra nodes. See an example:
+	 * may need to scan extra nodes. See an example:
 	 *
-	 *      <--va-->
+	 *      <----va---->
 	 * -|-----|-----|-----|-----|-
 	 *     1     2     0     1
 	 *
-	 * VA resides in node 1 whereas it spans 1 and 2. If passed
-	 * addr is within a second node we should do extra work. We
-	 * should mention that it is rare and is a corner case from
-	 * the other hand it has to be covered.
+	 * VA resides in node 1 whereas it spans 1, 2 an 0. If passed
+	 * addr is within 2 or 0 nodes we should do extra work.
 	 */
 	i = j = addr_to_node_id(addr);
 	do {
@@ -2289,6 +2305,9 @@ static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
 	struct vmap_area *va;
 	int i, j;
 
+	/*
+	 * Check the comment in the find_vmap_area() about the loop.
+	 */
 	i = j = addr_to_node_id(addr);
 	do {
 		vn = &vmap_nodes[i];
@@ -4882,7 +4901,20 @@ static void vmap_init_nodes(void)
 	int i, n;
 
 #if BITS_PER_LONG == 64
-	/* A high threshold of max nodes is fixed and bound to 128. */
+	/*
+	 * A high threshold of max nodes is fixed and bound to 128,
+	 * thus a scale factor is 1 for systems where number of cores
+	 * are less or equal to specified threshold.
+	 *
+	 * As for NUMA-aware notes. For bigger systems, for example
+	 * NUMA with multi-sockets, where we can end-up with thousands
+	 * of cores in total, a "sub-numa-clustering" should be added.
+	 *
+	 * In this case a NUMA domain is considered as a single entity
+	 * with dedicated sub-nodes in it which describe one group or
+	 * set of cores. Therefore a per-domain purging is supposed to
+	 * be added as well as a per-domain balancing.
+	 */
 	n = clamp_t(unsigned int, num_possible_cpus(), 1, 128);
 
 	if (n > 1) {

From ed82e5bbef9c54b438055c9ebfe52d669f1f3fdb Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Wed, 24 Jan 2024 19:09:20 +0100
Subject: [PATCH 433/707] mm: vmalloc: refactor vmalloc_dump_obj() function

This patch tends to simplify the function in question, by removing an
extra stack "objp" variable, returning back to an early exit approach if
spin_trylock() fails or VA was not found.

Link: https://lkml.kernel.org/r/20240124180920.50725-2-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Reviewed-by: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sony.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/vmalloc.c | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index b8be601b056d8f..449f45b0e47497 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -4696,34 +4696,35 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
 #ifdef CONFIG_PRINTK
 bool vmalloc_dump_obj(void *object)
 {
-	void *objp = (void *)PAGE_ALIGN((unsigned long)object);
 	const void *caller;
+	struct vm_struct *vm;
 	struct vmap_area *va;
 	struct vmap_node *vn;
 	unsigned long addr;
 	unsigned int nr_pages;
-	bool success = false;
-
-	vn = addr_to_node((unsigned long)objp);
 
-	if (spin_trylock(&vn->busy.lock)) {
-		va = __find_vmap_area((unsigned long)objp, &vn->busy.root);
+	addr = PAGE_ALIGN((unsigned long) object);
+	vn = addr_to_node(addr);
 
-		if (va && va->vm) {
-			addr = (unsigned long)va->vm->addr;
-			caller = va->vm->caller;
-			nr_pages = va->vm->nr_pages;
-			success = true;
-		}
+	if (!spin_trylock(&vn->busy.lock))
+		return false;
 
+	va = __find_vmap_area(addr, &vn->busy.root);
+	if (!va || !va->vm) {
 		spin_unlock(&vn->busy.lock);
+		return false;
 	}
 
-	if (success)
-		pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
-			nr_pages, addr, caller);
+	vm = va->vm;
+	addr = (unsigned long) vm->addr;
+	caller = vm->caller;
+	nr_pages = vm->nr_pages;
+	spin_unlock(&vn->busy.lock);
 
-	return success;
+	pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
+		nr_pages, addr, caller);
+
+	return true;
 }
 #endif
 

From 7edf0a77cf93b772ff8004f42eeb9d6403ecc3e8 Mon Sep 17 00:00:00 2001
From: Yajun Deng <yajun.deng@linux.dev>
Date: Wed, 10 Jan 2024 16:46:22 +0800
Subject: [PATCH 434/707] mm/mmap: simplify vma link and unlink

The file parameter in the __remove_shared_vm_struct is no longer used,
remove it.

These functions vma_link() and mmap_region() have some of the same code,
introduce vma_link_file() helper function to simplify the code.

Link: https://lkml.kernel.org/r/20240110084622.2425927-1-yajun.deng@linux.dev
Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/mmap.c | 44 +++++++++++++++++++-------------------------
 1 file changed, 19 insertions(+), 25 deletions(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index d89770eaab6b61..282ed6d0914b07 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -105,7 +105,7 @@ void vma_set_page_prot(struct vm_area_struct *vma)
  * Requires inode->i_mapping->i_mmap_rwsem
  */
 static void __remove_shared_vm_struct(struct vm_area_struct *vma,
-		struct file *file, struct address_space *mapping)
+				      struct address_space *mapping)
 {
 	if (vma_is_shared_maywrite(vma))
 		mapping_unmap_writable(mapping);
@@ -126,7 +126,7 @@ void unlink_file_vma(struct vm_area_struct *vma)
 	if (file) {
 		struct address_space *mapping = file->f_mapping;
 		i_mmap_lock_write(mapping);
-		__remove_shared_vm_struct(vma, file, mapping);
+		__remove_shared_vm_struct(vma, mapping);
 		i_mmap_unlock_write(mapping);
 	}
 }
@@ -392,26 +392,30 @@ static void __vma_link_file(struct vm_area_struct *vma,
 	flush_dcache_mmap_unlock(mapping);
 }
 
+static void vma_link_file(struct vm_area_struct *vma)
+{
+	struct file *file = vma->vm_file;
+	struct address_space *mapping;
+
+	if (file) {
+		mapping = file->f_mapping;
+		i_mmap_lock_write(mapping);
+		__vma_link_file(vma, mapping);
+		i_mmap_unlock_write(mapping);
+	}
+}
+
 static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
 {
 	VMA_ITERATOR(vmi, mm, 0);
-	struct address_space *mapping = NULL;
 
 	vma_iter_config(&vmi, vma->vm_start, vma->vm_end);
 	if (vma_iter_prealloc(&vmi, vma))
 		return -ENOMEM;
 
 	vma_start_write(vma);
-
 	vma_iter_store(&vmi, vma);
-
-	if (vma->vm_file) {
-		mapping = vma->vm_file->f_mapping;
-		i_mmap_lock_write(mapping);
-		__vma_link_file(vma, mapping);
-		i_mmap_unlock_write(mapping);
-	}
-
+	vma_link_file(vma);
 	mm->map_count++;
 	validate_mm(mm);
 	return 0;
@@ -519,10 +523,9 @@ static inline void vma_complete(struct vma_prepare *vp,
 	}
 
 	if (vp->remove && vp->file) {
-		__remove_shared_vm_struct(vp->remove, vp->file, vp->mapping);
+		__remove_shared_vm_struct(vp->remove, vp->mapping);
 		if (vp->remove2)
-			__remove_shared_vm_struct(vp->remove2, vp->file,
-						  vp->mapping);
+			__remove_shared_vm_struct(vp->remove2, vp->mapping);
 	} else if (vp->insert) {
 		/*
 		 * split_vma has split insert from vma, and needs
@@ -2891,16 +2894,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 	vma_start_write(vma);
 	vma_iter_store(&vmi, vma);
 	mm->map_count++;
-	if (vma->vm_file) {
-		i_mmap_lock_write(vma->vm_file->f_mapping);
-		if (vma_is_shared_maywrite(vma))
-			mapping_allow_writable(vma->vm_file->f_mapping);
-
-		flush_dcache_mmap_lock(vma->vm_file->f_mapping);
-		vma_interval_tree_insert(vma, &vma->vm_file->f_mapping->i_mmap);
-		flush_dcache_mmap_unlock(vma->vm_file->f_mapping);
-		i_mmap_unlock_write(vma->vm_file->f_mapping);
-	}
+	vma_link_file(vma);
 
 	/*
 	 * vma_merge() calls khugepaged_enter_vma() either, the below

From 67dfd41fbce86e1a59593c9e122cf4b094a77f18 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Fri, 29 Dec 2023 16:22:07 +0800
Subject: [PATCH 435/707] mm: memory: use nth_page() in clear/copy_subpage()

The clear and copy of huge gigantic page has converted to use nth_page()
to handle the possible discontinuous struct page(SPARSEMEM without
VMEMMAP), but not change for the non-gigantic part, fix it too.

Link: https://lkml.kernel.org/r/20231229082207.60235-1-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Zi Yan <ziy@nvidia.com>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/memory.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 89bcae0b224d6d..762458faec6969 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -6143,7 +6143,7 @@ static int clear_subpage(unsigned long addr, int idx, void *arg)
 {
 	struct page *page = arg;
 
-	clear_user_highpage(page + idx, addr);
+	clear_user_highpage(nth_page(page, idx), addr);
 	return 0;
 }
 
@@ -6193,10 +6193,11 @@ struct copy_subpage_arg {
 static int copy_subpage(unsigned long addr, int idx, void *arg)
 {
 	struct copy_subpage_arg *copy_arg = arg;
+	struct page *dst = nth_page(copy_arg->dst, idx);
+	struct page *src = nth_page(copy_arg->src, idx);
 
-	if (copy_mc_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
-				  addr, copy_arg->vma)) {
-		memory_failure_queue(page_to_pfn(copy_arg->src + idx), 0);
+	if (copy_mc_user_highpage(dst, src, addr, copy_arg->vma)) {
+		memory_failure_queue(page_to_pfn(src), 0);
 		return -EHWPOISON;
 	}
 	return 0;

From 7269896acbba7b62463b4b14a2fb5f6c98fe07fd Mon Sep 17 00:00:00 2001
From: Haifeng Xu <haifeng.xu@shopee.com>
Date: Thu, 28 Dec 2023 06:27:14 +0000
Subject: [PATCH 436/707] mm: list_lru: disable memcg_aware when cgroup.memory
 is set to "nokmem"

Actually, when using a boot time kernel option "cgroup.memory=nokmem", all
lru items are inserted to list_lru_node.  But for those users who invoke
list_lru_init_memcg() to initialize list_lru, list_lru_memcg_aware()
returns true.  And this brings unneeded operations related to memcg.

To make things more convenient, let's disable memcg_aware when
cgroup.memory is set to "nokmem".

Link: https://lkml.kernel.org/r/20231228062715.338672-1-haifeng.xu@shopee.com
Signed-off-by: Haifeng Xu <haifeng.xu@shopee.com>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/list_lru.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mm/list_lru.c b/mm/list_lru.c
index 35b0147542a9de..158781d1d3c215 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -567,6 +567,9 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware,
 		lru->shrinker_id = shrinker->id;
 	else
 		lru->shrinker_id = -1;
+
+	if (mem_cgroup_kmem_disabled())
+		memcg_aware = false;
 #endif
 
 	lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL);

From bc72e937a1de5ab1189c42def85494530da20f8e Mon Sep 17 00:00:00 2001
From: Haifeng Xu <haifeng.xu@shopee.com>
Date: Thu, 28 Dec 2023 06:27:15 +0000
Subject: [PATCH 437/707] mm: list_lru: remove unused macro list_lru_init_key()

list_lru_init_key() isn't used by anyone, remove it to clean up.

Link: https://lkml.kernel.org/r/20231228062715.338672-2-haifeng.xu@shopee.com
Signed-off-by: Haifeng Xu <haifeng.xu@shopee.com>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/list_lru.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index 7675a48a070108..c679e6b293c4c4 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -62,8 +62,6 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware,
 
 #define list_lru_init(lru)				\
 	__list_lru_init((lru), false, NULL, NULL)
-#define list_lru_init_key(lru, key)			\
-	__list_lru_init((lru), false, (key), NULL)
 #define list_lru_init_memcg(lru, shrinker)		\
 	__list_lru_init((lru), true, NULL, shrinker)
 

From f84490c9c892898560d7f58fd8765bdfecb0ad07 Mon Sep 17 00:00:00 2001
From: Yang Shi <yang@os.amperecomputing.com>
Date: Wed, 20 Dec 2023 22:59:42 -0800
Subject: [PATCH 438/707] mm: mmap: no need to call khugepaged_enter_vma() for
 stack

We avoid allocating THP for temporary stack, even though
khugepaged_enter_vma() is called for stack VMAs, it actualy returns
false.  So no need to call it in the first place at all.

Link: https://lkml.kernel.org/r/20231221065943.2803551-1-shy828301@gmail.com
Signed-off-by: Yang Shi <yang@os.amperecomputing.com>
Reviewed-by: Yin Fengwei <fengwei.yin@intel.com>
Cc: Christopher Lameter <cl@linux.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: kernel test robot <oliver.sang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/mmap.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index 282ed6d0914b07..66f534ec90a55e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2051,7 +2051,6 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 		}
 	}
 	anon_vma_unlock_write(vma->anon_vma);
-	khugepaged_enter_vma(vma, vma->vm_flags);
 	mas_destroy(&mas);
 	validate_mm(mm);
 	return error;
@@ -2145,7 +2144,6 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
 		}
 	}
 	anon_vma_unlock_write(vma->anon_vma);
-	khugepaged_enter_vma(vma, vma->vm_flags);
 	mas_destroy(&mas);
 	validate_mm(mm);
 	return error;

From 096a4624ec8e550103329db6f157c9bae9c0c9c0 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 11 Jan 2024 18:12:16 +0000
Subject: [PATCH 439/707] memcg: convert mem_cgroup_move_charge_pte_range() to
 use a folio

Patch series "Convert memcontrol charge moving to use folios".

No part of these patches should change behaviour; all the called functions
already convert from page to folio, so this ought to simply be a reduction
in the number of calls to compound_head().


This patch (of 4):

Remove many calls to compound_head() by calling page_folio() once at the
start of each stanza which receives a struct page from 'target'.  There
should be no change in behaviour here as all the called functions start
out by converting the page to its folio.

Link: https://lkml.kernel.org/r/20240111181219.3462852-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20240111181219.3462852-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Reviewed-by: Muchun Song <muchun.song@linux.dev>
Acked-by: Shakeel Butt <shakeelb@google.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/memcontrol.c | 49 ++++++++++++++++++++++++-------------------------
 1 file changed, 24 insertions(+), 25 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d9ca0fdbe4ab04..516f811f740a20 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5966,23 +5966,22 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
 }
 
 /**
- * mem_cgroup_move_account - move account of the page
- * @page: the page
+ * mem_cgroup_move_account - move account of the folio
+ * @folio: The folio.
  * @compound: charge the page as compound or small page
- * @from: mem_cgroup which the page is moved from.
- * @to:	mem_cgroup which the page is moved to. @from != @to.
+ * @from: mem_cgroup which the folio is moved from.
+ * @to:	mem_cgroup which the folio is moved to. @from != @to.
  *
- * The page must be locked and not on the LRU.
+ * The folio must be locked and not on the LRU.
  *
  * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
  * from old cgroup.
  */
-static int mem_cgroup_move_account(struct page *page,
+static int mem_cgroup_move_account(struct folio *folio,
 				   bool compound,
 				   struct mem_cgroup *from,
 				   struct mem_cgroup *to)
 {
-	struct folio *folio = page_folio(page);
 	struct lruvec *from_vec, *to_vec;
 	struct pglist_data *pgdat;
 	unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1;
@@ -6432,7 +6431,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
 	spinlock_t *ptl;
 	enum mc_target_type target_type;
 	union mc_target target;
-	struct page *page;
+	struct folio *folio;
 
 	ptl = pmd_trans_huge_lock(pmd, vma);
 	if (ptl) {
@@ -6442,26 +6441,26 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
 		}
 		target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
 		if (target_type == MC_TARGET_PAGE) {
-			page = target.page;
-			if (isolate_lru_page(page)) {
-				if (!mem_cgroup_move_account(page, true,
+			folio = page_folio(target.page);
+			if (folio_isolate_lru(folio)) {
+				if (!mem_cgroup_move_account(folio, true,
 							     mc.from, mc.to)) {
 					mc.precharge -= HPAGE_PMD_NR;
 					mc.moved_charge += HPAGE_PMD_NR;
 				}
-				putback_lru_page(page);
+				folio_putback_lru(folio);
 			}
-			unlock_page(page);
-			put_page(page);
+			folio_unlock(folio);
+			folio_put(folio);
 		} else if (target_type == MC_TARGET_DEVICE) {
-			page = target.page;
-			if (!mem_cgroup_move_account(page, true,
+			folio = page_folio(target.page);
+			if (!mem_cgroup_move_account(folio, true,
 						     mc.from, mc.to)) {
 				mc.precharge -= HPAGE_PMD_NR;
 				mc.moved_charge += HPAGE_PMD_NR;
 			}
-			unlock_page(page);
-			put_page(page);
+			folio_unlock(folio);
+			folio_put(folio);
 		}
 		spin_unlock(ptl);
 		return 0;
@@ -6484,28 +6483,28 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
 			device = true;
 			fallthrough;
 		case MC_TARGET_PAGE:
-			page = target.page;
+			folio = page_folio(target.page);
 			/*
 			 * We can have a part of the split pmd here. Moving it
 			 * can be done but it would be too convoluted so simply
 			 * ignore such a partial THP and keep it in original
 			 * memcg. There should be somebody mapping the head.
 			 */
-			if (PageTransCompound(page))
+			if (folio_test_large(folio))
 				goto put;
-			if (!device && !isolate_lru_page(page))
+			if (!device && !folio_isolate_lru(folio))
 				goto put;
-			if (!mem_cgroup_move_account(page, false,
+			if (!mem_cgroup_move_account(folio, false,
 						mc.from, mc.to)) {
 				mc.precharge--;
 				/* we uncharge from mc.from later. */
 				mc.moved_charge++;
 			}
 			if (!device)
-				putback_lru_page(page);
+				folio_putback_lru(folio);
 put:			/* get_mctgt_type() gets & locks the page */
-			unlock_page(page);
-			put_page(page);
+			folio_unlock(folio);
+			folio_put(folio);
 			break;
 		case MC_TARGET_SWAP:
 			ent = target.ent;

From 0635b2765fbfd9789bd08389b189197c6b113baa Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 11 Jan 2024 18:12:17 +0000
Subject: [PATCH 440/707] memcg: return the folio in union mc_target

All users of target.page convert it to the folio, so we can just return
the folio directly and save a few calls to compound_head().

Link: https://lkml.kernel.org/r/20240111181219.3462852-3-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Reviewed-by: Muchun Song <muchun.song@linux.dev>
Acked-by: Shakeel Butt <shakeelb@google.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/memcontrol.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 516f811f740a20..d06066444244d3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5874,7 +5874,7 @@ static int mem_cgroup_do_precharge(unsigned long count)
 }
 
 union mc_target {
-	struct page	*page;
+	struct folio	*folio;
 	swp_entry_t	ent;
 };
 
@@ -6096,7 +6096,7 @@ static int mem_cgroup_move_account(struct folio *folio,
  * Return:
  * * MC_TARGET_NONE - If the pte is not a target for move charge.
  * * MC_TARGET_PAGE - If the page corresponding to this pte is a target for
- *   move charge. If @target is not NULL, the page is stored in target->page
+ *   move charge. If @target is not NULL, the folio is stored in target->folio
  *   with extra refcnt taken (Caller should release it).
  * * MC_TARGET_SWAP - If the swap entry corresponding to this pte is a
  *   target for charge migration.  If @target is not NULL, the entry is
@@ -6161,7 +6161,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 			    is_device_coherent_page(page))
 				ret = MC_TARGET_DEVICE;
 			if (target)
-				target->page = page;
+				target->folio = page_folio(page);
 		}
 		if (!ret || !target) {
 			if (target)
@@ -6211,7 +6211,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
 				put_page(page);
 				return MC_TARGET_NONE;
 			}
-			target->page = page;
+			target->folio = page_folio(page);
 		}
 	}
 	return ret;
@@ -6441,7 +6441,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
 		}
 		target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
 		if (target_type == MC_TARGET_PAGE) {
-			folio = page_folio(target.page);
+			folio = target.folio;
 			if (folio_isolate_lru(folio)) {
 				if (!mem_cgroup_move_account(folio, true,
 							     mc.from, mc.to)) {
@@ -6453,7 +6453,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
 			folio_unlock(folio);
 			folio_put(folio);
 		} else if (target_type == MC_TARGET_DEVICE) {
-			folio = page_folio(target.page);
+			folio = target.folio;
 			if (!mem_cgroup_move_account(folio, true,
 						     mc.from, mc.to)) {
 				mc.precharge -= HPAGE_PMD_NR;
@@ -6483,7 +6483,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
 			device = true;
 			fallthrough;
 		case MC_TARGET_PAGE:
-			folio = page_folio(target.page);
+			folio = target.folio;
 			/*
 			 * We can have a part of the split pmd here. Moving it
 			 * can be done but it would be too convoluted so simply

From 7122ca3287bf6b90f97ba7b3a0950af5f82d1be3 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 11 Jan 2024 18:12:18 +0000
Subject: [PATCH 441/707] memcg: use a folio in get_mctgt_type

Replace seven calls to compound_head() with one.  We still use the page as
page_mapped() is different from folio_mapped().

Link: https://lkml.kernel.org/r/20240111181219.3462852-4-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Reviewed-by: Muchun Song <muchun.song@linux.dev>
Acked-by: Shakeel Butt <shakeelb@google.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/memcontrol.c | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d06066444244d3..66aceb4ae2bac6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6110,6 +6110,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 		unsigned long addr, pte_t ptent, union mc_target *target)
 {
 	struct page *page = NULL;
+	struct folio *folio;
 	enum mc_target_type ret = MC_TARGET_NONE;
 	swp_entry_t ent = { .val = 0 };
 
@@ -6124,9 +6125,11 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 	else if (is_swap_pte(ptent))
 		page = mc_handle_swap_pte(vma, ptent, &ent);
 
+	if (page)
+		folio = page_folio(page);
 	if (target && page) {
-		if (!trylock_page(page)) {
-			put_page(page);
+		if (!folio_trylock(folio)) {
+			folio_put(folio);
 			return ret;
 		}
 		/*
@@ -6141,8 +6144,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 		 * Alas, skip moving the page in this case.
 		 */
 		if (!pte_present(ptent) && page_mapped(page)) {
-			unlock_page(page);
-			put_page(page);
+			folio_unlock(folio);
+			folio_put(folio);
 			return ret;
 		}
 	}
@@ -6155,18 +6158,18 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 		 * mem_cgroup_move_account() checks the page is valid or
 		 * not under LRU exclusion.
 		 */
-		if (page_memcg(page) == mc.from) {
+		if (folio_memcg(folio) == mc.from) {
 			ret = MC_TARGET_PAGE;
-			if (is_device_private_page(page) ||
-			    is_device_coherent_page(page))
+			if (folio_is_device_private(folio) ||
+			    folio_is_device_coherent(folio))
 				ret = MC_TARGET_DEVICE;
 			if (target)
-				target->folio = page_folio(page);
+				target->folio = folio;
 		}
 		if (!ret || !target) {
 			if (target)
-				unlock_page(page);
-			put_page(page);
+				folio_unlock(folio);
+			folio_put(folio);
 		}
 	}
 	/*

From 66f4e1e3f973f30af7ee40d05c77c87af2d20587 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 11 Jan 2024 18:12:19 +0000
Subject: [PATCH 442/707] memcg: use a folio in get_mctgt_type_thp

Replace five calls to compound_head() with one.

Link: https://lkml.kernel.org/r/20240111181219.3462852-5-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Reviewed-by: Muchun Song <muchun.song@linux.dev>
Acked-by: Shakeel Butt <shakeelb@google.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/memcontrol.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 66aceb4ae2bac6..6cc6b3a2a60c97 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6195,6 +6195,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
 		unsigned long addr, pmd_t pmd, union mc_target *target)
 {
 	struct page *page = NULL;
+	struct folio *folio;
 	enum mc_target_type ret = MC_TARGET_NONE;
 
 	if (unlikely(is_swap_pmd(pmd))) {
@@ -6204,17 +6205,18 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
 	}
 	page = pmd_page(pmd);
 	VM_BUG_ON_PAGE(!page || !PageHead(page), page);
+	folio = page_folio(page);
 	if (!(mc.flags & MOVE_ANON))
 		return ret;
-	if (page_memcg(page) == mc.from) {
+	if (folio_memcg(folio) == mc.from) {
 		ret = MC_TARGET_PAGE;
 		if (target) {
-			get_page(page);
-			if (!trylock_page(page)) {
-				put_page(page);
+			folio_get(folio);
+			if (!folio_trylock(folio)) {
+				folio_put(folio);
 				return MC_TARGET_NONE;
 			}
-			target->folio = page_folio(page);
+			target->folio = folio;
 		}
 	}
 	return ret;

From 26f2f9d6d594d1b0c222b1d8792fe7c76b44ecad Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 11 Jan 2024 15:24:20 +0000
Subject: [PATCH 443/707] mm: add pfn_swap_entry_folio()

Patch series "mm: convert mm counter to take a folio", v3.

Make sure all mm_counter() and mm_counter_file() callers have a folio,
then convert mm counter functions to take a folio, which saves some
compound_head() calls.


This patch (of 10):

Thanks to the compound_head() hidden inside PageLocked(), this saves a
call to compound_head() over calling page_folio(pfn_swap_entry_to_page())

Link: https://lkml.kernel.org/r/20240111152429.3374566-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20240111152429.3374566-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/swapops.h | 13 +++++++++++++
 mm/filemap.c            |  2 +-
 mm/huge_memory.c        |  2 +-
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index bff1e8d97de0e0..48b700ba1d188a 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -468,6 +468,19 @@ static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry)
 	return p;
 }
 
+static inline struct folio *pfn_swap_entry_folio(swp_entry_t entry)
+{
+	struct folio *folio = pfn_folio(swp_offset_pfn(entry));
+
+	/*
+	 * Any use of migration entries may only occur while the
+	 * corresponding folio is locked
+	 */
+	BUG_ON(is_migration_entry(entry) && !folio_test_locked(folio));
+
+	return folio;
+}
+
 /*
  * A pfn swap entry is a special type of swap entry that always has a pfn stored
  * in the swap offset. They are used to represent unaddressable device memory
diff --git a/mm/filemap.c b/mm/filemap.c
index 0d7e20edf46f59..142864338ca4f2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1354,7 +1354,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl)
 	unsigned long pflags;
 	bool in_thrashing;
 	wait_queue_head_t *q;
-	struct folio *folio = page_folio(pfn_swap_entry_to_page(entry));
+	struct folio *folio = pfn_swap_entry_folio(entry);
 
 	q = folio_waitqueue(folio);
 	if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 94c958f7ebb50d..5468b2f97cbf70 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2045,7 +2045,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
 	if (is_swap_pmd(*pmd)) {
 		swp_entry_t entry = pmd_to_swp_entry(*pmd);
-		struct folio *folio = page_folio(pfn_swap_entry_to_page(entry));
+		struct folio *folio = pfn_swap_entry_folio(entry);
 		pmd_t newpmd;
 
 		VM_BUG_ON(!is_pmd_migration_entry(*pmd));

From fcb48a5ab5300b67337c7f40c47947b436c167ad Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 11 Jan 2024 15:24:21 +0000
Subject: [PATCH 444/707] proc: use pfn_swap_entry_folio where obvious

These callers only pass the result to PageAnon(), so we can save the extra
call to compound_head() by using pfn_swap_entry_folio().

Link: https://lkml.kernel.org/r/20240111152429.3374566-3-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/task_mmu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3f78ebbb795fe2..ac6ea2cc2ee8fe 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1807,7 +1807,7 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
 		if (p->masks_of_interest & PAGE_IS_FILE) {
 			swp = pte_to_swp_entry(pte);
 			if (is_pfn_swap_entry(swp) &&
-			    !PageAnon(pfn_swap_entry_to_page(swp)))
+			    !folio_test_anon(pfn_swap_entry_folio(swp)))
 				categories |= PAGE_IS_FILE;
 		}
 		if (pte_swp_soft_dirty(pte))
@@ -1873,7 +1873,7 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
 		if (p->masks_of_interest & PAGE_IS_FILE) {
 			swp = pmd_to_swp_entry(pmd);
 			if (is_pfn_swap_entry(swp) &&
-			    !PageAnon(pfn_swap_entry_to_page(swp)))
+			    !folio_test_anon(pfn_swap_entry_folio(swp)))
 				categories |= PAGE_IS_FILE;
 		}
 	}

From aebc55ef159e549fe3fa3c926c9a05cfd0783b28 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 11 Jan 2024 15:24:22 +0000
Subject: [PATCH 445/707] mprotect: use pfn_swap_entry_folio

We only want to know whether the folio is anonymous, so use
pfn_swap_entry_folio() and save a call to compound_head().

Link: https://lkml.kernel.org/r/20240111152429.3374566-4-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/mprotect.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/mprotect.c b/mm/mprotect.c
index 81991102f7859e..f8a4544b4601db 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -198,13 +198,13 @@ static long change_pte_range(struct mmu_gather *tlb,
 			pte_t newpte;
 
 			if (is_writable_migration_entry(entry)) {
-				struct page *page = pfn_swap_entry_to_page(entry);
+				struct folio *folio = pfn_swap_entry_folio(entry);
 
 				/*
 				 * A protection check is difficult so
 				 * just be safe and disable write
 				 */
-				if (PageAnon(page))
+				if (folio_test_anon(folio))
 					entry = make_readable_exclusive_migration_entry(
 							     swp_offset(entry));
 				else

From e088a5658e9131b6f5931b4e7c41308e93840671 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Thu, 11 Jan 2024 15:24:23 +0000
Subject: [PATCH 446/707] s390: use pfn_swap_entry_folio() in
 ptep_zap_swap_entry()

Call pfn_swap_entry_folio() in ptep_zap_swap_entry() as preparation for
converting mm counter functions to take a folio.

Link: https://lkml.kernel.org/r/20240111152429.3374566-5-willy@infradead.org
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/s390/mm/pgtable.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 99422926efe1b5..7e5dd4b176642c 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -721,9 +721,9 @@ static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
 	if (!non_swap_entry(entry))
 		dec_mm_counter(mm, MM_SWAPENTS);
 	else if (is_migration_entry(entry)) {
-		struct page *page = pfn_swap_entry_to_page(entry);
+		struct folio *folio = pfn_swap_entry_folio(entry);
 
-		dec_mm_counter(mm, mm_counter(page));
+		dec_mm_counter(mm, mm_counter(&folio->page));
 	}
 	free_swap_and_cache(entry);
 }

From 92473e04c1a9278e9cf74cdb079a892d52e39aa0 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Thu, 11 Jan 2024 15:24:24 +0000
Subject: [PATCH 447/707] mm: use pfn_swap_entry_folio() in
 __split_huge_pmd_locked()

Call pfn_swap_entry_folio() in __split_huge_pmd_locked() as preparation
for converting mm counter functions to take a folio.

Link: https://lkml.kernel.org/r/20240111152429.3374566-6-willy@infradead.org
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/huge_memory.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 5468b2f97cbf70..33b720037ab725 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2442,7 +2442,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 			swp_entry_t entry;
 
 			entry = pmd_to_swp_entry(old_pmd);
-			page = pfn_swap_entry_to_page(entry);
+			folio = pfn_swap_entry_folio(entry);
 		} else {
 			page = pmd_page(old_pmd);
 			folio = page_folio(page);
@@ -2453,7 +2453,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 			folio_remove_rmap_pmd(folio, page, vma);
 			folio_put(folio);
 		}
-		add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
+		add_mm_counter(mm, mm_counter_file(&folio->page), -HPAGE_PMD_NR);
 		return;
 	}
 

From 18f2fbe5cffb41513aaea760ad95f3c4f0b14ce7 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Thu, 11 Jan 2024 15:24:25 +0000
Subject: [PATCH 448/707] mm: use pfn_swap_entry_to_folio() in zap_huge_pmd()

Call pfn_swap_entry_to_folio() in zap_huge_pmd() as preparation for
converting mm counter functions to take a folio.  Saves a call to
compound_head() embedded inside PageAnon().

Link: https://lkml.kernel.org/r/20240111152429.3374566-7-willy@infradead.org
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/huge_memory.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 33b720037ab725..7a28a7db08ea0d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1905,12 +1905,14 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		zap_deposited_table(tlb->mm, pmd);
 		spin_unlock(ptl);
 	} else {
-		struct page *page = NULL;
+		struct folio *folio = NULL;
 		int flush_needed = 1;
 
 		if (pmd_present(orig_pmd)) {
-			page = pmd_page(orig_pmd);
-			folio_remove_rmap_pmd(page_folio(page), page, vma);
+			struct page *page = pmd_page(orig_pmd);
+
+			folio = page_folio(page);
+			folio_remove_rmap_pmd(folio, page, vma);
 			VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
 			VM_BUG_ON_PAGE(!PageHead(page), page);
 		} else if (thp_migration_supported()) {
@@ -1918,23 +1920,24 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 
 			VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
 			entry = pmd_to_swp_entry(orig_pmd);
-			page = pfn_swap_entry_to_page(entry);
+			folio = pfn_swap_entry_folio(entry);
 			flush_needed = 0;
 		} else
 			WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
 
-		if (PageAnon(page)) {
+		if (folio_test_anon(folio)) {
 			zap_deposited_table(tlb->mm, pmd);
 			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
 		} else {
 			if (arch_needs_pgtable_deposit())
 				zap_deposited_table(tlb->mm, pmd);
-			add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR);
+			add_mm_counter(tlb->mm, mm_counter_file(&folio->page),
+				       -HPAGE_PMD_NR);
 		}
 
 		spin_unlock(ptl);
 		if (flush_needed)
-			tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
+			tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
 	}
 	return 1;
 }

From 948d3bb7dc9eb922cca81b0b454e9c9ec9ba7c4c Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Thu, 11 Jan 2024 15:24:26 +0000
Subject: [PATCH 449/707] mm: use pfn_swap_entry_folio() in
 copy_nonpresent_pte()

Call pfn_swap_entry_folio() as preparation for converting mm counter
functions to take a folio.

Link: https://lkml.kernel.org/r/20240111152429.3374566-8-willy@infradead.org
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/memory.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 762458faec6969..3d28705b00616d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -806,9 +806,9 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		}
 		rss[MM_SWAPENTS]++;
 	} else if (is_migration_entry(entry)) {
-		page = pfn_swap_entry_to_page(entry);
+		folio = pfn_swap_entry_folio(entry);
 
-		rss[mm_counter(page)]++;
+		rss[mm_counter(&folio->page)]++;
 
 		if (!is_readable_migration_entry(entry) &&
 				is_cow_mapping(vm_flags)) {

From 9134bed5edf76d66a67741dfea662aacab55e7b7 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Thu, 11 Jan 2024 15:24:27 +0000
Subject: [PATCH 450/707] mm: convert to should_zap_page() to
 should_zap_folio()

Make should_zap_page() take a folio and rename it to should_zap_folio() as
preparation for converting mm counter functions to take a folio.  Saves a
call to compound_head() hidden inside PageAnon().

Link: https://lkml.kernel.org/r/20240111152429.3374566-9-willy@infradead.org
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/memory.c | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 3d28705b00616d..25455674c047bc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1369,19 +1369,20 @@ static inline bool should_zap_cows(struct zap_details *details)
 	return details->even_cows;
 }
 
-/* Decides whether we should zap this page with the page pointer specified */
-static inline bool should_zap_page(struct zap_details *details, struct page *page)
+/* Decides whether we should zap this folio with the folio pointer specified */
+static inline bool should_zap_folio(struct zap_details *details,
+				    struct folio *folio)
 {
-	/* If we can make a decision without *page.. */
+	/* If we can make a decision without *folio.. */
 	if (should_zap_cows(details))
 		return true;
 
-	/* E.g. the caller passes NULL for the case of a zero page */
-	if (!page)
+	/* E.g. the caller passes NULL for the case of a zero folio */
+	if (!folio)
 		return true;
 
-	/* Otherwise we should only zap non-anon pages */
-	return !PageAnon(page);
+	/* Otherwise we should only zap non-anon folios */
+	return !folio_test_anon(folio);
 }
 
 static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
@@ -1447,7 +1448,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			unsigned int delay_rmap;
 
 			page = vm_normal_page(vma, addr, ptent);
-			if (unlikely(!should_zap_page(details, page)))
+			if (page)
+				folio = page_folio(page);
+
+			if (unlikely(!should_zap_folio(details, folio)))
 				continue;
 			ptent = ptep_get_and_clear_full(mm, addr, pte,
 							tlb->fullmm);
@@ -1460,7 +1464,6 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 				continue;
 			}
 
-			folio = page_folio(page);
 			delay_rmap = 0;
 			if (!folio_test_anon(folio)) {
 				if (pte_dirty(ptent)) {
@@ -1492,7 +1495,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 		    is_device_exclusive_entry(entry)) {
 			page = pfn_swap_entry_to_page(entry);
 			folio = page_folio(page);
-			if (unlikely(!should_zap_page(details, page)))
+			if (unlikely(!should_zap_folio(details, folio)))
 				continue;
 			/*
 			 * Both device private/exclusive mappings should only
@@ -1513,10 +1516,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			if (unlikely(!free_swap_and_cache(entry)))
 				print_bad_pte(vma, addr, ptent, NULL);
 		} else if (is_migration_entry(entry)) {
-			page = pfn_swap_entry_to_page(entry);
-			if (!should_zap_page(details, page))
+			folio = pfn_swap_entry_folio(entry);
+			if (!should_zap_folio(details, folio))
 				continue;
-			rss[mm_counter(page)]--;
+			rss[mm_counter(&folio->page)]--;
 		} else if (pte_marker_entry_uffd_wp(entry)) {
 			/*
 			 * For anon: always drop the marker; for file: only

From c221b4d007869f0da73e12b3685ba8c2434f586a Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Fri, 12 Jan 2024 18:14:32 +0800
Subject: [PATCH 451/707] mm-convert-to-should_zap_page-to-should_zap_folio-fix

fix used-uninitialized warning

Link: https://lkml.kernel.org/r/962a7993-fce9-4de8-85cd-25e290f25736@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202401121250.A221BL2D-lkp@intel.com/
Cc: David Hildenbrand <david@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/memory.c b/mm/memory.c
index 25455674c047bc..96eae99bb25c37 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1435,7 +1435,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 	arch_enter_lazy_mmu_mode();
 	do {
 		pte_t ptent = ptep_get(pte);
-		struct folio *folio;
+		struct folio *folio = NULL;
 		struct page *page;
 
 		if (pte_none(ptent))

From dbf44e93baa056ca5b60a25afa50329f5046138f Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Thu, 11 Jan 2024 15:24:28 +0000
Subject: [PATCH 452/707] mm: convert mm_counter() to take a folio

Now all callers of mm_counter() have a folio, convert mm_counter() to take
a folio.  Saves a call to compound_head() hidden inside PageAnon().

Link: https://lkml.kernel.org/r/20240111152429.3374566-10-willy@infradead.org
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/s390/mm/pgtable.c |  2 +-
 include/linux/mm.h     |  6 +++---
 mm/memory.c            | 10 +++++-----
 mm/rmap.c              |  8 ++++----
 mm/userfaultfd.c       |  2 +-
 5 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 7e5dd4b176642c..b71432b15d665c 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -723,7 +723,7 @@ static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
 	else if (is_migration_entry(entry)) {
 		struct folio *folio = pfn_swap_entry_folio(entry);
 
-		dec_mm_counter(mm, mm_counter(&folio->page));
+		dec_mm_counter(mm, mm_counter(folio));
 	}
 	free_swap_and_cache(entry);
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f5a97dec516948..22e597b36b3887 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2603,11 +2603,11 @@ static inline int mm_counter_file(struct page *page)
 	return MM_FILEPAGES;
 }
 
-static inline int mm_counter(struct page *page)
+static inline int mm_counter(struct folio *folio)
 {
-	if (PageAnon(page))
+	if (folio_test_anon(folio))
 		return MM_ANONPAGES;
-	return mm_counter_file(page);
+	return mm_counter_file(&folio->page);
 }
 
 static inline unsigned long get_mm_rss(struct mm_struct *mm)
diff --git a/mm/memory.c b/mm/memory.c
index 96eae99bb25c37..b04315a7ad2db2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -808,7 +808,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	} else if (is_migration_entry(entry)) {
 		folio = pfn_swap_entry_folio(entry);
 
-		rss[mm_counter(&folio->page)]++;
+		rss[mm_counter(folio)]++;
 
 		if (!is_readable_migration_entry(entry) &&
 				is_cow_mapping(vm_flags)) {
@@ -840,7 +840,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		 * keep things as they are.
 		 */
 		folio_get(folio);
-		rss[mm_counter(page)]++;
+		rss[mm_counter(folio)]++;
 		/* Cannot fail as these pages cannot get pinned. */
 		folio_try_dup_anon_rmap_pte(folio, page, src_vma);
 
@@ -1476,7 +1476,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 				if (pte_young(ptent) && likely(vma_has_recency(vma)))
 					folio_mark_accessed(folio);
 			}
-			rss[mm_counter(page)]--;
+			rss[mm_counter(folio)]--;
 			if (!delay_rmap) {
 				folio_remove_rmap_pte(folio, page, vma);
 				if (unlikely(page_mapcount(page) < 0))
@@ -1504,7 +1504,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			 * see zap_install_uffd_wp_if_needed().
 			 */
 			WARN_ON_ONCE(!vma_is_anonymous(vma));
-			rss[mm_counter(page)]--;
+			rss[mm_counter(folio)]--;
 			if (is_device_private_entry(entry))
 				folio_remove_rmap_pte(folio, page, vma);
 			folio_put(folio);
@@ -1519,7 +1519,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			folio = pfn_swap_entry_folio(entry);
 			if (!should_zap_folio(details, folio))
 				continue;
-			rss[mm_counter(&folio->page)]--;
+			rss[mm_counter(folio)]--;
 		} else if (pte_marker_entry_uffd_wp(entry)) {
 			/*
 			 * For anon: always drop the marker; for file: only
diff --git a/mm/rmap.c b/mm/rmap.c
index f5d43edad529a7..4648cf1d8178b5 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1780,7 +1780,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 				set_huge_pte_at(mm, address, pvmw.pte, pteval,
 						hsz);
 			} else {
-				dec_mm_counter(mm, mm_counter(&folio->page));
+				dec_mm_counter(mm, mm_counter(folio));
 				set_pte_at(mm, address, pvmw.pte, pteval);
 			}
 
@@ -1795,7 +1795,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			 * migration) will not expect userfaults on already
 			 * copied pages.
 			 */
-			dec_mm_counter(mm, mm_counter(&folio->page));
+			dec_mm_counter(mm, mm_counter(folio));
 		} else if (folio_test_anon(folio)) {
 			swp_entry_t entry = page_swap_entry(subpage);
 			pte_t swp_pte;
@@ -2181,7 +2181,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 				set_huge_pte_at(mm, address, pvmw.pte, pteval,
 						hsz);
 			} else {
-				dec_mm_counter(mm, mm_counter(&folio->page));
+				dec_mm_counter(mm, mm_counter(folio));
 				set_pte_at(mm, address, pvmw.pte, pteval);
 			}
 
@@ -2196,7 +2196,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 			 * migration) will not expect userfaults on already
 			 * copied pages.
 			 */
-			dec_mm_counter(mm, mm_counter(&folio->page));
+			dec_mm_counter(mm, mm_counter(folio));
 		} else {
 			swp_entry_t entry;
 			pte_t swp_pte;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 7cf7d43842590c..ae80c37148290a 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -124,7 +124,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
 	 * Must happen after rmap, as mm_counter() checks mapping (via
 	 * PageAnon()), which is set by __page_set_anon_rmap().
 	 */
-	inc_mm_counter(dst_mm, mm_counter(page));
+	inc_mm_counter(dst_mm, mm_counter(folio));
 
 	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
 

From f06d3e9d1d39d9b67d691902d0debf94c0dcd2c8 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Thu, 11 Jan 2024 15:24:29 +0000
Subject: [PATCH 453/707] mm: convert mm_counter_file() to take a folio

Now all callers of mm_counter_file() have a folio, convert
mm_counter_file() to take a folio.  Saves a call to compound_head() hidden
inside PageSwapBacked().

Link: https://lkml.kernel.org/r/20240111152429.3374566-11-willy@infradead.org
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h      |  8 ++++----
 kernel/events/uprobes.c |  2 +-
 mm/huge_memory.c        |  4 ++--
 mm/khugepaged.c         |  4 ++--
 mm/memory.c             | 10 +++++-----
 mm/rmap.c               |  2 +-
 6 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 22e597b36b3887..ac6b71cbdffbfa 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2595,10 +2595,10 @@ static inline void dec_mm_counter(struct mm_struct *mm, int member)
 	mm_trace_rss_stat(mm, member);
 }
 
-/* Optimized variant when page is already known not to be PageAnon */
-static inline int mm_counter_file(struct page *page)
+/* Optimized variant when folio is already known not to be anon */
+static inline int mm_counter_file(struct folio *folio)
 {
-	if (PageSwapBacked(page))
+	if (folio_test_swapbacked(folio))
 		return MM_SHMEMPAGES;
 	return MM_FILEPAGES;
 }
@@ -2607,7 +2607,7 @@ static inline int mm_counter(struct folio *folio)
 {
 	if (folio_test_anon(folio))
 		return MM_ANONPAGES;
-	return mm_counter_file(&folio->page);
+	return mm_counter_file(folio);
 }
 
 static inline unsigned long get_mm_rss(struct mm_struct *mm)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 929e98c629652a..e4834d23e1d1a2 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -188,7 +188,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 		dec_mm_counter(mm, MM_ANONPAGES);
 
 	if (!folio_test_anon(old_folio)) {
-		dec_mm_counter(mm, mm_counter_file(old_page));
+		dec_mm_counter(mm, mm_counter_file(old_folio));
 		inc_mm_counter(mm, MM_ANONPAGES);
 	}
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7a28a7db08ea0d..f005f04247355f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1931,7 +1931,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		} else {
 			if (arch_needs_pgtable_deposit())
 				zap_deposited_table(tlb->mm, pmd);
-			add_mm_counter(tlb->mm, mm_counter_file(&folio->page),
+			add_mm_counter(tlb->mm, mm_counter_file(folio),
 				       -HPAGE_PMD_NR);
 		}
 
@@ -2456,7 +2456,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 			folio_remove_rmap_pmd(folio, page, vma);
 			folio_put(folio);
 		}
-		add_mm_counter(mm, mm_counter_file(&folio->page), -HPAGE_PMD_NR);
+		add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR);
 		return;
 	}
 
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 2b219acb528e25..fe43fbc4452539 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1634,7 +1634,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 	/* step 3: set proper refcount and mm_counters. */
 	if (nr_ptes) {
 		folio_ref_sub(folio, nr_ptes);
-		add_mm_counter(mm, mm_counter_file(&folio->page), -nr_ptes);
+		add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
 	}
 
 	/* step 4: remove empty page table */
@@ -1665,7 +1665,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 	if (nr_ptes) {
 		flush_tlb_mm(mm);
 		folio_ref_sub(folio, nr_ptes);
-		add_mm_counter(mm, mm_counter_file(&folio->page), -nr_ptes);
+		add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
 	}
 	if (start_pte)
 		pte_unmap_unlock(start_pte, ptl);
diff --git a/mm/memory.c b/mm/memory.c
index b04315a7ad2db2..f5750bfcdc058f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -966,7 +966,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 	} else if (page) {
 		folio_get(folio);
 		folio_dup_file_rmap_pte(folio, page);
-		rss[mm_counter_file(page)]++;
+		rss[mm_counter_file(folio)]++;
 	}
 
 	/*
@@ -1873,7 +1873,7 @@ static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
 		return -EBUSY;
 	/* Ok, finally just insert the thing.. */
 	folio_get(folio);
-	inc_mm_counter(vma->vm_mm, mm_counter_file(page));
+	inc_mm_counter(vma->vm_mm, mm_counter_file(folio));
 	folio_add_file_rmap_pte(folio, page, vma);
 	set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot));
 	return 0;
@@ -3178,7 +3178,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 	if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
 		if (old_folio) {
 			if (!folio_test_anon(old_folio)) {
-				dec_mm_counter(mm, mm_counter_file(&old_folio->page));
+				dec_mm_counter(mm, mm_counter_file(old_folio));
 				inc_mm_counter(mm, MM_ANONPAGES);
 			}
 		} else {
@@ -4463,7 +4463,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
 	if (write)
 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 
-	add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
+	add_mm_counter(vma->vm_mm, mm_counter_file(folio), HPAGE_PMD_NR);
 	folio_add_file_rmap_pmd(folio, page, vma);
 
 	/*
@@ -4526,7 +4526,7 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,
 		folio_add_new_anon_rmap(folio, vma, addr);
 		folio_add_lru_vma(folio, vma);
 	} else {
-		add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
+		add_mm_counter(vma->vm_mm, mm_counter_file(folio), nr);
 		folio_add_file_rmap_ptes(folio, page, nr, vma);
 	}
 	set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr);
diff --git a/mm/rmap.c b/mm/rmap.c
index 4648cf1d8178b5..1cf2bffa48ed87 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1903,7 +1903,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			 *
 			 * See Documentation/mm/mmu_notifier.rst
 			 */
-			dec_mm_counter(mm, mm_counter_file(&folio->page));
+			dec_mm_counter(mm, mm_counter_file(folio));
 		}
 discard:
 		if (unlikely(folio_test_hugetlb(folio)))

From c57c2645447bdc671c8f7c0810a39f1a77028420 Mon Sep 17 00:00:00 2001
From: Hui Zhu <teawater@antgroup.com>
Date: Thu, 11 Jan 2024 08:45:33 +0000
Subject: [PATCH 454/707] fs/proc/task_mmu.c: add_to_pagemap: remove useless
 parameter addr

Function parameter addr of add_to_pagemap() is useless.  Remove it.

Link: https://lkml.kernel.org/r/20240111084533.40038-1-teawaterz@linux.alibaba.com
Signed-off-by: Hui Zhu <teawater@antgroup.com>
Reviewed-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
Tested-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrei Vagin <avagin@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/task_mmu.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ac6ea2cc2ee8fe..23fbab954c20b6 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1352,8 +1352,7 @@ static inline pagemap_entry_t make_pme(u64 frame, u64 flags)
 	return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags };
 }
 
-static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
-			  struct pagemapread *pm)
+static int add_to_pagemap(pagemap_entry_t *pme, struct pagemapread *pm)
 {
 	pm->buffer[pm->pos++] = *pme;
 	if (pm->pos >= pm->len)
@@ -1380,7 +1379,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
 			hole_end = end;
 
 		for (; addr < hole_end; addr += PAGE_SIZE) {
-			err = add_to_pagemap(addr, &pme, pm);
+			err = add_to_pagemap(&pme, pm);
 			if (err)
 				goto out;
 		}
@@ -1392,7 +1391,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
 		if (vma->vm_flags & VM_SOFTDIRTY)
 			pme = make_pme(0, PM_SOFT_DIRTY);
 		for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
-			err = add_to_pagemap(addr, &pme, pm);
+			err = add_to_pagemap(&pme, pm);
 			if (err)
 				goto out;
 		}
@@ -1519,7 +1518,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
 		for (; addr != end; addr += PAGE_SIZE) {
 			pagemap_entry_t pme = make_pme(frame, flags);
 
-			err = add_to_pagemap(addr, &pme, pm);
+			err = add_to_pagemap(&pme, pm);
 			if (err)
 				break;
 			if (pm->show_pfn) {
@@ -1547,7 +1546,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
 		pagemap_entry_t pme;
 
 		pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte));
-		err = add_to_pagemap(addr, &pme, pm);
+		err = add_to_pagemap(&pme, pm);
 		if (err)
 			break;
 	}
@@ -1597,7 +1596,7 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
 	for (; addr != end; addr += PAGE_SIZE) {
 		pagemap_entry_t pme = make_pme(frame, flags);
 
-		err = add_to_pagemap(addr, &pme, pm);
+		err = add_to_pagemap(&pme, pm);
 		if (err)
 			return err;
 		if (pm->show_pfn && (flags & PM_PRESENT))

From 98158c18452b502a76900ddfaec31f68a54a82e3 Mon Sep 17 00:00:00 2001
From: Carlos Galo <carlosgalo@google.com>
Date: Thu, 11 Jan 2024 21:05:30 +0000
Subject: [PATCH 455/707] mm: update mark_victim tracepoints fields

The current implementation of the mark_victim tracepoint provides only the
process ID (pid) of the victim process.  This limitation poses challenges
for userspace tools that need additional information about the OOM victim.
The association between pid and the additional data may be lost after the
kill, making it difficult for userspace to correlate the OOM event with
the specific process.

In order to mitigate this limitation, add the following fields:

- UID
   In Android each installed application has a unique UID. Including
   the `uid` assists in correlating OOM events with specific apps.

- Process Name (comm)
   Enables identification of the affected process.

- OOM Score
   Allows userspace to get additional insights of the relative kill
   priority of the OOM victim.

Link: https://lkml.kernel.org/r/20240111210539.636607-1-carlosgalo@google.com
Signed-off-by: Carlos Galo <carlosgalo@google.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/trace/events/oom.h | 19 +++++++++++++++----
 mm/oom_kill.c              |  6 +++++-
 2 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h
index 26a11e4a2c361d..3c5941da80755b 100644
--- a/include/trace/events/oom.h
+++ b/include/trace/events/oom.h
@@ -72,19 +72,30 @@ TRACE_EVENT(reclaim_retry_zone,
 );
 
 TRACE_EVENT(mark_victim,
-	TP_PROTO(int pid),
+	TP_PROTO(struct task_struct *task, uid_t uid),
 
-	TP_ARGS(pid),
+	TP_ARGS(task, uid),
 
 	TP_STRUCT__entry(
 		__field(int, pid)
+		__field(uid_t, uid)
+		__string(comm, task->comm)
+		__field(short, oom_score_adj)
 	),
 
 	TP_fast_assign(
-		__entry->pid = pid;
+		__entry->pid = task->pid;
+		__entry->uid = uid;
+		__assign_str(comm, task->comm);
+		__entry->oom_score_adj = task->signal->oom_score_adj;
 	),
 
-	TP_printk("pid=%d", __entry->pid)
+	TP_printk("pid=%d uid=%u comm=%s oom_score_adj=%hd",
+		__entry->pid,
+		__entry->uid,
+		__get_str(comm),
+		__entry->oom_score_adj
+	)
 );
 
 TRACE_EVENT(wake_reaper,
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 91ccd82097c2ba..8d6a207c3c5905 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -44,6 +44,7 @@
 #include <linux/kthread.h>
 #include <linux/init.h>
 #include <linux/mmu_notifier.h>
+#include <linux/cred.h>
 
 #include <asm/tlb.h>
 #include "internal.h"
@@ -754,6 +755,7 @@ static inline void queue_oom_reaper(struct task_struct *tsk)
  */
 static void mark_oom_victim(struct task_struct *tsk)
 {
+	const struct cred *cred;
 	struct mm_struct *mm = tsk->mm;
 
 	WARN_ON(oom_killer_disabled);
@@ -773,7 +775,9 @@ static void mark_oom_victim(struct task_struct *tsk)
 	 */
 	__thaw_task(tsk);
 	atomic_inc(&oom_victims);
-	trace_mark_victim(tsk->pid);
+	cred = get_task_cred(tsk);
+	trace_mark_victim(tsk, cred->uid.val);
+	put_cred(cred);
 }
 
 /**

From 7eda6ba9dc62ea00c611e77c61ec18796e1428e4 Mon Sep 17 00:00:00 2001
From: Pankaj Raghav <p.raghav@samsung.com>
Date: Mon, 15 Jan 2024 11:25:22 +0100
Subject: [PATCH 456/707] readahead: use ilog2 instead of a while loop in
 page_cache_ra_order()

A while loop is used to adjust the new_order to be lower than the
ra->size.  ilog2 could be used to do the same instead of using a loop.

ilog2 typically resolves to a bit scan reverse instruction.  This is
particularly useful when ra->size is smaller than the 2^new_order as it
resolves in one instruction instead of looping to find the new_order.

No functional changes.

Link: https://lkml.kernel.org/r/20240115102523.2336742-1-kernel@pankajraghav.com
Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/readahead.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/mm/readahead.c b/mm/readahead.c
index 2648ec4f04947b..1e74455f908e50 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -500,10 +500,8 @@ void page_cache_ra_order(struct readahead_control *ractl,
 
 	if (new_order < MAX_PAGECACHE_ORDER) {
 		new_order += 2;
-		if (new_order > MAX_PAGECACHE_ORDER)
-			new_order = MAX_PAGECACHE_ORDER;
-		while ((1 << new_order) > ra->size)
-			new_order--;
+		new_order = min_t(unsigned int, MAX_PAGECACHE_ORDER, new_order);
+		new_order = min_t(unsigned int, new_order, ilog2(ra->size));
 	}
 
 	filemap_invalidate_lock_shared(mapping);

From ca6196c1e3f77ea919406fe4295630e29c23f7d5 Mon Sep 17 00:00:00 2001
From: Nanyong Sun <sunnanyong@huawei.com>
Date: Sat, 13 Jan 2024 17:44:34 +0800
Subject: [PATCH 457/707] mm: HVO: introduce helper function to update and
 flush pgtable

Patch series "A Solution to Re-enable hugetlb vmemmap optimize", v3.

HVO was previously disabled on arm64 [1] due to the lack of necessary
BBM(break-before-make) logic when changing page tables.  This set of
patches fix this by adding necessary BBM sequence when changing page
table, and supporting vmemmap page fault handling to fixup kernel address
translation fault if vmemmap is concurrently accessed.

I have tested this patch set with concurrently accessing the vmemmap
address when do BBM and can recover by vmemmap fault handler.  Also tested
under the config of 2/3/4 pgtable levels with 4K/64K page size and all
works well.


This patch (of 3):

Add pmd/pte update and tlb flush helper function to update page table.
This refactoring patch is designed to facilitate each architecture to
implement its own special logic in preparation for the arm64 architecture
to follow the necessary break-before-make sequence when updating page
tables.

Link: https://lkml.kernel.org/r/20240113094436.2506396-1-sunnanyong@huawei.com
Link: https://lkml.kernel.org/r/20240113094436.2506396-2-sunnanyong@huawei.com
Signed-off-by: Nanyong Sun <sunnanyong@huawei.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/hugetlb_vmemmap.c | 55 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 43 insertions(+), 12 deletions(-)

diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index da177e49d95648..f1f5702bce4f6a 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -46,6 +46,37 @@ struct vmemmap_remap_walk {
 	unsigned long		flags;
 };
 
+#ifndef vmemmap_update_pmd
+static inline void vmemmap_update_pmd(unsigned long addr,
+				      pmd_t *pmdp, pte_t *ptep)
+{
+	pmd_populate_kernel(&init_mm, pmdp, ptep);
+}
+#endif
+
+#ifndef vmemmap_update_pte
+static inline void vmemmap_update_pte(unsigned long addr,
+				      pte_t *ptep, pte_t pte)
+{
+	set_pte_at(&init_mm, addr, ptep, pte);
+}
+#endif
+
+#ifndef vmemmap_flush_tlb_all
+static inline void vmemmap_flush_tlb_all(void)
+{
+	flush_tlb_all();
+}
+#endif
+
+#ifndef vmemmap_flush_tlb_range
+static inline void vmemmap_flush_tlb_range(unsigned long start,
+					   unsigned long end)
+{
+	flush_tlb_kernel_range(start, end);
+}
+#endif
+
 static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start,
 			     struct vmemmap_remap_walk *walk)
 {
@@ -81,9 +112,9 @@ static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start,
 
 		/* Make pte visible before pmd. See comment in pmd_install(). */
 		smp_wmb();
-		pmd_populate_kernel(&init_mm, pmd, pgtable);
+		vmemmap_update_pmd(start, pmd, pgtable);
 		if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH))
-			flush_tlb_kernel_range(start, start + PMD_SIZE);
+			vmemmap_flush_tlb_range(start, start + PMD_SIZE);
 	} else {
 		pte_free_kernel(&init_mm, pgtable);
 	}
@@ -171,7 +202,7 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end,
 		return ret;
 
 	if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH))
-		flush_tlb_kernel_range(start, end);
+		vmemmap_flush_tlb_range(start, end);
 
 	return 0;
 }
@@ -217,15 +248,15 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
 
 		/*
 		 * Makes sure that preceding stores to the page contents from
-		 * vmemmap_remap_free() become visible before the set_pte_at()
-		 * write.
+		 * vmemmap_remap_free() become visible before the
+		 * vmemmap_update_pte() write.
 		 */
 		smp_wmb();
 	}
 
 	entry = mk_pte(walk->reuse_page, pgprot);
 	list_add(&page->lru, walk->vmemmap_pages);
-	set_pte_at(&init_mm, addr, pte, entry);
+	vmemmap_update_pte(addr, pte, entry);
 }
 
 /*
@@ -264,10 +295,10 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
 
 	/*
 	 * Makes sure that preceding stores to the page contents become visible
-	 * before the set_pte_at() write.
+	 * before the vmemmap_update_pte() write.
 	 */
 	smp_wmb();
-	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
+	vmemmap_update_pte(addr, pte, mk_pte(page, pgprot));
 }
 
 /**
@@ -519,7 +550,7 @@ long hugetlb_vmemmap_restore_folios(const struct hstate *h,
 	}
 
 	if (restored)
-		flush_tlb_all();
+		vmemmap_flush_tlb_all();
 	if (!ret)
 		ret = restored;
 	return ret;
@@ -642,7 +673,7 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l
 			break;
 	}
 
-	flush_tlb_all();
+	vmemmap_flush_tlb_all();
 
 	list_for_each_entry(folio, folio_list, lru) {
 		int ret;
@@ -659,7 +690,7 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l
 		 * allowing more vmemmap remaps to occur.
 		 */
 		if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
-			flush_tlb_all();
+			vmemmap_flush_tlb_all();
 			free_vmemmap_page_list(&vmemmap_pages);
 			INIT_LIST_HEAD(&vmemmap_pages);
 			__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages,
@@ -667,7 +698,7 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l
 		}
 	}
 
-	flush_tlb_all();
+	vmemmap_flush_tlb_all();
 	free_vmemmap_page_list(&vmemmap_pages);
 }
 

From cb78d8ce2f43169251cc001f5c2a859295a1ecc4 Mon Sep 17 00:00:00 2001
From: Nanyong Sun <sunnanyong@huawei.com>
Date: Sat, 13 Jan 2024 17:44:35 +0800
Subject: [PATCH 458/707] arm64: mm: HVO: support BBM of vmemmap pgtable safely

Implement vmemmap_update_pmd and vmemmap_update_pte on arm64 to do
BBM(break-before-make) logic when change the page table of vmemmap
address, they will under the init_mm.page_table_lock.  If a translation
fault of vmemmap address concurrently happened after pte/pmd cleared,
vmemmap page fault handler will acquire the init_mm.page_table_lock to
wait for vmemmap update to complete, by then the virtual address is valid
again, so PF can return and access can continue.  In other case, do the
traditional kernel fault.

Implement vmemmap_flush_tlb_all/range on arm64 with nothing to do because
tlb already flushed in every single BBM.

Link: https://lkml.kernel.org/r/20240113094436.2506396-3-sunnanyong@huawei.com
Signed-off-by: Nanyong Sun <sunnanyong@huawei.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm64/include/asm/esr.h      |  4 ++
 arch/arm64/include/asm/pgtable.h  |  8 ++++
 arch/arm64/include/asm/tlbflush.h | 16 +++++++
 arch/arm64/mm/fault.c             | 78 +++++++++++++++++++++++++++++--
 arch/arm64/mm/mmu.c               | 28 +++++++++++
 5 files changed, 131 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
index 353fe08546cf90..f13d1a094fd1ab 100644
--- a/arch/arm64/include/asm/esr.h
+++ b/arch/arm64/include/asm/esr.h
@@ -116,6 +116,10 @@
 #define ESR_ELx_FSC_SERROR	(0x11)
 #define ESR_ELx_FSC_ACCESS	(0x08)
 #define ESR_ELx_FSC_FAULT	(0x04)
+#define ESR_ELx_FSC_FAULT_L0    (0x04)
+#define ESR_ELx_FSC_FAULT_L1    (0x05)
+#define ESR_ELx_FSC_FAULT_L2    (0x06)
+#define ESR_ELx_FSC_FAULT_L3    (0x07)
 #define ESR_ELx_FSC_PERM	(0x0C)
 #define ESR_ELx_FSC_SEA_TTW0	(0x14)
 #define ESR_ELx_FSC_SEA_TTW1	(0x15)
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 79ce70fbb751c6..b50270107e2f02 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1124,6 +1124,14 @@ extern pte_t ptep_modify_prot_start(struct vm_area_struct *vma,
 extern void ptep_modify_prot_commit(struct vm_area_struct *vma,
 				    unsigned long addr, pte_t *ptep,
 				    pte_t old_pte, pte_t new_pte);
+
+#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+void vmemmap_update_pmd(unsigned long addr, pmd_t *pmdp, pte_t *ptep);
+#define vmemmap_update_pmd vmemmap_update_pmd
+void vmemmap_update_pte(unsigned long addr, pte_t *ptep, pte_t pte);
+#define vmemmap_update_pte vmemmap_update_pte
+#endif
+
 #endif /* !__ASSEMBLY__ */
 
 #endif /* __ASM_PGTABLE_H */
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index 1deb5d789c2e23..79e932a1bdf87f 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -504,6 +504,22 @@ static inline void __flush_tlb_kernel_pgtable(unsigned long kaddr)
 	dsb(ish);
 	isb();
 }
+
+#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+static inline void vmemmap_flush_tlb_all(void)
+{
+	/* do nothing, already flushed tlb in every single BBM */
+}
+#define vmemmap_flush_tlb_all vmemmap_flush_tlb_all
+
+static inline void vmemmap_flush_tlb_range(unsigned long start,
+					   unsigned long end)
+{
+	/* do nothing, already flushed tlb in every single BBM */
+}
+#define vmemmap_flush_tlb_range vmemmap_flush_tlb_range
+#endif
+
 #endif
 
 #endif
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 55f6455a828434..13189322a38ff0 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -368,6 +368,75 @@ static bool is_el1_mte_sync_tag_check_fault(unsigned long esr)
 	return false;
 }
 
+#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+static inline bool vmemmap_fault_may_fixup(unsigned long addr,
+					   unsigned long esr)
+{
+	if (addr < VMEMMAP_START || addr >= VMEMMAP_END)
+		return false;
+
+	/*
+	 * Only try to handle translation fault level 2 or level 3,
+	 * because hugetlb vmemmap optimize only clear pmd or pte.
+	 */
+	switch (esr & ESR_ELx_FSC) {
+	case ESR_ELx_FSC_FAULT_L2:
+	case ESR_ELx_FSC_FAULT_L3:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/*
+ * PMD mapped vmemmap should has been split as PTE mapped
+ * by HVO now, here we only check this case, other cases
+ * should fail.
+ * Also should check the addr is healthy enough that will not cause
+ * a level2 or level3 translation fault again after page fault
+ * handled with success, so we need check both bits[1:0] of PMD and
+ * PTE as ARM Spec mentioned below:
+ * A Translation fault is generated if bits[1:0] of a translation
+ * table descriptor identify the descriptor as either a Fault
+ * encoding or a reserved encoding.
+ */
+static inline bool vmemmap_addr_healthy(unsigned long addr)
+{
+	pmd_t *pmdp, pmd;
+	pte_t *ptep, pte;
+
+	pmdp = pmd_off_k(addr);
+	pmd = pmdp_get(pmdp);
+	if (!pmd_table(pmd))
+		return false;
+
+	ptep = pte_offset_kernel(pmdp, addr);
+	pte = ptep_get(ptep);
+	return (pte_val(pte) & PTE_TYPE_MASK) == PTE_TYPE_PAGE;
+}
+
+static bool vmemmap_handle_page_fault(unsigned long addr,
+				      unsigned long esr)
+{
+	bool ret;
+
+	if (likely(!vmemmap_fault_may_fixup(addr, esr)))
+		return false;
+
+	spin_lock(&init_mm.page_table_lock);
+	ret = vmemmap_addr_healthy(addr);
+	spin_unlock(&init_mm.page_table_lock);
+
+	return ret;
+}
+#else
+static inline bool vmemmap_handle_page_fault(unsigned long addr,
+					     unsigned long esr)
+{
+	return false;
+}
+#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */
+
 static bool is_translation_fault(unsigned long esr)
 {
 	return (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_FAULT;
@@ -405,9 +474,12 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr,
 	} else if (addr < PAGE_SIZE) {
 		msg = "NULL pointer dereference";
 	} else {
-		if (is_translation_fault(esr) &&
-		    kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
-			return;
+		if (is_translation_fault(esr)) {
+			if (kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
+				return;
+			if (vmemmap_handle_page_fault(addr, esr))
+				return;
+		}
 
 		msg = "paging request";
 	}
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 1ac7467d34c9c3..d794b2f4b5a3cd 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1146,6 +1146,34 @@ int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
 	return 1;
 }
 
+#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+/*
+ * In the window between the page table entry is cleared and filled
+ * with a new value, other threads have the opportunity to concurrently
+ * access the vmemmap area then page translation fault occur.
+ * Therefore, we need to ensure that the init_mm.page_table_lock is held
+ * to synchronize the vmemmap page fault handling which will wait for
+ * this lock to be released to ensure that the page table entry has been
+ * refreshed with a new valid value.
+ */
+void vmemmap_update_pmd(unsigned long addr, pmd_t *pmdp, pte_t *ptep)
+{
+	lockdep_assert_held(&init_mm.page_table_lock);
+	pmd_clear(pmdp);
+	flush_tlb_kernel_range(addr, addr + PMD_SIZE);
+	pmd_populate_kernel(&init_mm, pmdp, ptep);
+}
+
+void vmemmap_update_pte(unsigned long addr, pte_t *ptep, pte_t pte)
+{
+	spin_lock(&init_mm.page_table_lock);
+	pte_clear(&init_mm, addr, ptep);
+	flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+	set_pte_at(&init_mm, addr, ptep, pte);
+	spin_unlock(&init_mm.page_table_lock);
+}
+#endif
+
 int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 		struct vmem_altmap *altmap)
 {

From a3045e96974aa75213f24576697ada761b5df489 Mon Sep 17 00:00:00 2001
From: Nanyong Sun <sunnanyong@huawei.com>
Date: Sat, 13 Jan 2024 17:44:36 +0800
Subject: [PATCH 459/707] arm64: mm: re-enable OPTIMIZE_HUGETLB_VMEMMAP

Now update of vmemmap page table can follow the rule of break-before-make
safely for arm64 architecture, re-enable HVO on arm64.

Link: https://lkml.kernel.org/r/20240113094436.2506396-4-sunnanyong@huawei.com
Signed-off-by: Nanyong Sun <sunnanyong@huawei.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm64/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index aa7c1d43513968..ad5dbaf3dc9f57 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -104,6 +104,7 @@ config ARM64
 	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
 	select ARCH_WANT_FRAME_POINTERS
 	select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36)
+	select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
 	select ARCH_WANT_LD_ORPHAN_WARN
 	select ARCH_WANTS_NO_INSTR
 	select ARCH_WANTS_THP_SWAP if ARM64_4K_PAGES

From 209826bbad1234aa8f683e73ab85ac5710399903 Mon Sep 17 00:00:00 2001
From: Ronald Monthero <debug.penguin32@gmail.com>
Date: Tue, 16 Jan 2024 23:31:45 +1000
Subject: [PATCH 460/707] mm/zswap: improve with alloc_workqueue() call

The core-api create_workqueue is deprecated, this patch replaces the
create_workqueue with alloc_workqueue.  The previous implementation
workqueue of zswap was a bounded workqueue, this patch uses
alloc_workqueue() to create an unbounded workqueue.  The WQ_UNBOUND
attribute is desirable making the workqueue not localized to a specific
cpu so that the scheduler is free to exercise improvisations in any
demanding scenarios for offloading cpu time slices for workqueues.  For
example if any other workqueues of the same primary cpu had to be served
which are WQ_HIGHPRI and WQ_CPU_INTENSIVE.  Also Unbound workqueue happens
to be more efficient in a system during memory pressure scenarios in
comparison to a bounded workqueue.

shrink_wq = alloc_workqueue("zswap-shrink",
                     WQ_UNBOUND|WQ_MEM_RECLAIM, 1);

Overall the change suggested in this patch should be seamless and does not
alter the existing behavior, other than the improvisation to be an
unbounded workqueue.

Link: https://lkml.kernel.org/r/20240116133145.12454-1-debug.penguin32@gmail.com
Signed-off-by: Ronald Monthero <debug.penguin32@gmail.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Vitaly Wool <vitaly.wool@konsulko.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/zswap.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index d2423247acfd64..e7b38aefb9afe7 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1884,7 +1884,8 @@ static int zswap_setup(void)
 		zswap_enabled = false;
 	}
 
-	shrink_wq = create_workqueue("zswap-shrink");
+	shrink_wq = alloc_workqueue("zswap-shrink",
+			WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
 	if (!shrink_wq)
 		goto fallback_fail;
 

From 2282c541aefdb0a425b928e1fb8b7c1fe410f722 Mon Sep 17 00:00:00 2001
From: Ryan Roberts <ryan.roberts@arm.com>
Date: Tue, 16 Jan 2024 14:12:35 +0000
Subject: [PATCH 461/707] tools/mm: add thpmaps script to dump THP usage info

With the proliferation of large folios for file-backed memory, and more
recently the introduction of multi-size THP for anonymous memory, it is
becoming useful to be able to see exactly how large folios are mapped into
processes.  For some architectures (e.g.  arm64), if most memory is mapped
using contpte-sized and -aligned blocks, TLB usage can be optimized so
it's useful to see where these requirements are and are not being met.

thpmaps is a Python utility that reads /proc/<pid>/smaps,
/proc/<pid>/pagemap and /proc/kpageflags to print information about how
transparent huge pages (both file and anon) are mapped to a specified
process or cgroup.  It aims to help users debug and optimize their
workloads.  In future we may wish to introduce stats directly into the
kernel (e.g.  smaps or similar), but for now this provides a short term
solution without the need to introduce any new ABI.

Run with help option for a full listing of the arguments:

    # ./thpmaps --help

--8<--
usage: thpmaps [-h] [--pid pid | --cgroup path] [--rollup]
               [--cont size[KMG]] [--inc-smaps] [--inc-empty]
               [--periodic sleep_ms]

Prints information about how transparent huge pages are mapped, either
system-wide, or for a specified process or cgroup.

When run with --pid, the user explicitly specifies the set of pids to
scan.  e.g.  "--pid 10 [--pid 134 ...]".  When run with --cgroup, the user
passes either a v1 or v2 cgroup and all pids that belong to the cgroup
subtree are scanned.  When run with neither --pid nor --cgroup, the full
set of pids on the system is gathered from /proc and scanned as if the
user had provided "--pid 1 --pid 2 ...".

A default set of statistics is always generated for THP mappings.
However, it is also possible to generate additional statistics for
"contiguous block mappings" where the block size is user-defined.

Statistics are maintained independently for anonymous and file-backed
(pagecache) memory and are shown both in kB and as a percentage of either
total anonymous or total file-backed memory as appropriate.

THP Statistics
--------------

Statistics are always generated for fully- and contiguously-mapped THPs
whose mapping address is aligned to their size, for each <size> supported
by the system.  Separate counters describe THPs mapped by PTE vs those
mapped by PMD.  (Although note a THP can only be mapped by PMD if it is
PMD-sized):

- anon-thp-pte-aligned-<size>kB
- file-thp-pte-aligned-<size>kB
- anon-thp-pmd-aligned-<size>kB
- file-thp-pmd-aligned-<size>kB

Similarly, statistics are always generated for fully- and contiguously-
mapped THPs whose mapping address is *not* aligned to their size, for each
<size> supported by the system.  Due to the unaligned mapping, it is
impossible to map by PMD, so there are only PTE counters for this case:

- anon-thp-pte-unaligned-<size>kB
- file-thp-pte-unaligned-<size>kB

Statistics are also always generated for mapped pages that belong to a THP
but where the is THP is *not* fully- and contiguously- mapped.  These
"partial" mappings are all counted in the same counter regardless of the
size of the THP that is partially mapped:

- anon-thp-pte-partial
- file-thp-pte-partial

Contiguous Block Statistics
---------------------------

An optional, additional set of statistics is generated for every
contiguous block size specified with `--cont <size>`.  These statistics
show how much memory is mapped in contiguous blocks of <size> and also
aligned to <size>.  A given contiguous block must all belong to the same
THP, but there is no requirement for it to be the *whole* THP.  Separate
counters describe contiguous blocks mapped by PTE vs those mapped by PMD:

- anon-cont-pte-aligned-<size>kB
- file-cont-pte-aligned-<size>kB
- anon-cont-pmd-aligned-<size>kB
- file-cont-pmd-aligned-<size>kB

As an example, if monitoring 64K contiguous blocks (--cont 64K), there are
a number of sources that could provide such blocks: a fully- and
contiguously-mapped 64K THP that is aligned to a 64K boundary would
provide 1 block.  A fully- and contiguously-mapped 128K THP that is
aligned to at least a 64K boundary would provide 2 blocks.  Or a 128K THP
that maps its first 100K, but contiguously and starting at a 64K boundary
would provide 1 block.  A fully- and contiguously-mapped 2M THP would
provide 32 blocks.  There are many other possible permutations.

options:
  -h, --help           show this help message and exit
  --pid pid            Process id of the target process. Maybe issued
                       multiple times to scan multiple processes. --pid
                       and --cgroup are mutually exclusive. If neither
                       are provided, all processes are scanned to
                       provide system-wide information.
  --cgroup path        Path to the target cgroup in sysfs. Iterates
                       over every pid in the cgroup and its children.
                       --pid and --cgroup are mutually exclusive. If
                       neither are provided, all processes are scanned
                       to provide system-wide information.
  --rollup             Sum the per-vma statistics to provide a summary
                       over the whole system, process or cgroup.
  --cont size[KMG]     Adds stats for memory that is mapped in
                       contiguous blocks of <size> and also aligned to
                       <size>. May be issued multiple times to track
                       multiple sized blocks. Useful to infer e.g.
                       arm64 contpte and hpa mappings. Size must be a
                       power-of-2 number of pages.
  --inc-smaps          Include all numerical, additive
                       /proc/<pid>/smaps stats in the output.
  --inc-empty          Show all statistics including those whose value
                       is 0.
  --periodic sleep_ms  Run in a loop, polling every sleep_ms
                       milliseconds.

Requires root privilege to access pagemap and kpageflags.
--8<--

Example command to summarise fully and partially mapped THPs and 64K
contiguous blocks over all VMAs in all processes in the system
(--inc-empty forces printing stats that are 0):

    # ./thpmaps --cont 64K --rollup --inc-empty

--8<--
anon-thp-pmd-aligned-2048kB:      139264 kB ( 6%)
file-thp-pmd-aligned-2048kB:           0 kB ( 0%)
anon-thp-pte-aligned-16kB:             0 kB ( 0%)
anon-thp-pte-aligned-32kB:             0 kB ( 0%)
anon-thp-pte-aligned-64kB:         72256 kB ( 3%)
anon-thp-pte-aligned-128kB:            0 kB ( 0%)
anon-thp-pte-aligned-256kB:            0 kB ( 0%)
anon-thp-pte-aligned-512kB:            0 kB ( 0%)
anon-thp-pte-aligned-1024kB:           0 kB ( 0%)
anon-thp-pte-aligned-2048kB:           0 kB ( 0%)
anon-thp-pte-unaligned-16kB:           0 kB ( 0%)
anon-thp-pte-unaligned-32kB:           0 kB ( 0%)
anon-thp-pte-unaligned-64kB:           0 kB ( 0%)
anon-thp-pte-unaligned-128kB:          0 kB ( 0%)
anon-thp-pte-unaligned-256kB:          0 kB ( 0%)
anon-thp-pte-unaligned-512kB:          0 kB ( 0%)
anon-thp-pte-unaligned-1024kB:         0 kB ( 0%)
anon-thp-pte-unaligned-2048kB:         0 kB ( 0%)
anon-thp-pte-partial:              63232 kB ( 3%)
file-thp-pte-aligned-16kB:        809024 kB (47%)
file-thp-pte-aligned-32kB:         43168 kB ( 3%)
file-thp-pte-aligned-64kB:         98496 kB ( 6%)
file-thp-pte-aligned-128kB:        17536 kB ( 1%)
file-thp-pte-aligned-256kB:            0 kB ( 0%)
file-thp-pte-aligned-512kB:            0 kB ( 0%)
file-thp-pte-aligned-1024kB:           0 kB ( 0%)
file-thp-pte-aligned-2048kB:           0 kB ( 0%)
file-thp-pte-unaligned-16kB:       21712 kB ( 1%)
file-thp-pte-unaligned-32kB:         704 kB ( 0%)
file-thp-pte-unaligned-64kB:         896 kB ( 0%)
file-thp-pte-unaligned-128kB:      44928 kB ( 3%)
file-thp-pte-unaligned-256kB:          0 kB ( 0%)
file-thp-pte-unaligned-512kB:          0 kB ( 0%)
file-thp-pte-unaligned-1024kB:         0 kB ( 0%)
file-thp-pte-unaligned-2048kB:         0 kB ( 0%)
file-thp-pte-partial:               9252 kB ( 1%)
anon-cont-pmd-aligned-64kB:       139264 kB ( 6%)
file-cont-pmd-aligned-64kB:            0 kB ( 0%)
anon-cont-pte-aligned-64kB:       100672 kB ( 4%)
file-cont-pte-aligned-64kB:       161856 kB ( 9%)
--8<--

Link: https://lkml.kernel.org/r/20240116141235.960842-1-ryan.roberts@arm.com
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Tested-by: Barry Song <v-songbaohua@oppo.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: William Kucharski <william.kucharski@oracle.com>
Cc: Zenghui Yu <yuzenghui@huawei.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/mm/Makefile |   9 +-
 tools/mm/thpmaps  | 675 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 680 insertions(+), 4 deletions(-)
 create mode 100644 tools/mm/thpmaps

diff --git a/tools/mm/Makefile b/tools/mm/Makefile
index 1c5606cc33346b..7bb03606b9eaa2 100644
--- a/tools/mm/Makefile
+++ b/tools/mm/Makefile
@@ -3,7 +3,8 @@
 #
 include ../scripts/Makefile.include
 
-TARGETS=page-types slabinfo page_owner_sort
+BUILD_TARGETS=page-types slabinfo page_owner_sort
+INSTALL_TARGETS = $(BUILD_TARGETS) thpmaps
 
 LIB_DIR = ../lib/api
 LIBS = $(LIB_DIR)/libapi.a
@@ -11,9 +12,9 @@ LIBS = $(LIB_DIR)/libapi.a
 CFLAGS += -Wall -Wextra -I../lib/ -pthread
 LDFLAGS += $(LIBS) -pthread
 
-all: $(TARGETS)
+all: $(BUILD_TARGETS)
 
-$(TARGETS): $(LIBS)
+$(BUILD_TARGETS): $(LIBS)
 
 $(LIBS):
 	make -C $(LIB_DIR)
@@ -29,4 +30,4 @@ sbindir ?= /usr/sbin
 
 install: all
 	install -d $(DESTDIR)$(sbindir)
-	install -m 755 -p $(TARGETS) $(DESTDIR)$(sbindir)
+	install -m 755 -p $(INSTALL_TARGETS) $(DESTDIR)$(sbindir)
diff --git a/tools/mm/thpmaps b/tools/mm/thpmaps
new file mode 100644
index 00000000000000..803e0318f2fea1
--- /dev/null
+++ b/tools/mm/thpmaps
@@ -0,0 +1,675 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0-only
+# Copyright (C) 2024 ARM Ltd.
+#
+# Utility providing smaps-like output detailing transparent hugepage usage.
+# For more info, run:
+# ./thpmaps --help
+#
+# Requires numpy:
+# pip3 install numpy
+
+
+import argparse
+import collections
+import math
+import os
+import re
+import resource
+import shutil
+import sys
+import textwrap
+import time
+import numpy as np
+
+
+with open('/sys/kernel/mm/transparent_hugepage/hpage_pmd_size') as f:
+    PAGE_SIZE = resource.getpagesize()
+    PAGE_SHIFT = int(math.log2(PAGE_SIZE))
+    PMD_SIZE = int(f.read())
+    PMD_ORDER = int(math.log2(PMD_SIZE / PAGE_SIZE))
+
+
+def align_forward(v, a):
+    return (v + (a - 1)) & ~(a - 1)
+
+
+def align_offset(v, a):
+    return v & (a - 1)
+
+
+def kbnr(kb):
+    # Convert KB to number of pages.
+    return (kb << 10) >> PAGE_SHIFT
+
+
+def nrkb(nr):
+    # Convert number of pages to KB.
+    return (nr << PAGE_SHIFT) >> 10
+
+
+def odkb(order):
+    # Convert page order to KB.
+    return (PAGE_SIZE << order) >> 10
+
+
+def cont_ranges_all(search, index):
+    # Given a list of arrays, find the ranges for which values are monotonically
+    # incrementing in all arrays. all arrays in search and index must be the
+    # same size.
+    sz = len(search[0])
+    r = np.full(sz, 2)
+    d = np.diff(search[0]) == 1
+    for dd in [np.diff(arr) == 1 for arr in search[1:]]:
+        d &= dd
+    r[1:] -= d
+    r[:-1] -= d
+    return [np.repeat(arr, r).reshape(-1, 2) for arr in index]
+
+
+class ArgException(Exception):
+    pass
+
+
+class FileIOException(Exception):
+    pass
+
+
+class BinArrayFile:
+    # Base class used to read /proc/<pid>/pagemap and /proc/kpageflags into a
+    # numpy array. Use inherrited class in a with clause to ensure file is
+    # closed when it goes out of scope.
+    def __init__(self, filename, element_size):
+        self.element_size = element_size
+        self.filename = filename
+        self.fd = os.open(self.filename, os.O_RDONLY)
+
+    def cleanup(self):
+        os.close(self.fd)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.cleanup()
+
+    def _readin(self, offset, buffer):
+        length = os.preadv(self.fd, (buffer,), offset)
+        if len(buffer) != length:
+            raise FileIOException('error: {} failed to read {} bytes at {:x}'
+                            .format(self.filename, len(buffer), offset))
+
+    def _toarray(self, buf):
+        assert(self.element_size == 8)
+        return np.frombuffer(buf, dtype=np.uint64)
+
+    def getv(self, vec):
+        vec *= self.element_size
+        offsets = vec[:, 0]
+        lengths = (np.diff(vec) + self.element_size).reshape(len(vec))
+        buf = bytearray(int(np.sum(lengths)))
+        view = memoryview(buf)
+        pos = 0
+        for offset, length in zip(offsets, lengths):
+            offset = int(offset)
+            length = int(length)
+            self._readin(offset, view[pos:pos+length])
+            pos += length
+        return self._toarray(buf)
+
+    def get(self, index, nr=1):
+        offset = index * self.element_size
+        length = nr * self.element_size
+        buf = bytearray(length)
+        self._readin(offset, buf)
+        return self._toarray(buf)
+
+
+PM_PAGE_PRESENT = 1 << 63
+PM_PFN_MASK = (1 << 55) - 1
+
+class PageMap(BinArrayFile):
+    # Read ranges of a given pid's pagemap into a numpy array.
+    def __init__(self, pid='self'):
+        super().__init__(f'/proc/{pid}/pagemap', 8)
+
+
+KPF_ANON = 1 << 12
+KPF_COMPOUND_HEAD = 1 << 15
+KPF_COMPOUND_TAIL = 1 << 16
+KPF_THP = 1 << 22
+
+class KPageFlags(BinArrayFile):
+    # Read ranges of /proc/kpageflags into a numpy array.
+    def __init__(self):
+         super().__init__(f'/proc/kpageflags', 8)
+
+
+vma_all_stats = set([
+    "Size",
+    "Rss",
+    "Pss",
+    "Pss_Dirty",
+    "Shared_Clean",
+    "Shared_Dirty",
+    "Private_Clean",
+    "Private_Dirty",
+    "Referenced",
+    "Anonymous",
+    "KSM",
+    "LazyFree",
+    "AnonHugePages",
+    "ShmemPmdMapped",
+    "FilePmdMapped",
+    "Shared_Hugetlb",
+    "Private_Hugetlb",
+    "Swap",
+    "SwapPss",
+    "Locked",
+])
+
+vma_min_stats = set([
+    "Rss",
+    "Anonymous",
+    "AnonHugePages",
+    "ShmemPmdMapped",
+    "FilePmdMapped",
+])
+
+VMA = collections.namedtuple('VMA', [
+    'name',
+    'start',
+    'end',
+    'read',
+    'write',
+    'execute',
+    'private',
+    'pgoff',
+    'major',
+    'minor',
+    'inode',
+    'stats',
+])
+
+class VMAList:
+    # A container for VMAs, parsed from /proc/<pid>/smaps. Iterate over the
+    # instance to receive VMAs.
+    def __init__(self, pid='self', stats=[]):
+        self.vmas = []
+        with open(f'/proc/{pid}/smaps', 'r') as file:
+            for line in file:
+                elements = line.split()
+                if '-' in elements[0]:
+                    start, end = map(lambda x: int(x, 16), elements[0].split('-'))
+                    major, minor = map(lambda x: int(x, 16), elements[3].split(':'))
+                    self.vmas.append(VMA(
+                        name=elements[5] if len(elements) == 6 else '',
+                        start=start,
+                        end=end,
+                        read=elements[1][0] == 'r',
+                        write=elements[1][1] == 'w',
+                        execute=elements[1][2] == 'x',
+                        private=elements[1][3] == 'p',
+                        pgoff=int(elements[2], 16),
+                        major=major,
+                        minor=minor,
+                        inode=int(elements[4], 16),
+                        stats={},
+                    ))
+                else:
+                    param = elements[0][:-1]
+                    if param in stats:
+                        value = int(elements[1])
+                        self.vmas[-1].stats[param] = {'type': None, 'value': value}
+
+    def __iter__(self):
+        yield from self.vmas
+
+
+def thp_parse(vma, kpageflags, ranges, indexes, vfns, pfns, anons, heads):
+    # Given 4 same-sized arrays representing a range within a page table backed
+    # by THPs (vfns: virtual frame numbers, pfns: physical frame numbers, anons:
+    # True if page is anonymous, heads: True if page is head of a THP), return a
+    # dictionary of statistics describing the mapped THPs.
+    stats = {
+        'file': {
+            'partial': 0,
+            'aligned': [0] * (PMD_ORDER + 1),
+            'unaligned': [0] * (PMD_ORDER + 1),
+        },
+        'anon': {
+            'partial': 0,
+            'aligned': [0] * (PMD_ORDER + 1),
+            'unaligned': [0] * (PMD_ORDER + 1),
+        },
+    }
+
+    for rindex, rpfn in zip(ranges[0], ranges[2]):
+        index_next = int(rindex[0])
+        index_end = int(rindex[1]) + 1
+        pfn_end = int(rpfn[1]) + 1
+
+        folios = indexes[index_next:index_end][heads[index_next:index_end]]
+
+        # Account pages for any partially mapped THP at the front. In that case,
+        # the first page of the range is a tail.
+        nr = (int(folios[0]) if len(folios) else index_end) - index_next
+        stats['anon' if anons[index_next] else 'file']['partial'] += nr
+
+        # Account pages for any partially mapped THP at the back. In that case,
+        # the next page after the range is a tail.
+        if len(folios):
+            flags = int(kpageflags.get(pfn_end)[0])
+            if flags & KPF_COMPOUND_TAIL:
+                nr = index_end - int(folios[-1])
+                folios = folios[:-1]
+                index_end -= nr
+                stats['anon' if anons[index_end - 1] else 'file']['partial'] += nr
+
+        # Account fully mapped THPs in the middle of the range.
+        if len(folios):
+            folio_nrs = np.append(np.diff(folios), np.uint64(index_end - folios[-1]))
+            folio_orders = np.log2(folio_nrs).astype(np.uint64)
+            for index, order in zip(folios, folio_orders):
+                index = int(index)
+                order = int(order)
+                nr = 1 << order
+                vfn = int(vfns[index])
+                align = 'aligned' if align_forward(vfn, nr) == vfn else 'unaligned'
+                anon = 'anon' if anons[index] else 'file'
+                stats[anon][align][order] += nr
+
+    # Account PMD-mapped THPs spearately, so filter out of the stats. There is a
+    # race between acquiring the smaps stats and reading pagemap, where memory
+    # could be deallocated. So clamp to zero incase it would have gone negative.
+    anon_pmd_mapped = vma.stats['AnonHugePages']['value']
+    file_pmd_mapped = vma.stats['ShmemPmdMapped']['value'] + \
+                      vma.stats['FilePmdMapped']['value']
+    stats['anon']['aligned'][PMD_ORDER] = max(0, stats['anon']['aligned'][PMD_ORDER] - kbnr(anon_pmd_mapped))
+    stats['file']['aligned'][PMD_ORDER] = max(0, stats['file']['aligned'][PMD_ORDER] - kbnr(file_pmd_mapped))
+
+    rstats = {
+        f"anon-thp-pmd-aligned-{odkb(PMD_ORDER)}kB": {'type': 'anon', 'value': anon_pmd_mapped},
+        f"file-thp-pmd-aligned-{odkb(PMD_ORDER)}kB": {'type': 'file', 'value': file_pmd_mapped},
+    }
+
+    def flatten_sub(type, subtype, stats):
+        param = f"{type}-thp-pte-{subtype}-{{}}kB"
+        for od, nr in enumerate(stats[2:], 2):
+            rstats[param.format(odkb(od))] = {'type': type, 'value': nrkb(nr)}
+
+    def flatten_type(type, stats):
+        flatten_sub(type, 'aligned', stats['aligned'])
+        flatten_sub(type, 'unaligned', stats['unaligned'])
+        rstats[f"{type}-thp-pte-partial"] = {'type': type, 'value': nrkb(stats['partial'])}
+
+    flatten_type('anon', stats['anon'])
+    flatten_type('file', stats['file'])
+
+    return rstats
+
+
+def cont_parse(vma, order, ranges, anons, heads):
+    # Given 4 same-sized arrays representing a range within a page table backed
+    # by THPs (vfns: virtual frame numbers, pfns: physical frame numbers, anons:
+    # True if page is anonymous, heads: True if page is head of a THP), return a
+    # dictionary of statistics describing the contiguous blocks.
+    nr_cont = 1 << order
+    nr_anon = 0
+    nr_file = 0
+
+    for rindex, rvfn, rpfn in zip(*ranges):
+        index_next = int(rindex[0])
+        index_end = int(rindex[1]) + 1
+        vfn_start = int(rvfn[0])
+        pfn_start = int(rpfn[0])
+
+        if align_offset(pfn_start, nr_cont) != align_offset(vfn_start, nr_cont):
+            continue
+
+        off = align_forward(vfn_start, nr_cont) - vfn_start
+        index_next += off
+
+        while index_next + nr_cont <= index_end:
+            folio_boundary = heads[index_next+1:index_next+nr_cont].any()
+            if not folio_boundary:
+                if anons[index_next]:
+                    nr_anon += nr_cont
+                else:
+                    nr_file += nr_cont
+            index_next += nr_cont
+
+    # Account blocks that are PMD-mapped spearately, so filter out of the stats.
+    # There is a race between acquiring the smaps stats and reading pagemap,
+    # where memory could be deallocated. So clamp to zero incase it would have
+    # gone negative.
+    anon_pmd_mapped = vma.stats['AnonHugePages']['value']
+    file_pmd_mapped = vma.stats['ShmemPmdMapped']['value'] + \
+                    vma.stats['FilePmdMapped']['value']
+    nr_anon = max(0, nr_anon - kbnr(anon_pmd_mapped))
+    nr_file = max(0, nr_file - kbnr(file_pmd_mapped))
+
+    rstats = {
+        f"anon-cont-pmd-aligned-{nrkb(nr_cont)}kB": {'type': 'anon', 'value': anon_pmd_mapped},
+        f"file-cont-pmd-aligned-{nrkb(nr_cont)}kB": {'type': 'file', 'value': file_pmd_mapped},
+    }
+
+    rstats[f"anon-cont-pte-aligned-{nrkb(nr_cont)}kB"] = {'type': 'anon', 'value': nrkb(nr_anon)}
+    rstats[f"file-cont-pte-aligned-{nrkb(nr_cont)}kB"] = {'type': 'file', 'value': nrkb(nr_file)}
+
+    return rstats
+
+
+def vma_print(vma, pid):
+    # Prints a VMA instance in a format similar to smaps. The main difference is
+    # that the pid is included as the first value.
+    print("{:010d}: {:016x}-{:016x} {}{}{}{} {:08x} {:02x}:{:02x} {:08x} {}"
+        .format(
+            pid, vma.start, vma.end,
+            'r' if vma.read else '-', 'w' if vma.write else '-',
+            'x' if vma.execute else '-', 'p' if vma.private else 's',
+            vma.pgoff, vma.major, vma.minor, vma.inode, vma.name
+        ))
+
+
+def stats_print(stats, tot_anon, tot_file, inc_empty):
+    # Print a statistics dictionary.
+    label_field = 32
+    for label, stat in stats.items():
+        type = stat['type']
+        value = stat['value']
+        if value or inc_empty:
+            pad = max(0, label_field - len(label) - 1)
+            if type == 'anon' and tot_anon > 0:
+                percent = f' ({value / tot_anon:3.0%})'
+            elif type == 'file' and tot_file > 0:
+                percent = f' ({value / tot_file:3.0%})'
+            else:
+                percent = ''
+            print(f"{label}:{' ' * pad}{value:8} kB{percent}")
+
+
+def vma_parse(vma, pagemap, kpageflags, contorders):
+    # Generate thp and cont statistics for a single VMA.
+    start = vma.start >> PAGE_SHIFT
+    end = vma.end >> PAGE_SHIFT
+
+    pmes = pagemap.get(start, end - start)
+    present = pmes & PM_PAGE_PRESENT != 0
+    pfns = pmes & PM_PFN_MASK
+    pfns = pfns[present]
+    vfns = np.arange(start, end, dtype=np.uint64)
+    vfns = vfns[present]
+
+    pfn_vec = cont_ranges_all([pfns], [pfns])[0]
+    flags = kpageflags.getv(pfn_vec)
+    anons = flags & KPF_ANON != 0
+    heads = flags & KPF_COMPOUND_HEAD != 0
+    thps = flags & KPF_THP != 0
+
+    vfns = vfns[thps]
+    pfns = pfns[thps]
+    anons = anons[thps]
+    heads = heads[thps]
+
+    indexes = np.arange(len(vfns), dtype=np.uint64)
+    ranges = cont_ranges_all([vfns, pfns], [indexes, vfns, pfns])
+
+    thpstats = thp_parse(vma, kpageflags, ranges, indexes, vfns, pfns, anons, heads)
+    contstats = [cont_parse(vma, order, ranges, anons, heads) for order in contorders]
+
+    tot_anon = vma.stats['Anonymous']['value']
+    tot_file = vma.stats['Rss']['value'] - tot_anon
+
+    return {
+        **thpstats,
+        **{k: v for s in contstats for k, v in s.items()}
+    }, tot_anon, tot_file
+
+
+def do_main(args):
+    pids = set()
+    rollup = {}
+    rollup_anon = 0
+    rollup_file = 0
+
+    if args.cgroup:
+        strict = False
+        for walk_info in os.walk(args.cgroup):
+            cgroup = walk_info[0]
+            with open(f'{cgroup}/cgroup.procs') as pidfile:
+                for line in pidfile.readlines():
+                    pids.add(int(line.strip()))
+    elif args.pid:
+        strict = True
+        pids = pids.union(args.pid)
+    else:
+        strict = False
+        for pid in os.listdir('/proc'):
+            if pid.isdigit():
+                pids.add(int(pid))
+
+    if not args.rollup:
+        print("       PID             START              END PROT   OFFSET   DEV    INODE OBJECT")
+
+    for pid in pids:
+        try:
+            with PageMap(pid) as pagemap:
+                with KPageFlags() as kpageflags:
+                    for vma in VMAList(pid, vma_all_stats if args.inc_smaps else vma_min_stats):
+                        if (vma.read or vma.write or vma.execute) and vma.stats['Rss']['value'] > 0:
+                            stats, vma_anon, vma_file = vma_parse(vma, pagemap, kpageflags, args.cont)
+                        else:
+                            stats = {}
+                            vma_anon = 0
+                            vma_file = 0
+                        if args.inc_smaps:
+                            stats = {**vma.stats, **stats}
+                        if args.rollup:
+                            for k, v in stats.items():
+                                if k in rollup:
+                                    assert(rollup[k]['type'] == v['type'])
+                                    rollup[k]['value'] += v['value']
+                                else:
+                                    rollup[k] = v
+                            rollup_anon += vma_anon
+                            rollup_file += vma_file
+                        else:
+                            vma_print(vma, pid)
+                            stats_print(stats, vma_anon, vma_file, args.inc_empty)
+        except (FileNotFoundError, ProcessLookupError, FileIOException):
+            if strict:
+                raise
+
+    if args.rollup:
+        stats_print(rollup, rollup_anon, rollup_file, args.inc_empty)
+
+
+def main():
+    docs_width = shutil.get_terminal_size().columns
+    docs_width -= 2
+    docs_width = min(80, docs_width)
+
+    def format(string):
+        text = re.sub(r'\s+', ' ', string)
+        text = re.sub(r'\s*\\n\s*', '\n', text)
+        paras = text.split('\n')
+        paras = [textwrap.fill(p, width=docs_width) for p in paras]
+        return '\n'.join(paras)
+
+    def formatter(prog):
+        return argparse.RawDescriptionHelpFormatter(prog, width=docs_width)
+
+    def size2order(human):
+        units = {
+            "K": 2**10, "M": 2**20, "G": 2**30,
+            "k": 2**10, "m": 2**20, "g": 2**30,
+        }
+        unit = 1
+        if human[-1] in units:
+            unit = units[human[-1]]
+            human = human[:-1]
+        try:
+            size = int(human)
+        except ValueError:
+            raise ArgException('error: --cont value must be integer size with optional KMG unit')
+        size *= unit
+        order = int(math.log2(size / PAGE_SIZE))
+        if order < 1:
+            raise ArgException('error: --cont value must be size of at least 2 pages')
+        if (1 << order) * PAGE_SIZE != size:
+            raise ArgException('error: --cont value must be size of power-of-2 pages')
+        if order > PMD_ORDER:
+            raise ArgException('error: --cont value must be less than or equal to PMD order')
+        return order
+
+    parser = argparse.ArgumentParser(formatter_class=formatter,
+        description=format("""Prints information about how transparent huge
+                    pages are mapped, either system-wide, or for a specified
+                    process or cgroup.\\n
+                    \\n
+                    When run with --pid, the user explicitly specifies the set
+                    of pids to scan. e.g. "--pid 10 [--pid 134 ...]". When run
+                    with --cgroup, the user passes either a v1 or v2 cgroup and
+                    all pids that belong to the cgroup subtree are scanned. When
+                    run with neither --pid nor --cgroup, the full set of pids on
+                    the system is gathered from /proc and scanned as if the user
+                    had provided "--pid 1 --pid 2 ...".\\n
+                    \\n
+                    A default set of statistics is always generated for THP
+                    mappings. However, it is also possible to generate
+                    additional statistics for "contiguous block mappings" where
+                    the block size is user-defined.\\n
+                    \\n
+                    Statistics are maintained independently for anonymous and
+                    file-backed (pagecache) memory and are shown both in kB and
+                    as a percentage of either total anonymous or total
+                    file-backed memory as appropriate.\\n
+                    \\n
+                    THP Statistics\\n
+                    --------------\\n
+                    \\n
+                    Statistics are always generated for fully- and
+                    contiguously-mapped THPs whose mapping address is aligned to
+                    their size, for each <size> supported by the system.
+                    Separate counters describe THPs mapped by PTE vs those
+                    mapped by PMD. (Although note a THP can only be mapped by
+                    PMD if it is PMD-sized):\\n
+                    \\n
+                    - anon-thp-pte-aligned-<size>kB\\n
+                    - file-thp-pte-aligned-<size>kB\\n
+                    - anon-thp-pmd-aligned-<size>kB\\n
+                    - file-thp-pmd-aligned-<size>kB\\n
+                    \\n
+                    Similarly, statistics are always generated for fully- and
+                    contiguously-mapped THPs whose mapping address is *not*
+                    aligned to their size, for each <size> supported by the
+                    system. Due to the unaligned mapping, it is impossible to
+                    map by PMD, so there are only PTE counters for this case:\\n
+                    \\n
+                    - anon-thp-pte-unaligned-<size>kB\\n
+                    - file-thp-pte-unaligned-<size>kB\\n
+                    \\n
+                    Statistics are also always generated for mapped pages that
+                    belong to a THP but where the is THP is *not* fully- and
+                    contiguously- mapped. These "partial" mappings are all
+                    counted in the same counter regardless of the size of the
+                    THP that is partially mapped:\\n
+                    \\n
+                    - anon-thp-pte-partial\\n
+                    - file-thp-pte-partial\\n
+                    \\n
+                    Contiguous Block Statistics\\n
+                    ---------------------------\\n
+                    \\n
+                    An optional, additional set of statistics is generated for
+                    every contiguous block size specified with `--cont <size>`.
+                    These statistics show how much memory is mapped in
+                    contiguous blocks of <size> and also aligned to <size>. A
+                    given contiguous block must all belong to the same THP, but
+                    there is no requirement for it to be the *whole* THP.
+                    Separate counters describe contiguous blocks mapped by PTE
+                    vs those mapped by PMD:\\n
+                    \\n
+                    - anon-cont-pte-aligned-<size>kB\\n
+                    - file-cont-pte-aligned-<size>kB\\n
+                    - anon-cont-pmd-aligned-<size>kB\\n
+                    - file-cont-pmd-aligned-<size>kB\\n
+                    \\n
+                    As an example, if monitoring 64K contiguous blocks (--cont
+                    64K), there are a number of sources that could provide such
+                    blocks: a fully- and contiguously-mapped 64K THP that is
+                    aligned to a 64K boundary would provide 1 block. A fully-
+                    and contiguously-mapped 128K THP that is aligned to at least
+                    a 64K boundary would provide 2 blocks. Or a 128K THP that
+                    maps its first 100K, but contiguously and starting at a 64K
+                    boundary would provide 1 block. A fully- and
+                    contiguously-mapped 2M THP would provide 32 blocks. There
+                    are many other possible permutations.\\n"""),
+        epilog=format("""Requires root privilege to access pagemap and
+                    kpageflags."""))
+
+    group = parser.add_mutually_exclusive_group(required=False)
+    group.add_argument('--pid',
+        metavar='pid', required=False, type=int, default=[], action='append',
+        help="""Process id of the target process. Maybe issued multiple times to
+            scan multiple processes. --pid and --cgroup are mutually exclusive.
+            If neither are provided, all processes are scanned to provide
+            system-wide information.""")
+
+    group.add_argument('--cgroup',
+        metavar='path', required=False,
+        help="""Path to the target cgroup in sysfs. Iterates over every pid in
+            the cgroup and its children. --pid and --cgroup are mutually
+            exclusive. If neither are provided, all processes are scanned to
+            provide system-wide information.""")
+
+    parser.add_argument('--rollup',
+        required=False, default=False, action='store_true',
+        help="""Sum the per-vma statistics to provide a summary over the whole
+            system, process or cgroup.""")
+
+    parser.add_argument('--cont',
+        metavar='size[KMG]', required=False, default=[], action='append',
+        help="""Adds stats for memory that is mapped in contiguous blocks of
+            <size> and also aligned to <size>. May be issued multiple times to
+            track multiple sized blocks. Useful to infer e.g. arm64 contpte and
+            hpa mappings. Size must be a power-of-2 number of pages.""")
+
+    parser.add_argument('--inc-smaps',
+        required=False, default=False, action='store_true',
+        help="""Include all numerical, additive /proc/<pid>/smaps stats in the
+            output.""")
+
+    parser.add_argument('--inc-empty',
+        required=False, default=False, action='store_true',
+        help="""Show all statistics including those whose value is 0.""")
+
+    parser.add_argument('--periodic',
+        metavar='sleep_ms', required=False, type=int,
+        help="""Run in a loop, polling every sleep_ms milliseconds.""")
+
+    args = parser.parse_args()
+
+    try:
+        args.cont = [size2order(cont) for cont in args.cont]
+    except ArgException as e:
+        parser.print_usage()
+        raise
+
+    if args.periodic:
+        while True:
+            do_main(args)
+            print()
+            time.sleep(args.periodic / 1000)
+    else:
+        do_main(args)
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except Exception as e:
+        prog = os.path.basename(sys.argv[0])
+        print(f'{prog}: {e}')
+        exit(1)

From 7876e99dd59f1d1257a4349986d9b52e3e088853 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 17 Jan 2024 18:39:54 +0800
Subject: [PATCH 462/707] mm: memory: move mem_cgroup_charge() into
 alloc_anon_folio()

The GFP flags from vma_thp_gfp_mask() according to user configuration only
used for large folio allocation but not for memory cgroup charge, and
GFP_KERNEL is used for both order-0 and large order folio when memory
cgroup charge at present.  However, mem_cgroup_charge() uses the GFP flags
in a fairly sophisticated way.  In addition to checking
gfpflags_allow_blocking(), it pays attention to __GFP_NORETRY and
__GFP_RETRY_MAYFAIL to ensure that processes within this memcg do not
exceed their quotas.

So we'd better to move mem_cgroup_charge() into alloc_anon_folio(),

1) it will make us to allocate as much as possible large order folio,
   because we could try the next order if mem_cgroup_charge() fails,
   although the memcg's memory usage is close to its limits.

2) using same GFP flags for allocation and charge is to be consistent
   with PMD THP firstly, in addition, according to GFP flag returned from
   vma_thp_gfp_mask(), GFP_TRANSHUGE_LIGHT could make us skip direct
   reclaim, _GFP_NORETRY will make us skip mem_cgroup_oom() and won't
   trigger memory cgroup oom from large order(order <= COSTLY_ORDER) folio
   charging.

Link: https://lkml.kernel.org/r/20240122011612.501029-1-wangkefeng.wang@huawei.com
Link: https://lkml.kernel.org/r/20240117103954.2756050-1-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/memory.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index f5750bfcdc058f..8d14ba44092965 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4153,8 +4153,8 @@ static bool pte_range_none(pte_t *pte, int nr_pages)
 
 static struct folio *alloc_anon_folio(struct vm_fault *vmf)
 {
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	struct vm_area_struct *vma = vmf->vma;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	unsigned long orders;
 	struct folio *folio;
 	unsigned long addr;
@@ -4206,15 +4206,21 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
 		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
 		folio = vma_alloc_folio(gfp, order, vma, addr, true);
 		if (folio) {
+			if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
+				folio_put(folio);
+				goto next;
+			}
+			folio_throttle_swaprate(folio, gfp);
 			clear_huge_page(&folio->page, vmf->address, 1 << order);
 			return folio;
 		}
+next:
 		order = next_order(&orders, order);
 	}
 
 fallback:
 #endif
-	return vma_alloc_zeroed_movable_folio(vmf->vma, vmf->address);
+	return folio_prealloc(vma->vm_mm, vma, vmf->address, true);
 }
 
 /*
@@ -4281,10 +4287,6 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 	nr_pages = folio_nr_pages(folio);
 	addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
 
-	if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL))
-		goto oom_free_page;
-	folio_throttle_swaprate(folio, GFP_KERNEL);
-
 	/*
 	 * The memory barrier inside __folio_mark_uptodate makes sure that
 	 * preceding stores to the page contents become visible before
@@ -4338,8 +4340,6 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 release:
 	folio_put(folio);
 	goto unlock;
-oom_free_page:
-	folio_put(folio);
 oom:
 	return VM_FAULT_OOM;
 }

From 3648cefe4682802156f45b64fafbcb7acf947a1c Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeelb@google.com>
Date: Thu, 18 Jan 2024 18:42:35 +0000
Subject: [PATCH 463/707] mm: writeback: ratelimit stat flush from
 mem_cgroup_wb_stats

One of our workloads (Postgres 14) has regressed when migrated from 5.10
to 6.1 upstream kernel.  The regression can be reproduced by sysbench's
oltp_write_only benchmark.  It seems like the always on rstat flush in
mem_cgroup_wb_stats() is causing the regression.  So, rate limit that
specific rstat flush.  One potential consequence would be the dirty
throttling might be decided on stale memcg stats.  However from our
benchmarks and production traffic we have not observed any change in the
dirty throttling behavior of the application.

Link: https://lkml.kernel.org/r/20240118184235.618164-1-shakeelb@google.com
Fixes: 2d146aa3aa84 ("mm: memcontrol: switch to rstat")
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Jan Kara <jack@suse.cz>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/memcontrol.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6cc6b3a2a60c97..391ecdc5af68a0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4801,7 +4801,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
 	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
 	struct mem_cgroup *parent;
 
-	mem_cgroup_flush_stats(memcg);
+	mem_cgroup_flush_stats_ratelimited(memcg);
 
 	*pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
 	*pwriteback = memcg_page_state(memcg, NR_WRITEBACK);

From 12081a7ff341044138ac73f3c14406dbbe0d882a Mon Sep 17 00:00:00 2001
From: Greg Thelen <gthelen@google.com>
Date: Thu, 18 Jan 2024 01:50:57 -0800
Subject: [PATCH 464/707] selftests/memfd: delete unused declarations

Commit 32d118ad50a5 ("selftests/memfd: add tests for F_SEAL_EXEC"):
- added several unused 'nbytes' local variables

Commit 6469b66e3f5a ("selftests: improve vm.memfd_noexec sysctl tests"):
- orphaned 'newpid_thread_fn2()' forward declaration
- orphaned 'join_newpid_thread()' forward declaration
- added unused 'pid' local in sysctl_simple_child()
- orphaned 'fd' local in sysctl_simple_child()
- added unused 'fd' in sysctl_nested_child()

Delete the unused locals and forward declarations.

Link: https://lkml.kernel.org/r/20240118095057.677544-1-gthelen@google.com
Signed-off-by: Greg Thelen <gthelen@google.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Daniel Verkamp <dverkamp@chromium.org>
Cc: Jeff Xu <jeffxu@google.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/memfd/memfd_test.c | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c
index 3df00867723910..18f585684e2025 100644
--- a/tools/testing/selftests/memfd/memfd_test.c
+++ b/tools/testing/selftests/memfd/memfd_test.c
@@ -44,8 +44,6 @@
  */
 static size_t mfd_def_size = MFD_DEF_SIZE;
 static const char *memfd_str = MEMFD_STR;
-static int newpid_thread_fn2(void *arg);
-static void join_newpid_thread(pid_t pid);
 
 static ssize_t fd2name(int fd, char *buf, size_t bufsize)
 {
@@ -194,7 +192,6 @@ static unsigned int mfd_assert_get_seals(int fd)
 static void mfd_assert_has_seals(int fd, unsigned int seals)
 {
 	char buf[PATH_MAX];
-	int nbytes;
 	unsigned int s;
 	fd2name(fd, buf, PATH_MAX);
 
@@ -696,7 +693,6 @@ static void mfd_assert_mode(int fd, int mode)
 {
 	struct stat st;
 	char buf[PATH_MAX];
-	int nbytes;
 
 	fd2name(fd, buf, PATH_MAX);
 
@@ -715,7 +711,6 @@ static void mfd_assert_mode(int fd, int mode)
 static void mfd_assert_chmod(int fd, int mode)
 {
 	char buf[PATH_MAX];
-	int nbytes;
 
 	fd2name(fd, buf, PATH_MAX);
 
@@ -731,7 +726,6 @@ static void mfd_fail_chmod(int fd, int mode)
 {
 	struct stat st;
 	char buf[PATH_MAX];
-	int nbytes;
 
 	fd2name(fd, buf, PATH_MAX);
 
@@ -1254,9 +1248,6 @@ static void test_sysctl_set_sysctl2(void)
 
 static int sysctl_simple_child(void *arg)
 {
-	int fd;
-	int pid;
-
 	printf("%s sysctl 0\n", memfd_str);
 	test_sysctl_set_sysctl0();
 
@@ -1321,7 +1312,6 @@ static void test_sysctl_sysctl2_failset(void)
 
 static int sysctl_nested_child(void *arg)
 {
-	int fd;
 	int pid;
 
 	printf("%s nested sysctl 0\n", memfd_str);

From 2e7e97097d77028ced808783136a95e8f2e337fe Mon Sep 17 00:00:00 2001
From: Lokesh Gidra <lokeshgidra@google.com>
Date: Wed, 17 Jan 2024 14:39:21 -0800
Subject: [PATCH 465/707] userfaultfd: fix return error if mmap_changing is
 non-zero in MOVE ioctl

To be consistent with other uffd ioctl's returning EAGAIN when
mmap_changing is detected, we should change UFFDIO_MOVE to do the same.

Link: https://lkml.kernel.org/r/20240117223922.1445327-1-lokeshgidra@google.com
Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
Acked-by: Suren Baghdasaryan <surenb@google.com>
Acked-by: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Brian Geffon <bgeffon@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Nicolas Geoffray <ngeoffray@google.com>
Cc: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/userfaultfd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 959551ff9a9514..05c8e8a054272f 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -2047,7 +2047,7 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx,
 			ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
 					 uffdio_move.len, uffdio_move.mode);
 		else
-			ret = -EINVAL;
+			ret = -EAGAIN;
 
 		mmap_read_unlock(mm);
 		mmput(mm);

From fd09a0ce540dedca104b19735fbf7e2bef9e1101 Mon Sep 17 00:00:00 2001
From: Nico Pache <npache@redhat.com>
Date: Wed, 17 Jan 2024 11:00:37 -0700
Subject: [PATCH 466/707] selftests: mm: perform some system cleanup before
 using hugepages

When running with CATEGORY= (thp | hugetlb) we see a large numbers of
tests failing.  These failures are due to not being able to allocate a
hugepage and normally occur on memory contrainted systems or when using
large page sizes.

drop_cache and compact_memory before the tests for a higher chance at a
successful hugepage allocation.

Link: https://lkml.kernel.org/r/20240117180037.15734-1-npache@redhat.com
Signed-off-by: Nico Pache <npache@redhat.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/run_vmtests.sh | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
index 246d53a5d7f287..040f27e21f47a3 100755
--- a/tools/testing/selftests/mm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -206,6 +206,15 @@ pretty_name() {
 # Usage: run_test [test binary] [arbitrary test arguments...]
 run_test() {
 	if test_selected ${CATEGORY}; then
+		# On memory constrainted systems some tests can fail to allocate hugepages.
+		# perform some cleanup before the test for a higher success rate.
+		if [ ${CATEGORY} == "thp" ] | [ ${CATEGORY} == "hugetlb" ]; then
+			echo 3 > /proc/sys/vm/drop_caches
+			sleep 2
+			echo 1 > /proc/sys/vm/compact_memory
+			sleep 2
+		fi
+
 		local test=$(pretty_name "$*")
 		local title="running $*"
 		local sep=$(echo -n "$title" | tr "[:graph:][:space:]" -)

From e3ea1b826e160639ba53d117f438798bb0e1538f Mon Sep 17 00:00:00 2001
From: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Date: Mon, 22 Jan 2024 11:20:00 +0100
Subject: [PATCH 467/707] maple_tree: avoid duplicate variable init in
 mast_spanning_rebalance()

The local variables r_tmp and l_tmp in mast_spanning_rebalance() are
already initialized at its declaration; there is no need to assign the
value again.

Remove the duplicate initialization of {r,l}_tmp.  No functional change.
Due to common compiler optimizations, also no change to object code.

This issue was identified with clang-analyzer's dead stores analysis.

Link: https://lkml.kernel.org/r/20240122102000.29558-1-lukas.bulwahn@gmail.com
Signed-off-by: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 lib/maple_tree.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index 7b161802860bdb..82fb5195c2354f 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -2271,8 +2271,6 @@ bool mast_spanning_rebalance(struct maple_subtree_state *mast)
 	struct ma_state l_tmp = *mast->orig_l;
 	unsigned char depth = 0;
 
-	r_tmp = *mast->orig_r;
-	l_tmp = *mast->orig_l;
 	do {
 		mas_ascend(mast->orig_r);
 		mas_ascend(mast->orig_l);

From 11c2253562339fef6868973ffcf5c0071a5cd3af Mon Sep 17 00:00:00 2001
From: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Date: Mon, 22 Jan 2024 10:25:04 +0100
Subject: [PATCH 468/707] mempolicy: clean up minor dead code in
 queue_pages_test_walk()

Commit 2cafb582173f ("mempolicy: remove confusing MPOL_MF_LAZY dead code")
removes MPOL_MF_LAZY handling in queue_pages_test_walk(), and with that,
there is no effective use of the local variable endvma in that function
remaining.

Remove the local variable endvma and its dead code. No functional change.

This issue was identified with clang-analyzer's dead stores analysis.

Link: https://lkml.kernel.org/r/20240122092504.18377-1-lukas.bulwahn@gmail.com
Signed-off-by: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/mempolicy.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 10a590ee1c8997..5e519163c4dcb6 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -654,7 +654,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
 {
 	struct vm_area_struct *next, *vma = walk->vma;
 	struct queue_pages *qp = walk->private;
-	unsigned long endvma = vma->vm_end;
 	unsigned long flags = qp->flags;
 
 	/* range check first */
@@ -682,9 +681,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
 	    !(flags & MPOL_MF_STRICT))
 		return 1;
 
-	if (endvma > end)
-		endvma = end;
-
 	/*
 	 * Check page nodes, and queue pages to move, in the current vma.
 	 * But if no moving, and no strict checking, the scan can be skipped.

From 0b5f17a6861eceef820ccf2b2e7c152d006114c9 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Wed, 24 Jan 2024 13:12:41 +0800
Subject: [PATCH 469/707] kexec: split crashkernel reservation code out from
 crash_core.c

Patch series "Split crash out from kexec and clean up related config
items", v3.

Motivation:
=============
Previously, LKP reported a building error. When investigating, it can't
be resolved reasonablly with the present messy kdump config items.

 https://lore.kernel.org/oe-kbuild-all/202312182200.Ka7MzifQ-lkp@intel.com/

The kdump (crash dumping) related config items could causes confusions:

Firstly,

CRASH_CORE enables codes including
 - crashkernel reservation;
 - elfcorehdr updating;
 - vmcoreinfo exporting;
 - crash hotplug handling;

Now fadump of powerpc, kcore dynamic debugging and kdump all selects
CRASH_CORE, while fadump
 - fadump needs crashkernel parsing, vmcoreinfo exporting, and accessing
   global variable 'elfcorehdr_addr';
 - kcore only needs vmcoreinfo exporting;
 - kdump needs all of the current kernel/crash_core.c.

So only enabling PROC_CORE or FA_DUMP will enable CRASH_CORE, this
mislead people that we enable crash dumping, actual it's not.

Secondly,

It's not reasonable to allow KEXEC_CORE select CRASH_CORE.

Because KEXEC_CORE enables codes which allocate control pages, copy
kexec/kdump segments, and prepare for switching. These codes are
shared by both kexec reboot and kdump. We could want kexec reboot,
but disable kdump. In that case, CRASH_CORE should not be selected.

 --------------------
 CONFIG_CRASH_CORE=y
 CONFIG_KEXEC_CORE=y
 CONFIG_KEXEC=y
 CONFIG_KEXEC_FILE=y
 ---------------------

Thirdly,

It's not reasonable to allow CRASH_DUMP select KEXEC_CORE.

That could make KEXEC_CORE, CRASH_DUMP are enabled independently from
KEXEC or KEXEC_FILE. However, w/o KEXEC or KEXEC_FILE, the KEXEC_CORE
code built in doesn't make any sense because no kernel loading or
switching will happen to utilize the KEXEC_CORE code.
 ---------------------
 CONFIG_CRASH_CORE=y
 CONFIG_KEXEC_CORE=y
 CONFIG_CRASH_DUMP=y
 ---------------------

In this case, what is worse, on arch sh and arm, KEXEC relies on MMU,
while CRASH_DUMP can still be enabled when !MMU, then compiling error is
seen as the lkp test robot reported in above link.

 ------arch/sh/Kconfig------
 config ARCH_SUPPORTS_KEXEC
         def_bool MMU

 config ARCH_SUPPORTS_CRASH_DUMP
         def_bool BROKEN_ON_SMP
 ---------------------------

Changes:
===========
1, split out crash_reserve.c from crash_core.c;
2, split out vmcore_infoc. from crash_core.c;
3, move crash related codes in kexec_core.c into crash_core.c;
4, remove dependency of FA_DUMP on CRASH_DUMP;
5, clean up kdump related config items;
6, wrap up crash codes in crash related ifdefs on all 8 arch-es
   which support crash dumping, except of ppc;

Achievement:
===========
With above changes, I can rearrange the config item logic as below (the right
item depends on or is selected by the left item):

    PROC_KCORE -----------> VMCORE_INFO

               |----------> VMCORE_INFO
    FA_DUMP----|
               |----------> CRASH_RESERVE

                                                    ---->VMCORE_INFO
                                                   /
                                                   |---->CRASH_RESERVE
    KEXEC      --|                                /|
                 |--> KEXEC_CORE--> CRASH_DUMP-->/-|---->PROC_VMCORE
    KEXEC_FILE --|                               \ |
                                                   \---->CRASH_HOTPLUG


    KEXEC      --|
                 |--> KEXEC_CORE (for kexec reboot only)
    KEXEC_FILE --|

Test
========
On all 8 architectures, including x86_64, arm64, s390x, sh, arm, mips,
riscv, loongarch, I did below three cases of config item setting and
building all passed. Take configs on x86_64 as exampmle here:

(1) Both CONFIG_KEXEC and KEXEC_FILE is unset, then all kexec/kdump
items are unset automatically:
# Kexec and crash features
# CONFIG_KEXEC is not set
# CONFIG_KEXEC_FILE is not set
# end of Kexec and crash features

(2) set CONFIG_KEXEC_FILE and 'make olddefconfig':
---------------
# Kexec and crash features
CONFIG_CRASH_RESERVE=y
CONFIG_VMCORE_INFO=y
CONFIG_KEXEC_CORE=y
CONFIG_KEXEC_FILE=y
CONFIG_CRASH_DUMP=y
CONFIG_CRASH_HOTPLUG=y
CONFIG_CRASH_MAX_MEMORY_RANGES=8192
# end of Kexec and crash features
---------------

(3) unset CONFIG_CRASH_DUMP in case 2 and execute 'make olddefconfig':
------------------------
# Kexec and crash features
CONFIG_KEXEC_CORE=y
CONFIG_KEXEC_FILE=y
# end of Kexec and crash features
------------------------

Note:
For ppc, it needs investigation to make clear how to split out crash
code in arch folder. Hope Hari and Pingfan can help have a look, see if
it's doable. Now, I make it either have both kexec and crash enabled, or
disable both of them altogether.


This patch (of 14):

Both kdump and fa_dump of ppc rely on crashkernel reservation.  Move the
relevant codes into separate files: crash_reserve.c,
include/linux/crash_reserve.h.

And also add config item CRASH_RESERVE to control its enabling of the
codes.  And update config items which has relationship with crashkernel
reservation.

And also change ifdeffery from CONFIG_CRASH_CORE to CONFIG_CRASH_RESERVE
when those scopes are only crashkernel reservation related.

And also rename arch/XXX/include/asm/{crash_core.h => crash_reserve.h} on
arm64, x86 and risc-v because those architectures' crash_core.h is only
related to crashkernel reservation.

Link: https://lkml.kernel.org/r/20240124051254.67105-1-bhe@redhat.com
Link: https://lkml.kernel.org/r/20240124051254.67105-2-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Hari Bathini <hbathini@linux.ibm.com>
Cc: Pingfan Liu <piliu@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm64/Kconfig                            |   2 +-
 .../asm/{crash_core.h => crash_reserve.h}     |   4 +-
 arch/powerpc/Kconfig                          |   1 +
 arch/powerpc/mm/nohash/kaslr_booke.c          |   4 +-
 arch/riscv/Kconfig                            |   2 +-
 .../asm/{crash_core.h => crash_reserve.h}     |   4 +-
 arch/x86/Kconfig                              |   2 +-
 .../asm/{crash_core.h => crash_reserve.h}     |   6 +-
 include/linux/crash_core.h                    |  40 --
 include/linux/crash_reserve.h                 |  48 ++
 include/linux/kexec.h                         |   1 +
 kernel/Kconfig.kexec                          |   5 +-
 kernel/Makefile                               |   1 +
 kernel/crash_core.c                           | 438 -----------------
 kernel/crash_reserve.c                        | 464 ++++++++++++++++++
 15 files changed, 531 insertions(+), 491 deletions(-)
 rename arch/arm64/include/asm/{crash_core.h => crash_reserve.h} (81%)
 rename arch/riscv/include/asm/{crash_core.h => crash_reserve.h} (78%)
 rename arch/x86/include/asm/{crash_core.h => crash_reserve.h} (92%)
 create mode 100644 include/linux/crash_reserve.h
 create mode 100644 kernel/crash_reserve.c

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index ad5dbaf3dc9f57..d86d7f4758b54f 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1520,7 +1520,7 @@ config ARCH_SUPPORTS_CRASH_DUMP
 	def_bool y
 
 config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
-	def_bool CRASH_CORE
+	def_bool CRASH_RESERVE
 
 config TRANS_TABLE
 	def_bool y
diff --git a/arch/arm64/include/asm/crash_core.h b/arch/arm64/include/asm/crash_reserve.h
similarity index 81%
rename from arch/arm64/include/asm/crash_core.h
rename to arch/arm64/include/asm/crash_reserve.h
index 9f5c8d339f44f5..4afe027a4e7b2c 100644
--- a/arch/arm64/include/asm/crash_core.h
+++ b/arch/arm64/include/asm/crash_reserve.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-#ifndef _ARM64_CRASH_CORE_H
-#define _ARM64_CRASH_CORE_H
+#ifndef _ARM64_CRASH_RESERVE_H
+#define _ARM64_CRASH_RESERVE_H
 
 /* Current arm64 boot protocol requires 2MB alignment */
 #define CRASH_ALIGN                     SZ_2M
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index b9fc064d38d281..7f704ae5c5efcb 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -691,6 +691,7 @@ config FA_DUMP
 	bool "Firmware-assisted dump"
 	depends on PPC64 && (PPC_RTAS || PPC_POWERNV)
 	select CRASH_CORE
+	select CRASH_RESERVE
 	select CRASH_DUMP
 	help
 	  A robust mechanism to get reliable kernel crash dump with
diff --git a/arch/powerpc/mm/nohash/kaslr_booke.c b/arch/powerpc/mm/nohash/kaslr_booke.c
index b4f2786a7d2b0b..cdff129abb1446 100644
--- a/arch/powerpc/mm/nohash/kaslr_booke.c
+++ b/arch/powerpc/mm/nohash/kaslr_booke.c
@@ -13,7 +13,7 @@
 #include <linux/delay.h>
 #include <linux/memblock.h>
 #include <linux/libfdt.h>
-#include <linux/crash_core.h>
+#include <linux/crash_reserve.h>
 #include <linux/of.h>
 #include <linux/of_fdt.h>
 #include <asm/cacheflush.h>
@@ -173,7 +173,7 @@ static __init bool overlaps_region(const void *fdt, u32 start,
 
 static void __init get_crash_kernel(void *fdt, unsigned long size)
 {
-#ifdef CONFIG_CRASH_CORE
+#ifdef CONFIG_CRASH_RESERVE
 	unsigned long long crash_size, crash_base;
 	int ret;
 
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index bffbd869a06828..bd06ad1bb97cbb 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -767,7 +767,7 @@ config ARCH_SUPPORTS_CRASH_DUMP
 	def_bool y
 
 config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
-	def_bool CRASH_CORE
+	def_bool CRASH_RESERVE
 
 config COMPAT
 	bool "Kernel support for 32-bit U-mode"
diff --git a/arch/riscv/include/asm/crash_core.h b/arch/riscv/include/asm/crash_reserve.h
similarity index 78%
rename from arch/riscv/include/asm/crash_core.h
rename to arch/riscv/include/asm/crash_reserve.h
index e1874b23feaf11..013962e63587f3 100644
--- a/arch/riscv/include/asm/crash_core.h
+++ b/arch/riscv/include/asm/crash_reserve.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-#ifndef _RISCV_CRASH_CORE_H
-#define _RISCV_CRASH_CORE_H
+#ifndef _RISCV_CRASH_RESERVE_H
+#define _RISCV_CRASH_RESERVE_H
 
 #define CRASH_ALIGN			PMD_SIZE
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5edec175b9bfc9..71417c5b228c51 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2106,7 +2106,7 @@ config ARCH_SUPPORTS_CRASH_HOTPLUG
 	def_bool y
 
 config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
-	def_bool CRASH_CORE
+	def_bool CRASH_RESEERVE
 
 config PHYSICAL_START
 	hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP)
diff --git a/arch/x86/include/asm/crash_core.h b/arch/x86/include/asm/crash_reserve.h
similarity index 92%
rename from arch/x86/include/asm/crash_core.h
rename to arch/x86/include/asm/crash_reserve.h
index 76af98f4e80126..152239f9554195 100644
--- a/arch/x86/include/asm/crash_core.h
+++ b/arch/x86/include/asm/crash_reserve.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _X86_CRASH_CORE_H
-#define _X86_CRASH_CORE_H
+#ifndef _X86_CRASH_RESERVE_H
+#define _X86_CRASH_RESERVE_H
 
 /* 16M alignment for crash kernel regions */
 #define CRASH_ALIGN             SZ_16M
@@ -39,4 +39,4 @@ static inline unsigned long crash_low_size_default(void)
 #endif
 }
 
-#endif /* _X86_CRASH_CORE_H */
+#endif /* _X86_CRASH_RESERVE_H */
diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index 9eaeaafe0cad3a..1fde49246fa6e3 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -5,14 +5,6 @@
 #include <linux/linkage.h>
 #include <linux/elfcore.h>
 #include <linux/elf.h>
-#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
-#include <asm/crash_core.h>
-#endif
-
-/* Location of a reserved region to hold the crash kernel.
- */
-extern struct resource crashk_res;
-extern struct resource crashk_low_res;
 
 #define CRASH_CORE_NOTE_NAME	   "CORE"
 #define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4)
@@ -87,38 +79,6 @@ Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
 			  void *data, size_t data_len);
 void final_note(Elf_Word *buf);
 
-int __init parse_crashkernel(char *cmdline, unsigned long long system_ram,
-		unsigned long long *crash_size, unsigned long long *crash_base,
-		unsigned long long *low_size, bool *high);
-
-#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
-#ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE
-#define DEFAULT_CRASH_KERNEL_LOW_SIZE	(128UL << 20)
-#endif
-#ifndef CRASH_ALIGN
-#define CRASH_ALIGN			SZ_2M
-#endif
-#ifndef CRASH_ADDR_LOW_MAX
-#define CRASH_ADDR_LOW_MAX		SZ_4G
-#endif
-#ifndef CRASH_ADDR_HIGH_MAX
-#define CRASH_ADDR_HIGH_MAX		memblock_end_of_DRAM()
-#endif
-
-void __init reserve_crashkernel_generic(char *cmdline,
-		unsigned long long crash_size,
-		unsigned long long crash_base,
-		unsigned long long crash_low_size,
-		bool high);
-#else
-static inline void __init reserve_crashkernel_generic(char *cmdline,
-		unsigned long long crash_size,
-		unsigned long long crash_base,
-		unsigned long long crash_low_size,
-		bool high)
-{}
-#endif
-
 /* Alignment required for elf header segment */
 #define ELF_CORE_HEADER_ALIGN   4096
 
diff --git a/include/linux/crash_reserve.h b/include/linux/crash_reserve.h
new file mode 100644
index 00000000000000..5a9df944fb806a
--- /dev/null
+++ b/include/linux/crash_reserve.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef LINUX_CRASH_RESERVE_H
+#define LINUX_CRASH_RESERVE_H
+
+#include <linux/linkage.h>
+#include <linux/elfcore.h>
+#include <linux/elf.h>
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+#include <asm/crash_reserve.h>
+#endif
+
+/* Location of a reserved region to hold the crash kernel.
+ */
+extern struct resource crashk_res;
+extern struct resource crashk_low_res;
+
+int __init parse_crashkernel(char *cmdline, unsigned long long system_ram,
+		unsigned long long *crash_size, unsigned long long *crash_base,
+		unsigned long long *low_size, bool *high);
+
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+#ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE
+#define DEFAULT_CRASH_KERNEL_LOW_SIZE	(128UL << 20)
+#endif
+#ifndef CRASH_ALIGN
+#define CRASH_ALIGN			SZ_2M
+#endif
+#ifndef CRASH_ADDR_LOW_MAX
+#define CRASH_ADDR_LOW_MAX		SZ_4G
+#endif
+#ifndef CRASH_ADDR_HIGH_MAX
+#define CRASH_ADDR_HIGH_MAX		memblock_end_of_DRAM()
+#endif
+
+void __init reserve_crashkernel_generic(char *cmdline,
+		unsigned long long crash_size,
+		unsigned long long crash_base,
+		unsigned long long crash_low_size,
+		bool high);
+#else
+static inline void __init reserve_crashkernel_generic(char *cmdline,
+		unsigned long long crash_size,
+		unsigned long long crash_base,
+		unsigned long long crash_low_size,
+		bool high)
+{}
+#endif
+#endif /* LINUX_CRASH_RESERVE_H */
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 400cb6c02176e0..6d79bfb52e5bf0 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -16,6 +16,7 @@
 #if !defined(__ASSEMBLY__)
 
 #include <linux/crash_core.h>
+#include <linux/crash_reserve.h>
 #include <asm/io.h>
 #include <linux/range.h>
 
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
index 946dffa048b74c..8b7be71edd859e 100644
--- a/kernel/Kconfig.kexec
+++ b/kernel/Kconfig.kexec
@@ -2,11 +2,15 @@
 
 menu "Kexec and crash features"
 
+config CRASH_RESERVE
+	bool
+
 config CRASH_CORE
 	bool
 
 config KEXEC_CORE
 	select CRASH_CORE
+	select CRASH_RESERVE
 	bool
 
 config KEXEC_ELF
@@ -96,7 +100,6 @@ config KEXEC_JUMP
 config CRASH_DUMP
 	bool "kernel crash dumps"
 	depends on ARCH_SUPPORTS_CRASH_DUMP
-	select CRASH_CORE
 	select KEXEC_CORE
 	help
 	  Generate crash dump after being started by kexec.
diff --git a/kernel/Makefile b/kernel/Makefile
index ce105a5558fcfa..05fa88b3ab7499 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -69,6 +69,7 @@ obj-$(CONFIG_KALLSYMS) += kallsyms.o
 obj-$(CONFIG_KALLSYMS_SELFTEST) += kallsyms_selftest.o
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_CRASH_CORE) += crash_core.o
+obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o
 obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
 obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 49b31e59d3ccd1..ae0d1ce89b46b8 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -34,444 +34,6 @@ u32 *vmcoreinfo_note;
 /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */
 static unsigned char *vmcoreinfo_data_safecopy;
 
-/* Location of the reserved area for the crash kernel */
-struct resource crashk_res = {
-	.name  = "Crash kernel",
-	.start = 0,
-	.end   = 0,
-	.flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
-	.desc  = IORES_DESC_CRASH_KERNEL
-};
-struct resource crashk_low_res = {
-	.name  = "Crash kernel",
-	.start = 0,
-	.end   = 0,
-	.flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
-	.desc  = IORES_DESC_CRASH_KERNEL
-};
-
-/*
- * parsing the "crashkernel" commandline
- *
- * this code is intended to be called from architecture specific code
- */
-
-
-/*
- * This function parses command lines in the format
- *
- *   crashkernel=ramsize-range:size[,...][@offset]
- *
- * The function returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_mem(char *cmdline,
-					unsigned long long system_ram,
-					unsigned long long *crash_size,
-					unsigned long long *crash_base)
-{
-	char *cur = cmdline, *tmp;
-	unsigned long long total_mem = system_ram;
-
-	/*
-	 * Firmware sometimes reserves some memory regions for its own use,
-	 * so the system memory size is less than the actual physical memory
-	 * size. Work around this by rounding up the total size to 128M,
-	 * which is enough for most test cases.
-	 */
-	total_mem = roundup(total_mem, SZ_128M);
-
-	/* for each entry of the comma-separated list */
-	do {
-		unsigned long long start, end = ULLONG_MAX, size;
-
-		/* get the start of the range */
-		start = memparse(cur, &tmp);
-		if (cur == tmp) {
-			pr_warn("crashkernel: Memory value expected\n");
-			return -EINVAL;
-		}
-		cur = tmp;
-		if (*cur != '-') {
-			pr_warn("crashkernel: '-' expected\n");
-			return -EINVAL;
-		}
-		cur++;
-
-		/* if no ':' is here, than we read the end */
-		if (*cur != ':') {
-			end = memparse(cur, &tmp);
-			if (cur == tmp) {
-				pr_warn("crashkernel: Memory value expected\n");
-				return -EINVAL;
-			}
-			cur = tmp;
-			if (end <= start) {
-				pr_warn("crashkernel: end <= start\n");
-				return -EINVAL;
-			}
-		}
-
-		if (*cur != ':') {
-			pr_warn("crashkernel: ':' expected\n");
-			return -EINVAL;
-		}
-		cur++;
-
-		size = memparse(cur, &tmp);
-		if (cur == tmp) {
-			pr_warn("Memory value expected\n");
-			return -EINVAL;
-		}
-		cur = tmp;
-		if (size >= total_mem) {
-			pr_warn("crashkernel: invalid size\n");
-			return -EINVAL;
-		}
-
-		/* match ? */
-		if (total_mem >= start && total_mem < end) {
-			*crash_size = size;
-			break;
-		}
-	} while (*cur++ == ',');
-
-	if (*crash_size > 0) {
-		while (*cur && *cur != ' ' && *cur != '@')
-			cur++;
-		if (*cur == '@') {
-			cur++;
-			*crash_base = memparse(cur, &tmp);
-			if (cur == tmp) {
-				pr_warn("Memory value expected after '@'\n");
-				return -EINVAL;
-			}
-		}
-	} else
-		pr_info("crashkernel size resulted in zero bytes\n");
-
-	return 0;
-}
-
-/*
- * That function parses "simple" (old) crashkernel command lines like
- *
- *	crashkernel=size[@offset]
- *
- * It returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_simple(char *cmdline,
-					   unsigned long long *crash_size,
-					   unsigned long long *crash_base)
-{
-	char *cur = cmdline;
-
-	*crash_size = memparse(cmdline, &cur);
-	if (cmdline == cur) {
-		pr_warn("crashkernel: memory value expected\n");
-		return -EINVAL;
-	}
-
-	if (*cur == '@')
-		*crash_base = memparse(cur+1, &cur);
-	else if (*cur != ' ' && *cur != '\0') {
-		pr_warn("crashkernel: unrecognized char: %c\n", *cur);
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-#define SUFFIX_HIGH 0
-#define SUFFIX_LOW  1
-#define SUFFIX_NULL 2
-static __initdata char *suffix_tbl[] = {
-	[SUFFIX_HIGH] = ",high",
-	[SUFFIX_LOW]  = ",low",
-	[SUFFIX_NULL] = NULL,
-};
-
-/*
- * That function parses "suffix"  crashkernel command lines like
- *
- *	crashkernel=size,[high|low]
- *
- * It returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_suffix(char *cmdline,
-					   unsigned long long *crash_size,
-					   const char *suffix)
-{
-	char *cur = cmdline;
-
-	*crash_size = memparse(cmdline, &cur);
-	if (cmdline == cur) {
-		pr_warn("crashkernel: memory value expected\n");
-		return -EINVAL;
-	}
-
-	/* check with suffix */
-	if (strncmp(cur, suffix, strlen(suffix))) {
-		pr_warn("crashkernel: unrecognized char: %c\n", *cur);
-		return -EINVAL;
-	}
-	cur += strlen(suffix);
-	if (*cur != ' ' && *cur != '\0') {
-		pr_warn("crashkernel: unrecognized char: %c\n", *cur);
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-static __init char *get_last_crashkernel(char *cmdline,
-			     const char *name,
-			     const char *suffix)
-{
-	char *p = cmdline, *ck_cmdline = NULL;
-
-	/* find crashkernel and use the last one if there are more */
-	p = strstr(p, name);
-	while (p) {
-		char *end_p = strchr(p, ' ');
-		char *q;
-
-		if (!end_p)
-			end_p = p + strlen(p);
-
-		if (!suffix) {
-			int i;
-
-			/* skip the one with any known suffix */
-			for (i = 0; suffix_tbl[i]; i++) {
-				q = end_p - strlen(suffix_tbl[i]);
-				if (!strncmp(q, suffix_tbl[i],
-					     strlen(suffix_tbl[i])))
-					goto next;
-			}
-			ck_cmdline = p;
-		} else {
-			q = end_p - strlen(suffix);
-			if (!strncmp(q, suffix, strlen(suffix)))
-				ck_cmdline = p;
-		}
-next:
-		p = strstr(p+1, name);
-	}
-
-	return ck_cmdline;
-}
-
-static int __init __parse_crashkernel(char *cmdline,
-			     unsigned long long system_ram,
-			     unsigned long long *crash_size,
-			     unsigned long long *crash_base,
-			     const char *suffix)
-{
-	char *first_colon, *first_space;
-	char *ck_cmdline;
-	char *name = "crashkernel=";
-
-	BUG_ON(!crash_size || !crash_base);
-	*crash_size = 0;
-	*crash_base = 0;
-
-	ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
-	if (!ck_cmdline)
-		return -ENOENT;
-
-	ck_cmdline += strlen(name);
-
-	if (suffix)
-		return parse_crashkernel_suffix(ck_cmdline, crash_size,
-				suffix);
-	/*
-	 * if the commandline contains a ':', then that's the extended
-	 * syntax -- if not, it must be the classic syntax
-	 */
-	first_colon = strchr(ck_cmdline, ':');
-	first_space = strchr(ck_cmdline, ' ');
-	if (first_colon && (!first_space || first_colon < first_space))
-		return parse_crashkernel_mem(ck_cmdline, system_ram,
-				crash_size, crash_base);
-
-	return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
-}
-
-/*
- * That function is the entry point for command line parsing and should be
- * called from the arch-specific code.
- *
- * If crashkernel=,high|low is supported on architecture, non-NULL values
- * should be passed to parameters 'low_size' and 'high'.
- */
-int __init parse_crashkernel(char *cmdline,
-			     unsigned long long system_ram,
-			     unsigned long long *crash_size,
-			     unsigned long long *crash_base,
-			     unsigned long long *low_size,
-			     bool *high)
-{
-	int ret;
-
-	/* crashkernel=X[@offset] */
-	ret = __parse_crashkernel(cmdline, system_ram, crash_size,
-				crash_base, NULL);
-#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
-	/*
-	 * If non-NULL 'high' passed in and no normal crashkernel
-	 * setting detected, try parsing crashkernel=,high|low.
-	 */
-	if (high && ret == -ENOENT) {
-		ret = __parse_crashkernel(cmdline, 0, crash_size,
-				crash_base, suffix_tbl[SUFFIX_HIGH]);
-		if (ret || !*crash_size)
-			return -EINVAL;
-
-		/*
-		 * crashkernel=Y,low can be specified or not, but invalid value
-		 * is not allowed.
-		 */
-		ret = __parse_crashkernel(cmdline, 0, low_size,
-				crash_base, suffix_tbl[SUFFIX_LOW]);
-		if (ret == -ENOENT) {
-			*low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE;
-			ret = 0;
-		} else if (ret) {
-			return ret;
-		}
-
-		*high = true;
-	}
-#endif
-	if (!*crash_size)
-		ret = -EINVAL;
-
-	return ret;
-}
-
-/*
- * Add a dummy early_param handler to mark crashkernel= as a known command line
- * parameter and suppress incorrect warnings in init/main.c.
- */
-static int __init parse_crashkernel_dummy(char *arg)
-{
-	return 0;
-}
-early_param("crashkernel", parse_crashkernel_dummy);
-
-#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
-static int __init reserve_crashkernel_low(unsigned long long low_size)
-{
-#ifdef CONFIG_64BIT
-	unsigned long long low_base;
-
-	low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX);
-	if (!low_base) {
-		pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size);
-		return -ENOMEM;
-	}
-
-	pr_info("crashkernel low memory reserved: 0x%08llx - 0x%08llx (%lld MB)\n",
-		low_base, low_base + low_size, low_size >> 20);
-
-	crashk_low_res.start = low_base;
-	crashk_low_res.end   = low_base + low_size - 1;
-#endif
-	return 0;
-}
-
-void __init reserve_crashkernel_generic(char *cmdline,
-			     unsigned long long crash_size,
-			     unsigned long long crash_base,
-			     unsigned long long crash_low_size,
-			     bool high)
-{
-	unsigned long long search_end = CRASH_ADDR_LOW_MAX, search_base = 0;
-	bool fixed_base = false;
-
-	/* User specifies base address explicitly. */
-	if (crash_base) {
-		fixed_base = true;
-		search_base = crash_base;
-		search_end = crash_base + crash_size;
-	} else if (high) {
-		search_base = CRASH_ADDR_LOW_MAX;
-		search_end = CRASH_ADDR_HIGH_MAX;
-	}
-
-retry:
-	crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
-					       search_base, search_end);
-	if (!crash_base) {
-		/*
-		 * For crashkernel=size[KMG]@offset[KMG], print out failure
-		 * message if can't reserve the specified region.
-		 */
-		if (fixed_base) {
-			pr_warn("crashkernel reservation failed - memory is in use.\n");
-			return;
-		}
-
-		/*
-		 * For crashkernel=size[KMG], if the first attempt was for
-		 * low memory, fall back to high memory, the minimum required
-		 * low memory will be reserved later.
-		 */
-		if (!high && search_end == CRASH_ADDR_LOW_MAX) {
-			search_end = CRASH_ADDR_HIGH_MAX;
-			search_base = CRASH_ADDR_LOW_MAX;
-			crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE;
-			goto retry;
-		}
-
-		/*
-		 * For crashkernel=size[KMG],high, if the first attempt was
-		 * for high memory, fall back to low memory.
-		 */
-		if (high && search_end == CRASH_ADDR_HIGH_MAX) {
-			search_end = CRASH_ADDR_LOW_MAX;
-			search_base = 0;
-			goto retry;
-		}
-		pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
-			crash_size);
-		return;
-	}
-
-	if ((crash_base >= CRASH_ADDR_LOW_MAX) &&
-	     crash_low_size && reserve_crashkernel_low(crash_low_size)) {
-		memblock_phys_free(crash_base, crash_size);
-		return;
-	}
-
-	pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
-		crash_base, crash_base + crash_size, crash_size >> 20);
-
-	/*
-	 * The crashkernel memory will be removed from the kernel linear
-	 * map. Inform kmemleak so that it won't try to access it.
-	 */
-	kmemleak_ignore_phys(crash_base);
-	if (crashk_low_res.end)
-		kmemleak_ignore_phys(crashk_low_res.start);
-
-	crashk_res.start = crash_base;
-	crashk_res.end = crash_base + crash_size - 1;
-}
-
-static __init int insert_crashkernel_resources(void)
-{
-	if (crashk_res.start < crashk_res.end)
-		insert_resource(&iomem_resource, &crashk_res);
-
-	if (crashk_low_res.start < crashk_low_res.end)
-		insert_resource(&iomem_resource, &crashk_low_res);
-
-	return 0;
-}
-early_initcall(insert_crashkernel_resources);
-#endif
-
 int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
 			  void **addr, unsigned long *sz)
 {
diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c
new file mode 100644
index 00000000000000..bbb6c3cb00e460
--- /dev/null
+++ b/kernel/crash_reserve.c
@@ -0,0 +1,464 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * crash.c - kernel crash support code.
+ * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
+ */
+
+#include <linux/buildid.h>
+#include <linux/init.h>
+#include <linux/utsname.h>
+#include <linux/vmalloc.h>
+#include <linux/sizes.h>
+#include <linux/kexec.h>
+#include <linux/memory.h>
+#include <linux/cpuhotplug.h>
+#include <linux/memblock.h>
+#include <linux/kexec.h>
+#include <linux/kmemleak.h>
+
+#include <asm/page.h>
+#include <asm/sections.h>
+
+#include <crypto/sha1.h>
+
+#include "kallsyms_internal.h"
+#include "kexec_internal.h"
+
+/* Location of the reserved area for the crash kernel */
+struct resource crashk_res = {
+	.name  = "Crash kernel",
+	.start = 0,
+	.end   = 0,
+	.flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
+	.desc  = IORES_DESC_CRASH_KERNEL
+};
+struct resource crashk_low_res = {
+	.name  = "Crash kernel",
+	.start = 0,
+	.end   = 0,
+	.flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
+	.desc  = IORES_DESC_CRASH_KERNEL
+};
+
+/*
+ * parsing the "crashkernel" commandline
+ *
+ * this code is intended to be called from architecture specific code
+ */
+
+
+/*
+ * This function parses command lines in the format
+ *
+ *   crashkernel=ramsize-range:size[,...][@offset]
+ *
+ * The function returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_mem(char *cmdline,
+					unsigned long long system_ram,
+					unsigned long long *crash_size,
+					unsigned long long *crash_base)
+{
+	char *cur = cmdline, *tmp;
+	unsigned long long total_mem = system_ram;
+
+	/*
+	 * Firmware sometimes reserves some memory regions for its own use,
+	 * so the system memory size is less than the actual physical memory
+	 * size. Work around this by rounding up the total size to 128M,
+	 * which is enough for most test cases.
+	 */
+	total_mem = roundup(total_mem, SZ_128M);
+
+	/* for each entry of the comma-separated list */
+	do {
+		unsigned long long start, end = ULLONG_MAX, size;
+
+		/* get the start of the range */
+		start = memparse(cur, &tmp);
+		if (cur == tmp) {
+			pr_warn("crashkernel: Memory value expected\n");
+			return -EINVAL;
+		}
+		cur = tmp;
+		if (*cur != '-') {
+			pr_warn("crashkernel: '-' expected\n");
+			return -EINVAL;
+		}
+		cur++;
+
+		/* if no ':' is here, than we read the end */
+		if (*cur != ':') {
+			end = memparse(cur, &tmp);
+			if (cur == tmp) {
+				pr_warn("crashkernel: Memory value expected\n");
+				return -EINVAL;
+			}
+			cur = tmp;
+			if (end <= start) {
+				pr_warn("crashkernel: end <= start\n");
+				return -EINVAL;
+			}
+		}
+
+		if (*cur != ':') {
+			pr_warn("crashkernel: ':' expected\n");
+			return -EINVAL;
+		}
+		cur++;
+
+		size = memparse(cur, &tmp);
+		if (cur == tmp) {
+			pr_warn("Memory value expected\n");
+			return -EINVAL;
+		}
+		cur = tmp;
+		if (size >= total_mem) {
+			pr_warn("crashkernel: invalid size\n");
+			return -EINVAL;
+		}
+
+		/* match ? */
+		if (total_mem >= start && total_mem < end) {
+			*crash_size = size;
+			break;
+		}
+	} while (*cur++ == ',');
+
+	if (*crash_size > 0) {
+		while (*cur && *cur != ' ' && *cur != '@')
+			cur++;
+		if (*cur == '@') {
+			cur++;
+			*crash_base = memparse(cur, &tmp);
+			if (cur == tmp) {
+				pr_warn("Memory value expected after '@'\n");
+				return -EINVAL;
+			}
+		}
+	} else
+		pr_info("crashkernel size resulted in zero bytes\n");
+
+	return 0;
+}
+
+/*
+ * That function parses "simple" (old) crashkernel command lines like
+ *
+ *	crashkernel=size[@offset]
+ *
+ * It returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_simple(char *cmdline,
+					   unsigned long long *crash_size,
+					   unsigned long long *crash_base)
+{
+	char *cur = cmdline;
+
+	*crash_size = memparse(cmdline, &cur);
+	if (cmdline == cur) {
+		pr_warn("crashkernel: memory value expected\n");
+		return -EINVAL;
+	}
+
+	if (*cur == '@')
+		*crash_base = memparse(cur+1, &cur);
+	else if (*cur != ' ' && *cur != '\0') {
+		pr_warn("crashkernel: unrecognized char: %c\n", *cur);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+#define SUFFIX_HIGH 0
+#define SUFFIX_LOW  1
+#define SUFFIX_NULL 2
+static __initdata char *suffix_tbl[] = {
+	[SUFFIX_HIGH] = ",high",
+	[SUFFIX_LOW]  = ",low",
+	[SUFFIX_NULL] = NULL,
+};
+
+/*
+ * That function parses "suffix"  crashkernel command lines like
+ *
+ *	crashkernel=size,[high|low]
+ *
+ * It returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_suffix(char *cmdline,
+					   unsigned long long *crash_size,
+					   const char *suffix)
+{
+	char *cur = cmdline;
+
+	*crash_size = memparse(cmdline, &cur);
+	if (cmdline == cur) {
+		pr_warn("crashkernel: memory value expected\n");
+		return -EINVAL;
+	}
+
+	/* check with suffix */
+	if (strncmp(cur, suffix, strlen(suffix))) {
+		pr_warn("crashkernel: unrecognized char: %c\n", *cur);
+		return -EINVAL;
+	}
+	cur += strlen(suffix);
+	if (*cur != ' ' && *cur != '\0') {
+		pr_warn("crashkernel: unrecognized char: %c\n", *cur);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static __init char *get_last_crashkernel(char *cmdline,
+			     const char *name,
+			     const char *suffix)
+{
+	char *p = cmdline, *ck_cmdline = NULL;
+
+	/* find crashkernel and use the last one if there are more */
+	p = strstr(p, name);
+	while (p) {
+		char *end_p = strchr(p, ' ');
+		char *q;
+
+		if (!end_p)
+			end_p = p + strlen(p);
+
+		if (!suffix) {
+			int i;
+
+			/* skip the one with any known suffix */
+			for (i = 0; suffix_tbl[i]; i++) {
+				q = end_p - strlen(suffix_tbl[i]);
+				if (!strncmp(q, suffix_tbl[i],
+					     strlen(suffix_tbl[i])))
+					goto next;
+			}
+			ck_cmdline = p;
+		} else {
+			q = end_p - strlen(suffix);
+			if (!strncmp(q, suffix, strlen(suffix)))
+				ck_cmdline = p;
+		}
+next:
+		p = strstr(p+1, name);
+	}
+
+	return ck_cmdline;
+}
+
+static int __init __parse_crashkernel(char *cmdline,
+			     unsigned long long system_ram,
+			     unsigned long long *crash_size,
+			     unsigned long long *crash_base,
+			     const char *suffix)
+{
+	char *first_colon, *first_space;
+	char *ck_cmdline;
+	char *name = "crashkernel=";
+
+	BUG_ON(!crash_size || !crash_base);
+	*crash_size = 0;
+	*crash_base = 0;
+
+	ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
+	if (!ck_cmdline)
+		return -ENOENT;
+
+	ck_cmdline += strlen(name);
+
+	if (suffix)
+		return parse_crashkernel_suffix(ck_cmdline, crash_size,
+				suffix);
+	/*
+	 * if the commandline contains a ':', then that's the extended
+	 * syntax -- if not, it must be the classic syntax
+	 */
+	first_colon = strchr(ck_cmdline, ':');
+	first_space = strchr(ck_cmdline, ' ');
+	if (first_colon && (!first_space || first_colon < first_space))
+		return parse_crashkernel_mem(ck_cmdline, system_ram,
+				crash_size, crash_base);
+
+	return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
+}
+
+/*
+ * That function is the entry point for command line parsing and should be
+ * called from the arch-specific code.
+ *
+ * If crashkernel=,high|low is supported on architecture, non-NULL values
+ * should be passed to parameters 'low_size' and 'high'.
+ */
+int __init parse_crashkernel(char *cmdline,
+			     unsigned long long system_ram,
+			     unsigned long long *crash_size,
+			     unsigned long long *crash_base,
+			     unsigned long long *low_size,
+			     bool *high)
+{
+	int ret;
+
+	/* crashkernel=X[@offset] */
+	ret = __parse_crashkernel(cmdline, system_ram, crash_size,
+				crash_base, NULL);
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+	/*
+	 * If non-NULL 'high' passed in and no normal crashkernel
+	 * setting detected, try parsing crashkernel=,high|low.
+	 */
+	if (high && ret == -ENOENT) {
+		ret = __parse_crashkernel(cmdline, 0, crash_size,
+				crash_base, suffix_tbl[SUFFIX_HIGH]);
+		if (ret || !*crash_size)
+			return -EINVAL;
+
+		/*
+		 * crashkernel=Y,low can be specified or not, but invalid value
+		 * is not allowed.
+		 */
+		ret = __parse_crashkernel(cmdline, 0, low_size,
+				crash_base, suffix_tbl[SUFFIX_LOW]);
+		if (ret == -ENOENT) {
+			*low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE;
+			ret = 0;
+		} else if (ret) {
+			return ret;
+		}
+
+		*high = true;
+	}
+#endif
+	if (!*crash_size)
+		ret = -EINVAL;
+
+	return ret;
+}
+
+/*
+ * Add a dummy early_param handler to mark crashkernel= as a known command line
+ * parameter and suppress incorrect warnings in init/main.c.
+ */
+static int __init parse_crashkernel_dummy(char *arg)
+{
+	return 0;
+}
+early_param("crashkernel", parse_crashkernel_dummy);
+
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+static int __init reserve_crashkernel_low(unsigned long long low_size)
+{
+#ifdef CONFIG_64BIT
+	unsigned long long low_base;
+
+	low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX);
+	if (!low_base) {
+		pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size);
+		return -ENOMEM;
+	}
+
+	pr_info("crashkernel low memory reserved: 0x%08llx - 0x%08llx (%lld MB)\n",
+		low_base, low_base + low_size, low_size >> 20);
+
+	crashk_low_res.start = low_base;
+	crashk_low_res.end   = low_base + low_size - 1;
+	insert_resource(&iomem_resource, &crashk_low_res);
+#endif
+	return 0;
+}
+
+void __init reserve_crashkernel_generic(char *cmdline,
+			     unsigned long long crash_size,
+			     unsigned long long crash_base,
+			     unsigned long long crash_low_size,
+			     bool high)
+{
+	unsigned long long search_end = CRASH_ADDR_LOW_MAX, search_base = 0;
+	bool fixed_base = false;
+
+	/* User specifies base address explicitly. */
+	if (crash_base) {
+		fixed_base = true;
+		search_base = crash_base;
+		search_end = crash_base + crash_size;
+	} else if (high) {
+		search_base = CRASH_ADDR_LOW_MAX;
+		search_end = CRASH_ADDR_HIGH_MAX;
+	}
+
+retry:
+	crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
+					       search_base, search_end);
+	if (!crash_base) {
+		/*
+		 * For crashkernel=size[KMG]@offset[KMG], print out failure
+		 * message if can't reserve the specified region.
+		 */
+		if (fixed_base) {
+			pr_warn("crashkernel reservation failed - memory is in use.\n");
+			return;
+		}
+
+		/*
+		 * For crashkernel=size[KMG], if the first attempt was for
+		 * low memory, fall back to high memory, the minimum required
+		 * low memory will be reserved later.
+		 */
+		if (!high && search_end == CRASH_ADDR_LOW_MAX) {
+			search_end = CRASH_ADDR_HIGH_MAX;
+			search_base = CRASH_ADDR_LOW_MAX;
+			crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE;
+			goto retry;
+		}
+
+		/*
+		 * For crashkernel=size[KMG],high, if the first attempt was
+		 * for high memory, fall back to low memory.
+		 */
+		if (high && search_end == CRASH_ADDR_HIGH_MAX) {
+			search_end = CRASH_ADDR_LOW_MAX;
+			search_base = 0;
+			goto retry;
+		}
+		pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
+			crash_size);
+		return;
+	}
+
+	if ((crash_base >= CRASH_ADDR_LOW_MAX) &&
+	     crash_low_size && reserve_crashkernel_low(crash_low_size)) {
+		memblock_phys_free(crash_base, crash_size);
+		return;
+	}
+
+	pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
+		crash_base, crash_base + crash_size, crash_size >> 20);
+
+	/*
+	 * The crashkernel memory will be removed from the kernel linear
+	 * map. Inform kmemleak so that it won't try to access it.
+	 */
+	kmemleak_ignore_phys(crash_base);
+	if (crashk_low_res.end)
+		kmemleak_ignore_phys(crashk_low_res.start);
+
+	crashk_res.start = crash_base;
+	crashk_res.end = crash_base + crash_size - 1;
+}
+
+static __init int insert_crashkernel_resources(void)
+{
+	if (crashk_res.start < crashk_res.end)
+		insert_resource(&iomem_resource, &crashk_res);
+
+	if (crashk_low_res.start < crashk_low_res.end)
+		insert_resource(&iomem_resource, &crashk_low_res);
+
+	return 0;
+}
+early_initcall(insert_crashkernel_resources);
+#endif

From b2044843b278023d73c328cdbf9c906e224a8003 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Sun, 28 Jan 2024 22:00:15 -0800
Subject: [PATCH 470/707] 
 kexec-split-crashkernel-reservation-code-out-from-crash_corec-fix

s/CRASH_RESEERVE/CRASH_RESERVE/, per Klara Modin

Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Baoquan He <bhe@redhat.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Hari Bathini <hbathini@linux.ibm.com>
Cc: Pingfan Liu <piliu@redhat.com>
Cc: Klara Modin <klarasmodin@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/x86/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 71417c5b228c51..5bd9258151546e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2106,7 +2106,7 @@ config ARCH_SUPPORTS_CRASH_HOTPLUG
 	def_bool y
 
 config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
-	def_bool CRASH_RESEERVE
+	def_bool CRASH_RESERVE
 
 config PHYSICAL_START
 	hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP)

From 6516ae2baeb5c04b3edd13020383df8c1a478095 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Wed, 24 Jan 2024 13:12:42 +0800
Subject: [PATCH 471/707] crash: split vmcoreinfo exporting code out from
 crash_core.c

Now move the relevant codes into separate files:
kernel/crash_reserve.c, include/linux/crash_reserve.h.

And add config item CRASH_RESERVE to control its enabling.

And also update the old ifdeffery of CONFIG_CRASH_CORE, including of
<linux/crash_core.h> and config item dependency on CRASH_CORE
accordingly.

And also do renaming as follows:
 - arch/xxx/kernel/{crash_core.c => vmcore_info.c}
because they are only related to vmcoreinfo exporting on x86, arm64,
riscv.

And also Remove config item CRASH_CORE, and rely on CONFIG_KEXEC_CORE to
decide if build in crash_core.c.

Link: https://lkml.kernel.org/r/20240124051254.67105-3-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Hari Bathini <hbathini@linux.ibm.com>
Cc: Pingfan Liu <piliu@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm64/kernel/Makefile                    |   2 +-
 .../kernel/{crash_core.c => vmcore_info.c}    |   2 +-
 arch/powerpc/Kconfig                          |   2 +-
 arch/powerpc/kernel/setup-common.c            |   2 +-
 arch/powerpc/platforms/powernv/opal-core.c    |   2 +-
 arch/riscv/kernel/Makefile                    |   2 +-
 .../kernel/{crash_core.c => vmcore_info.c}    |   2 +-
 arch/x86/kernel/Makefile                      |   2 +-
 .../{crash_core_32.c => vmcore_info_32.c}     |   2 +-
 .../{crash_core_64.c => vmcore_info_64.c}     |   2 +-
 drivers/firmware/qemu_fw_cfg.c                |  14 +-
 fs/proc/Kconfig                               |   2 +-
 fs/proc/kcore.c                               |   2 +-
 include/linux/buildid.h                       |   2 +-
 include/linux/crash_core.h                    |  73 ------
 include/linux/kexec.h                         |   1 +
 include/linux/vmcore_info.h                   |  81 ++++++
 kernel/Kconfig.kexec                          |   4 +-
 kernel/Makefile                               |   4 +-
 kernel/crash_core.c                           | 206 ----------------
 kernel/ksysfs.c                               |   6 +-
 kernel/printk/printk.c                        |   4 +-
 kernel/vmcore_info.c                          | 231 ++++++++++++++++++
 lib/buildid.c                                 |   2 +-
 24 files changed, 343 insertions(+), 309 deletions(-)
 rename arch/arm64/kernel/{crash_core.c => vmcore_info.c} (97%)
 rename arch/riscv/kernel/{crash_core.c => vmcore_info.c} (96%)
 rename arch/x86/kernel/{crash_core_32.c => vmcore_info_32.c} (90%)
 rename arch/x86/kernel/{crash_core_64.c => vmcore_info_64.c} (94%)
 create mode 100644 include/linux/vmcore_info.h
 create mode 100644 kernel/vmcore_info.c

diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index e5d03a7039b4bf..9fd638b91f38d0 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -66,7 +66,7 @@ obj-$(CONFIG_KEXEC_FILE)		+= machine_kexec_file.o kexec_image.o
 obj-$(CONFIG_ARM64_RELOC_TEST)		+= arm64-reloc-test.o
 arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o
 obj-$(CONFIG_CRASH_DUMP)		+= crash_dump.o
-obj-$(CONFIG_CRASH_CORE)		+= crash_core.o
+obj-$(CONFIG_VMCORE_INFO)		+= vmcore_info.o
 obj-$(CONFIG_ARM_SDE_INTERFACE)		+= sdei.o
 obj-$(CONFIG_ARM64_PTR_AUTH)		+= pointer_auth.o
 obj-$(CONFIG_ARM64_MTE)			+= mte.o
diff --git a/arch/arm64/kernel/crash_core.c b/arch/arm64/kernel/vmcore_info.c
similarity index 97%
rename from arch/arm64/kernel/crash_core.c
rename to arch/arm64/kernel/vmcore_info.c
index 2a24199a9b81e0..b19d5d6cb8b387 100644
--- a/arch/arm64/kernel/crash_core.c
+++ b/arch/arm64/kernel/vmcore_info.c
@@ -4,7 +4,7 @@
  * Copyright (C) Huawei Futurewei Technologies.
  */
 
-#include <linux/crash_core.h>
+#include <linux/vmcore_info.h>
 #include <asm/cpufeature.h>
 #include <asm/memory.h>
 #include <asm/pgtable-hwdef.h>
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 7f704ae5c5efcb..495d197c9b2751 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -690,7 +690,7 @@ config ARCH_SELECTS_CRASH_DUMP
 config FA_DUMP
 	bool "Firmware-assisted dump"
 	depends on PPC64 && (PPC_RTAS || PPC_POWERNV)
-	select CRASH_CORE
+	select VMCORE_INFO
 	select CRASH_RESERVE
 	select CRASH_DUMP
 	help
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 9b142b9d5187b2..733f210ffda1fe 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -109,7 +109,7 @@ int ppc_do_canonicalize_irqs;
 EXPORT_SYMBOL(ppc_do_canonicalize_irqs);
 #endif
 
-#ifdef CONFIG_CRASH_CORE
+#ifdef CONFIG_VMCORE_INFO
 /* This keeps a track of which one is the crashing cpu. */
 int crashing_cpu = -1;
 #endif
diff --git a/arch/powerpc/platforms/powernv/opal-core.c b/arch/powerpc/platforms/powernv/opal-core.c
index bb7657115f1d27..c9a9b759cc928b 100644
--- a/arch/powerpc/platforms/powernv/opal-core.c
+++ b/arch/powerpc/platforms/powernv/opal-core.c
@@ -16,7 +16,7 @@
 #include <linux/kobject.h>
 #include <linux/sysfs.h>
 #include <linux/slab.h>
-#include <linux/crash_core.h>
+#include <linux/vmcore_info.h>
 #include <linux/of.h>
 
 #include <asm/page.h>
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index f71910718053d8..d6fd8dcfceb5e3 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -92,7 +92,7 @@ obj-$(CONFIG_KGDB)		+= kgdb.o
 obj-$(CONFIG_KEXEC_CORE)	+= kexec_relocate.o crash_save_regs.o machine_kexec.o
 obj-$(CONFIG_KEXEC_FILE)	+= elf_kexec.o machine_kexec_file.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump.o
-obj-$(CONFIG_CRASH_CORE)	+= crash_core.o
+obj-$(CONFIG_VMCORE_INFO)	+= vmcore_info.o
 
 obj-$(CONFIG_JUMP_LABEL)	+= jump_label.o
 
diff --git a/arch/riscv/kernel/crash_core.c b/arch/riscv/kernel/vmcore_info.c
similarity index 96%
rename from arch/riscv/kernel/crash_core.c
rename to arch/riscv/kernel/vmcore_info.c
index d18d529fd9b984..6d7a22522d6309 100644
--- a/arch/riscv/kernel/crash_core.c
+++ b/arch/riscv/kernel/vmcore_info.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 
-#include <linux/crash_core.h>
+#include <linux/vmcore_info.h>
 #include <linux/pagemap.h>
 
 void arch_crash_save_vmcoreinfo(void)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 0000325ab98f4d..913d4022131eba 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -98,7 +98,7 @@ obj-$(CONFIG_FTRACE_SYSCALLS)	+= ftrace.o
 obj-$(CONFIG_X86_TSC)		+= trace_clock.o
 obj-$(CONFIG_TRACING)		+= trace.o
 obj-$(CONFIG_RETHOOK)		+= rethook.o
-obj-$(CONFIG_CRASH_CORE)	+= crash_core_$(BITS).o
+obj-$(CONFIG_VMCORE_INFO)	+= vmcore_info_$(BITS).o
 obj-$(CONFIG_KEXEC_CORE)	+= machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC_CORE)	+= relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_KEXEC_FILE)	+= kexec-bzimage64.o
diff --git a/arch/x86/kernel/crash_core_32.c b/arch/x86/kernel/vmcore_info_32.c
similarity index 90%
rename from arch/x86/kernel/crash_core_32.c
rename to arch/x86/kernel/vmcore_info_32.c
index 8a89c109e20a6c..5995a749288a95 100644
--- a/arch/x86/kernel/crash_core_32.c
+++ b/arch/x86/kernel/vmcore_info_32.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 
-#include <linux/crash_core.h>
+#include <linux/vmcore_info.h>
 #include <linux/pgtable.h>
 
 #include <asm/setup.h>
diff --git a/arch/x86/kernel/crash_core_64.c b/arch/x86/kernel/vmcore_info_64.c
similarity index 94%
rename from arch/x86/kernel/crash_core_64.c
rename to arch/x86/kernel/vmcore_info_64.c
index 7d255f882afe6f..0dec7d86875447 100644
--- a/arch/x86/kernel/crash_core_64.c
+++ b/arch/x86/kernel/vmcore_info_64.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 
-#include <linux/crash_core.h>
+#include <linux/vmcore_info.h>
 #include <linux/pgtable.h>
 
 #include <asm/setup.h>
diff --git a/drivers/firmware/qemu_fw_cfg.c b/drivers/firmware/qemu_fw_cfg.c
index 03da9a4354f886..5f43dfa22f799c 100644
--- a/drivers/firmware/qemu_fw_cfg.c
+++ b/drivers/firmware/qemu_fw_cfg.c
@@ -37,7 +37,7 @@
 #include <uapi/linux/qemu_fw_cfg.h>
 #include <linux/delay.h>
 #include <linux/crash_dump.h>
-#include <linux/crash_core.h>
+#include <linux/vmcore_info.h>
 
 MODULE_AUTHOR("Gabriel L. Somlo <somlo@cmu.edu>");
 MODULE_DESCRIPTION("QEMU fw_cfg sysfs support");
@@ -67,7 +67,7 @@ static void fw_cfg_sel_endianness(u16 key)
 		iowrite16(key, fw_cfg_reg_ctrl);
 }
 
-#ifdef CONFIG_CRASH_CORE
+#ifdef CONFIG_VMCORE_INFO
 static inline bool fw_cfg_dma_enabled(void)
 {
 	return (fw_cfg_rev & FW_CFG_VERSION_DMA) && fw_cfg_reg_dma;
@@ -156,7 +156,7 @@ static ssize_t fw_cfg_read_blob(u16 key,
 	return count;
 }
 
-#ifdef CONFIG_CRASH_CORE
+#ifdef CONFIG_VMCORE_INFO
 /* write chunk of given fw_cfg blob (caller responsible for sanity-check) */
 static ssize_t fw_cfg_write_blob(u16 key,
 				 void *buf, loff_t pos, size_t count)
@@ -195,7 +195,7 @@ static ssize_t fw_cfg_write_blob(u16 key,
 
 	return ret;
 }
-#endif /* CONFIG_CRASH_CORE */
+#endif /* CONFIG_VMCORE_INFO */
 
 /* clean up fw_cfg device i/o */
 static void fw_cfg_io_cleanup(void)
@@ -319,7 +319,7 @@ struct fw_cfg_sysfs_entry {
 	struct list_head list;
 };
 
-#ifdef CONFIG_CRASH_CORE
+#ifdef CONFIG_VMCORE_INFO
 static ssize_t fw_cfg_write_vmcoreinfo(const struct fw_cfg_file *f)
 {
 	static struct fw_cfg_vmcoreinfo *data;
@@ -343,7 +343,7 @@ static ssize_t fw_cfg_write_vmcoreinfo(const struct fw_cfg_file *f)
 	kfree(data);
 	return ret;
 }
-#endif /* CONFIG_CRASH_CORE */
+#endif /* CONFIG_VMCORE_INFO */
 
 /* get fw_cfg_sysfs_entry from kobject member */
 static inline struct fw_cfg_sysfs_entry *to_entry(struct kobject *kobj)
@@ -583,7 +583,7 @@ static int fw_cfg_register_file(const struct fw_cfg_file *f)
 	int err;
 	struct fw_cfg_sysfs_entry *entry;
 
-#ifdef CONFIG_CRASH_CORE
+#ifdef CONFIG_VMCORE_INFO
 	if (fw_cfg_dma_enabled() &&
 		strcmp(f->name, FW_CFG_VMCOREINFO_FILENAME) == 0 &&
 		!is_kdump_kernel()) {
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 32b1116ae137c6..d80a1431ef7be0 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -32,7 +32,7 @@ config PROC_FS
 config PROC_KCORE
 	bool "/proc/kcore support" if !ARM
 	depends on PROC_FS && MMU
-	select CRASH_CORE
+	select VMCORE_INFO
 	help
 	  Provides a virtual ELF core file of the live kernel.  This can
 	  be read with gdb and other ELF tools.  No modifications can be
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 6422e569b08085..8e08a9a1b7ed57 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -10,7 +10,7 @@
  *	Safe accesses to vmalloc/direct-mapped discontiguous areas, Kanoj Sarcar <kanoj@sgi.com>
  */
 
-#include <linux/crash_core.h>
+#include <linux/vmcore_info.h>
 #include <linux/mm.h>
 #include <linux/proc_fs.h>
 #include <linux/kcore.h>
diff --git a/include/linux/buildid.h b/include/linux/buildid.h
index 8a582d242f0672..20aa3c2d89f760 100644
--- a/include/linux/buildid.h
+++ b/include/linux/buildid.h
@@ -11,7 +11,7 @@ int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id,
 		   __u32 *size);
 int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size);
 
-#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_CRASH_CORE)
+#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_VMCORE_INFO)
 extern unsigned char vmlinux_build_id[BUILD_ID_SIZE_MAX];
 void init_vmlinux_build_id(void);
 #else
diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index 1fde49246fa6e3..7f19f62018ef9c 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -6,79 +6,6 @@
 #include <linux/elfcore.h>
 #include <linux/elf.h>
 
-#define CRASH_CORE_NOTE_NAME	   "CORE"
-#define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4)
-#define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(CRASH_CORE_NOTE_NAME), 4)
-#define CRASH_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4)
-
-/*
- * The per-cpu notes area is a list of notes terminated by a "NULL"
- * note header.  For kdump, the code in vmcore.c runs in the context
- * of the second kernel to combine them into one note.
- */
-#define CRASH_CORE_NOTE_BYTES	   ((CRASH_CORE_NOTE_HEAD_BYTES * 2) +	\
-				     CRASH_CORE_NOTE_NAME_BYTES +	\
-				     CRASH_CORE_NOTE_DESC_BYTES)
-
-#define VMCOREINFO_BYTES	   PAGE_SIZE
-#define VMCOREINFO_NOTE_NAME	   "VMCOREINFO"
-#define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4)
-#define VMCOREINFO_NOTE_SIZE	   ((CRASH_CORE_NOTE_HEAD_BYTES * 2) +	\
-				     VMCOREINFO_NOTE_NAME_BYTES +	\
-				     VMCOREINFO_BYTES)
-
-typedef u32 note_buf_t[CRASH_CORE_NOTE_BYTES/4];
-/* Per cpu memory for storing cpu states in case of system crash. */
-extern note_buf_t __percpu *crash_notes;
-
-void crash_update_vmcoreinfo_safecopy(void *ptr);
-void crash_save_vmcoreinfo(void);
-void arch_crash_save_vmcoreinfo(void);
-__printf(1, 2)
-void vmcoreinfo_append_str(const char *fmt, ...);
-phys_addr_t paddr_vmcoreinfo_note(void);
-
-#define VMCOREINFO_OSRELEASE(value) \
-	vmcoreinfo_append_str("OSRELEASE=%s\n", value)
-#define VMCOREINFO_BUILD_ID()						\
-	({								\
-		static_assert(sizeof(vmlinux_build_id) == 20);		\
-		vmcoreinfo_append_str("BUILD-ID=%20phN\n", vmlinux_build_id); \
-	})
-
-#define VMCOREINFO_PAGESIZE(value) \
-	vmcoreinfo_append_str("PAGESIZE=%ld\n", value)
-#define VMCOREINFO_SYMBOL(name) \
-	vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name)
-#define VMCOREINFO_SYMBOL_ARRAY(name) \
-	vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)name)
-#define VMCOREINFO_SIZE(name) \
-	vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \
-			      (unsigned long)sizeof(name))
-#define VMCOREINFO_STRUCT_SIZE(name) \
-	vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \
-			      (unsigned long)sizeof(struct name))
-#define VMCOREINFO_OFFSET(name, field) \
-	vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \
-			      (unsigned long)offsetof(struct name, field))
-#define VMCOREINFO_TYPE_OFFSET(name, field) \
-	vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \
-			      (unsigned long)offsetof(name, field))
-#define VMCOREINFO_LENGTH(name, value) \
-	vmcoreinfo_append_str("LENGTH(%s)=%lu\n", #name, (unsigned long)value)
-#define VMCOREINFO_NUMBER(name) \
-	vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name)
-#define VMCOREINFO_CONFIG(name) \
-	vmcoreinfo_append_str("CONFIG_%s=y\n", #name)
-
-extern unsigned char *vmcoreinfo_data;
-extern size_t vmcoreinfo_size;
-extern u32 *vmcoreinfo_note;
-
-Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
-			  void *data, size_t data_len);
-void final_note(Elf_Word *buf);
-
 /* Alignment required for elf header segment */
 #define ELF_CORE_HEADER_ALIGN   4096
 
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 6d79bfb52e5bf0..9c7bb8b56ed66d 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -16,6 +16,7 @@
 #if !defined(__ASSEMBLY__)
 
 #include <linux/crash_core.h>
+#include <linux/vmcore_info.h>
 #include <linux/crash_reserve.h>
 #include <asm/io.h>
 #include <linux/range.h>
diff --git a/include/linux/vmcore_info.h b/include/linux/vmcore_info.h
new file mode 100644
index 00000000000000..e1dec1a6a749dc
--- /dev/null
+++ b/include/linux/vmcore_info.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef LINUX_VMCORE_INFO_H
+#define LINUX_VMCORE_INFO_H
+
+#include <linux/linkage.h>
+#include <linux/elfcore.h>
+#include <linux/elf.h>
+
+#define CRASH_CORE_NOTE_NAME	   "CORE"
+#define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4)
+#define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(CRASH_CORE_NOTE_NAME), 4)
+#define CRASH_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4)
+
+/*
+ * The per-cpu notes area is a list of notes terminated by a "NULL"
+ * note header.  For kdump, the code in vmcore.c runs in the context
+ * of the second kernel to combine them into one note.
+ */
+#define CRASH_CORE_NOTE_BYTES	   ((CRASH_CORE_NOTE_HEAD_BYTES * 2) +	\
+				     CRASH_CORE_NOTE_NAME_BYTES +	\
+				     CRASH_CORE_NOTE_DESC_BYTES)
+
+#define VMCOREINFO_BYTES	   PAGE_SIZE
+#define VMCOREINFO_NOTE_NAME	   "VMCOREINFO"
+#define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4)
+#define VMCOREINFO_NOTE_SIZE	   ((CRASH_CORE_NOTE_HEAD_BYTES * 2) +	\
+				     VMCOREINFO_NOTE_NAME_BYTES +	\
+				     VMCOREINFO_BYTES)
+
+typedef u32 note_buf_t[CRASH_CORE_NOTE_BYTES/4];
+/* Per cpu memory for storing cpu states in case of system crash. */
+extern note_buf_t __percpu *crash_notes;
+
+void crash_update_vmcoreinfo_safecopy(void *ptr);
+void crash_save_vmcoreinfo(void);
+void arch_crash_save_vmcoreinfo(void);
+__printf(1, 2)
+void vmcoreinfo_append_str(const char *fmt, ...);
+phys_addr_t paddr_vmcoreinfo_note(void);
+
+#define VMCOREINFO_OSRELEASE(value) \
+	vmcoreinfo_append_str("OSRELEASE=%s\n", value)
+#define VMCOREINFO_BUILD_ID()						\
+	({								\
+		static_assert(sizeof(vmlinux_build_id) == 20);		\
+		vmcoreinfo_append_str("BUILD-ID=%20phN\n", vmlinux_build_id); \
+	})
+
+#define VMCOREINFO_PAGESIZE(value) \
+	vmcoreinfo_append_str("PAGESIZE=%ld\n", value)
+#define VMCOREINFO_SYMBOL(name) \
+	vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name)
+#define VMCOREINFO_SYMBOL_ARRAY(name) \
+	vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)name)
+#define VMCOREINFO_SIZE(name) \
+	vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \
+			      (unsigned long)sizeof(name))
+#define VMCOREINFO_STRUCT_SIZE(name) \
+	vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \
+			      (unsigned long)sizeof(struct name))
+#define VMCOREINFO_OFFSET(name, field) \
+	vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \
+			      (unsigned long)offsetof(struct name, field))
+#define VMCOREINFO_TYPE_OFFSET(name, field) \
+	vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \
+			      (unsigned long)offsetof(name, field))
+#define VMCOREINFO_LENGTH(name, value) \
+	vmcoreinfo_append_str("LENGTH(%s)=%lu\n", #name, (unsigned long)value)
+#define VMCOREINFO_NUMBER(name) \
+	vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name)
+#define VMCOREINFO_CONFIG(name) \
+	vmcoreinfo_append_str("CONFIG_%s=y\n", #name)
+
+extern unsigned char *vmcoreinfo_data;
+extern size_t vmcoreinfo_size;
+extern u32 *vmcoreinfo_note;
+
+Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
+			  void *data, size_t data_len);
+void final_note(Elf_Word *buf);
+#endif /* LINUX_VMCORE_INFO_H */
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
index 8b7be71edd859e..8faf27043432fe 100644
--- a/kernel/Kconfig.kexec
+++ b/kernel/Kconfig.kexec
@@ -5,11 +5,11 @@ menu "Kexec and crash features"
 config CRASH_RESERVE
 	bool
 
-config CRASH_CORE
+config VMCORE_INFO
 	bool
 
 config KEXEC_CORE
-	select CRASH_CORE
+	select VMCORE_INFO
 	select CRASH_RESERVE
 	bool
 
diff --git a/kernel/Makefile b/kernel/Makefile
index 05fa88b3ab7499..649272a1d6b9f8 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -68,9 +68,9 @@ obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
 obj-$(CONFIG_KALLSYMS_SELFTEST) += kallsyms_selftest.o
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
-obj-$(CONFIG_CRASH_CORE) += crash_core.o
+obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o
 obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o
-obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
+obj-$(CONFIG_KEXEC_CORE) += kexec_core.o crash_core.o
 obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
 obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index ae0d1ce89b46b8..2f4df1fe6f7af5 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -26,14 +26,6 @@
 /* Per cpu memory for storing cpu states in case of system crash. */
 note_buf_t __percpu *crash_notes;
 
-/* vmcoreinfo stuff */
-unsigned char *vmcoreinfo_data;
-size_t vmcoreinfo_size;
-u32 *vmcoreinfo_note;
-
-/* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */
-static unsigned char *vmcoreinfo_data_safecopy;
-
 int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
 			  void **addr, unsigned long *sz)
 {
@@ -195,204 +187,6 @@ int crash_exclude_mem_range(struct crash_mem *mem,
 	return 0;
 }
 
-Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
-			  void *data, size_t data_len)
-{
-	struct elf_note *note = (struct elf_note *)buf;
-
-	note->n_namesz = strlen(name) + 1;
-	note->n_descsz = data_len;
-	note->n_type   = type;
-	buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf_Word));
-	memcpy(buf, name, note->n_namesz);
-	buf += DIV_ROUND_UP(note->n_namesz, sizeof(Elf_Word));
-	memcpy(buf, data, data_len);
-	buf += DIV_ROUND_UP(data_len, sizeof(Elf_Word));
-
-	return buf;
-}
-
-void final_note(Elf_Word *buf)
-{
-	memset(buf, 0, sizeof(struct elf_note));
-}
-
-static void update_vmcoreinfo_note(void)
-{
-	u32 *buf = vmcoreinfo_note;
-
-	if (!vmcoreinfo_size)
-		return;
-	buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
-			      vmcoreinfo_size);
-	final_note(buf);
-}
-
-void crash_update_vmcoreinfo_safecopy(void *ptr)
-{
-	if (ptr)
-		memcpy(ptr, vmcoreinfo_data, vmcoreinfo_size);
-
-	vmcoreinfo_data_safecopy = ptr;
-}
-
-void crash_save_vmcoreinfo(void)
-{
-	if (!vmcoreinfo_note)
-		return;
-
-	/* Use the safe copy to generate vmcoreinfo note if have */
-	if (vmcoreinfo_data_safecopy)
-		vmcoreinfo_data = vmcoreinfo_data_safecopy;
-
-	vmcoreinfo_append_str("CRASHTIME=%lld\n", ktime_get_real_seconds());
-	update_vmcoreinfo_note();
-}
-
-void vmcoreinfo_append_str(const char *fmt, ...)
-{
-	va_list args;
-	char buf[0x50];
-	size_t r;
-
-	va_start(args, fmt);
-	r = vscnprintf(buf, sizeof(buf), fmt, args);
-	va_end(args);
-
-	r = min(r, (size_t)VMCOREINFO_BYTES - vmcoreinfo_size);
-
-	memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
-
-	vmcoreinfo_size += r;
-
-	WARN_ONCE(vmcoreinfo_size == VMCOREINFO_BYTES,
-		  "vmcoreinfo data exceeds allocated size, truncating");
-}
-
-/*
- * provide an empty default implementation here -- architecture
- * code may override this
- */
-void __weak arch_crash_save_vmcoreinfo(void)
-{}
-
-phys_addr_t __weak paddr_vmcoreinfo_note(void)
-{
-	return __pa(vmcoreinfo_note);
-}
-EXPORT_SYMBOL(paddr_vmcoreinfo_note);
-
-static int __init crash_save_vmcoreinfo_init(void)
-{
-	vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
-	if (!vmcoreinfo_data) {
-		pr_warn("Memory allocation for vmcoreinfo_data failed\n");
-		return -ENOMEM;
-	}
-
-	vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE,
-						GFP_KERNEL | __GFP_ZERO);
-	if (!vmcoreinfo_note) {
-		free_page((unsigned long)vmcoreinfo_data);
-		vmcoreinfo_data = NULL;
-		pr_warn("Memory allocation for vmcoreinfo_note failed\n");
-		return -ENOMEM;
-	}
-
-	VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
-	VMCOREINFO_BUILD_ID();
-	VMCOREINFO_PAGESIZE(PAGE_SIZE);
-
-	VMCOREINFO_SYMBOL(init_uts_ns);
-	VMCOREINFO_OFFSET(uts_namespace, name);
-	VMCOREINFO_SYMBOL(node_online_map);
-#ifdef CONFIG_MMU
-	VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir);
-#endif
-	VMCOREINFO_SYMBOL(_stext);
-	vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", (unsigned long) VMALLOC_START);
-
-#ifndef CONFIG_NUMA
-	VMCOREINFO_SYMBOL(mem_map);
-	VMCOREINFO_SYMBOL(contig_page_data);
-#endif
-#ifdef CONFIG_SPARSEMEM
-	VMCOREINFO_SYMBOL_ARRAY(mem_section);
-	VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
-	VMCOREINFO_STRUCT_SIZE(mem_section);
-	VMCOREINFO_OFFSET(mem_section, section_mem_map);
-	VMCOREINFO_NUMBER(SECTION_SIZE_BITS);
-	VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS);
-#endif
-	VMCOREINFO_STRUCT_SIZE(page);
-	VMCOREINFO_STRUCT_SIZE(pglist_data);
-	VMCOREINFO_STRUCT_SIZE(zone);
-	VMCOREINFO_STRUCT_SIZE(free_area);
-	VMCOREINFO_STRUCT_SIZE(list_head);
-	VMCOREINFO_SIZE(nodemask_t);
-	VMCOREINFO_OFFSET(page, flags);
-	VMCOREINFO_OFFSET(page, _refcount);
-	VMCOREINFO_OFFSET(page, mapping);
-	VMCOREINFO_OFFSET(page, lru);
-	VMCOREINFO_OFFSET(page, _mapcount);
-	VMCOREINFO_OFFSET(page, private);
-	VMCOREINFO_OFFSET(page, compound_head);
-	VMCOREINFO_OFFSET(pglist_data, node_zones);
-	VMCOREINFO_OFFSET(pglist_data, nr_zones);
-#ifdef CONFIG_FLATMEM
-	VMCOREINFO_OFFSET(pglist_data, node_mem_map);
-#endif
-	VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
-	VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
-	VMCOREINFO_OFFSET(pglist_data, node_id);
-	VMCOREINFO_OFFSET(zone, free_area);
-	VMCOREINFO_OFFSET(zone, vm_stat);
-	VMCOREINFO_OFFSET(zone, spanned_pages);
-	VMCOREINFO_OFFSET(free_area, free_list);
-	VMCOREINFO_OFFSET(list_head, next);
-	VMCOREINFO_OFFSET(list_head, prev);
-	VMCOREINFO_LENGTH(zone.free_area, NR_PAGE_ORDERS);
-	log_buf_vmcoreinfo_setup();
-	VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
-	VMCOREINFO_NUMBER(NR_FREE_PAGES);
-	VMCOREINFO_NUMBER(PG_lru);
-	VMCOREINFO_NUMBER(PG_private);
-	VMCOREINFO_NUMBER(PG_swapcache);
-	VMCOREINFO_NUMBER(PG_swapbacked);
-	VMCOREINFO_NUMBER(PG_slab);
-#ifdef CONFIG_MEMORY_FAILURE
-	VMCOREINFO_NUMBER(PG_hwpoison);
-#endif
-	VMCOREINFO_NUMBER(PG_head_mask);
-#define PAGE_BUDDY_MAPCOUNT_VALUE	(~PG_buddy)
-	VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
-#ifdef CONFIG_HUGETLB_PAGE
-	VMCOREINFO_NUMBER(PG_hugetlb);
-#define PAGE_OFFLINE_MAPCOUNT_VALUE	(~PG_offline)
-	VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE);
-#endif
-
-#ifdef CONFIG_KALLSYMS
-	VMCOREINFO_SYMBOL(kallsyms_names);
-	VMCOREINFO_SYMBOL(kallsyms_num_syms);
-	VMCOREINFO_SYMBOL(kallsyms_token_table);
-	VMCOREINFO_SYMBOL(kallsyms_token_index);
-#ifdef CONFIG_KALLSYMS_BASE_RELATIVE
-	VMCOREINFO_SYMBOL(kallsyms_offsets);
-	VMCOREINFO_SYMBOL(kallsyms_relative_base);
-#else
-	VMCOREINFO_SYMBOL(kallsyms_addresses);
-#endif /* CONFIG_KALLSYMS_BASE_RELATIVE */
-#endif /* CONFIG_KALLSYMS */
-
-	arch_crash_save_vmcoreinfo();
-	update_vmcoreinfo_note();
-
-	return 0;
-}
-
-subsys_initcall(crash_save_vmcoreinfo_init);
-
 static int __init crash_notes_memory_init(void)
 {
 	/* Allocate memory for saving cpu registers. */
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 1d4bc493b2f4b2..11526fc42bc24c 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -154,7 +154,7 @@ KERNEL_ATTR_RW(kexec_crash_size);
 
 #endif /* CONFIG_KEXEC_CORE */
 
-#ifdef CONFIG_CRASH_CORE
+#ifdef CONFIG_VMCORE_INFO
 
 static ssize_t vmcoreinfo_show(struct kobject *kobj,
 			       struct kobj_attribute *attr, char *buf)
@@ -177,7 +177,7 @@ KERNEL_ATTR_RO(crash_elfcorehdr_size);
 
 #endif
 
-#endif /* CONFIG_CRASH_CORE */
+#endif /* CONFIG_VMCORE_INFO */
 
 /* whether file capabilities are enabled */
 static ssize_t fscaps_show(struct kobject *kobj,
@@ -265,7 +265,7 @@ static struct attribute * kernel_attrs[] = {
 	&kexec_crash_loaded_attr.attr,
 	&kexec_crash_size_attr.attr,
 #endif
-#ifdef CONFIG_CRASH_CORE
+#ifdef CONFIG_VMCORE_INFO
 	&vmcoreinfo_attr.attr,
 #ifdef CONFIG_CRASH_HOTPLUG
 	&crash_elfcorehdr_size_attr.attr,
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index f2444b581e16c3..7d74b000b43a9b 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -34,7 +34,7 @@
 #include <linux/security.h>
 #include <linux/memblock.h>
 #include <linux/syscalls.h>
-#include <linux/crash_core.h>
+#include <linux/vmcore_info.h>
 #include <linux/ratelimit.h>
 #include <linux/kmsg_dump.h>
 #include <linux/syslog.h>
@@ -951,7 +951,7 @@ const struct file_operations kmsg_fops = {
 	.release = devkmsg_release,
 };
 
-#ifdef CONFIG_CRASH_CORE
+#ifdef CONFIG_VMCORE_INFO
 /*
  * This appends the listed symbols to /proc/vmcore
  *
diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c
new file mode 100644
index 00000000000000..8f48c0a42e2eed
--- /dev/null
+++ b/kernel/vmcore_info.c
@@ -0,0 +1,231 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * crash.c - kernel crash support code.
+ * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
+ */
+
+#include <linux/buildid.h>
+#include <linux/init.h>
+#include <linux/utsname.h>
+#include <linux/vmalloc.h>
+#include <linux/sizes.h>
+#include <linux/kexec.h>
+#include <linux/memory.h>
+#include <linux/cpuhotplug.h>
+#include <linux/memblock.h>
+#include <linux/kexec.h>
+#include <linux/kmemleak.h>
+
+#include <asm/page.h>
+#include <asm/sections.h>
+
+#include <crypto/sha1.h>
+
+#include "kallsyms_internal.h"
+#include "kexec_internal.h"
+
+/* vmcoreinfo stuff */
+unsigned char *vmcoreinfo_data;
+size_t vmcoreinfo_size;
+u32 *vmcoreinfo_note;
+
+/* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */
+static unsigned char *vmcoreinfo_data_safecopy;
+
+Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
+			  void *data, size_t data_len)
+{
+	struct elf_note *note = (struct elf_note *)buf;
+
+	note->n_namesz = strlen(name) + 1;
+	note->n_descsz = data_len;
+	note->n_type   = type;
+	buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf_Word));
+	memcpy(buf, name, note->n_namesz);
+	buf += DIV_ROUND_UP(note->n_namesz, sizeof(Elf_Word));
+	memcpy(buf, data, data_len);
+	buf += DIV_ROUND_UP(data_len, sizeof(Elf_Word));
+
+	return buf;
+}
+
+void final_note(Elf_Word *buf)
+{
+	memset(buf, 0, sizeof(struct elf_note));
+}
+
+static void update_vmcoreinfo_note(void)
+{
+	u32 *buf = vmcoreinfo_note;
+
+	if (!vmcoreinfo_size)
+		return;
+	buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
+			      vmcoreinfo_size);
+	final_note(buf);
+}
+
+void crash_update_vmcoreinfo_safecopy(void *ptr)
+{
+	if (ptr)
+		memcpy(ptr, vmcoreinfo_data, vmcoreinfo_size);
+
+	vmcoreinfo_data_safecopy = ptr;
+}
+
+void crash_save_vmcoreinfo(void)
+{
+	if (!vmcoreinfo_note)
+		return;
+
+	/* Use the safe copy to generate vmcoreinfo note if have */
+	if (vmcoreinfo_data_safecopy)
+		vmcoreinfo_data = vmcoreinfo_data_safecopy;
+
+	vmcoreinfo_append_str("CRASHTIME=%lld\n", ktime_get_real_seconds());
+	update_vmcoreinfo_note();
+}
+
+void vmcoreinfo_append_str(const char *fmt, ...)
+{
+	va_list args;
+	char buf[0x50];
+	size_t r;
+
+	va_start(args, fmt);
+	r = vscnprintf(buf, sizeof(buf), fmt, args);
+	va_end(args);
+
+	r = min(r, (size_t)VMCOREINFO_BYTES - vmcoreinfo_size);
+
+	memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
+
+	vmcoreinfo_size += r;
+
+	WARN_ONCE(vmcoreinfo_size == VMCOREINFO_BYTES,
+		  "vmcoreinfo data exceeds allocated size, truncating");
+}
+
+/*
+ * provide an empty default implementation here -- architecture
+ * code may override this
+ */
+void __weak arch_crash_save_vmcoreinfo(void)
+{}
+
+phys_addr_t __weak paddr_vmcoreinfo_note(void)
+{
+	return __pa(vmcoreinfo_note);
+}
+EXPORT_SYMBOL(paddr_vmcoreinfo_note);
+
+static int __init crash_save_vmcoreinfo_init(void)
+{
+	vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
+	if (!vmcoreinfo_data) {
+		pr_warn("Memory allocation for vmcoreinfo_data failed\n");
+		return -ENOMEM;
+	}
+
+	vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE,
+						GFP_KERNEL | __GFP_ZERO);
+	if (!vmcoreinfo_note) {
+		free_page((unsigned long)vmcoreinfo_data);
+		vmcoreinfo_data = NULL;
+		pr_warn("Memory allocation for vmcoreinfo_note failed\n");
+		return -ENOMEM;
+	}
+
+	VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
+	VMCOREINFO_BUILD_ID();
+	VMCOREINFO_PAGESIZE(PAGE_SIZE);
+
+	VMCOREINFO_SYMBOL(init_uts_ns);
+	VMCOREINFO_OFFSET(uts_namespace, name);
+	VMCOREINFO_SYMBOL(node_online_map);
+#ifdef CONFIG_MMU
+	VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir);
+#endif
+	VMCOREINFO_SYMBOL(_stext);
+	vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", (unsigned long) VMALLOC_START);
+
+#ifndef CONFIG_NUMA
+	VMCOREINFO_SYMBOL(mem_map);
+	VMCOREINFO_SYMBOL(contig_page_data);
+#endif
+#ifdef CONFIG_SPARSEMEM
+	VMCOREINFO_SYMBOL_ARRAY(mem_section);
+	VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
+	VMCOREINFO_STRUCT_SIZE(mem_section);
+	VMCOREINFO_OFFSET(mem_section, section_mem_map);
+	VMCOREINFO_NUMBER(SECTION_SIZE_BITS);
+	VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS);
+#endif
+	VMCOREINFO_STRUCT_SIZE(page);
+	VMCOREINFO_STRUCT_SIZE(pglist_data);
+	VMCOREINFO_STRUCT_SIZE(zone);
+	VMCOREINFO_STRUCT_SIZE(free_area);
+	VMCOREINFO_STRUCT_SIZE(list_head);
+	VMCOREINFO_SIZE(nodemask_t);
+	VMCOREINFO_OFFSET(page, flags);
+	VMCOREINFO_OFFSET(page, _refcount);
+	VMCOREINFO_OFFSET(page, mapping);
+	VMCOREINFO_OFFSET(page, lru);
+	VMCOREINFO_OFFSET(page, _mapcount);
+	VMCOREINFO_OFFSET(page, private);
+	VMCOREINFO_OFFSET(page, compound_head);
+	VMCOREINFO_OFFSET(pglist_data, node_zones);
+	VMCOREINFO_OFFSET(pglist_data, nr_zones);
+#ifdef CONFIG_FLATMEM
+	VMCOREINFO_OFFSET(pglist_data, node_mem_map);
+#endif
+	VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
+	VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
+	VMCOREINFO_OFFSET(pglist_data, node_id);
+	VMCOREINFO_OFFSET(zone, free_area);
+	VMCOREINFO_OFFSET(zone, vm_stat);
+	VMCOREINFO_OFFSET(zone, spanned_pages);
+	VMCOREINFO_OFFSET(free_area, free_list);
+	VMCOREINFO_OFFSET(list_head, next);
+	VMCOREINFO_OFFSET(list_head, prev);
+	VMCOREINFO_LENGTH(zone.free_area, NR_PAGE_ORDERS);
+	log_buf_vmcoreinfo_setup();
+	VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
+	VMCOREINFO_NUMBER(NR_FREE_PAGES);
+	VMCOREINFO_NUMBER(PG_lru);
+	VMCOREINFO_NUMBER(PG_private);
+	VMCOREINFO_NUMBER(PG_swapcache);
+	VMCOREINFO_NUMBER(PG_swapbacked);
+	VMCOREINFO_NUMBER(PG_slab);
+#ifdef CONFIG_MEMORY_FAILURE
+	VMCOREINFO_NUMBER(PG_hwpoison);
+#endif
+	VMCOREINFO_NUMBER(PG_head_mask);
+#define PAGE_BUDDY_MAPCOUNT_VALUE	(~PG_buddy)
+	VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
+#ifdef CONFIG_HUGETLB_PAGE
+	VMCOREINFO_NUMBER(PG_hugetlb);
+#define PAGE_OFFLINE_MAPCOUNT_VALUE	(~PG_offline)
+	VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE);
+#endif
+
+#ifdef CONFIG_KALLSYMS
+	VMCOREINFO_SYMBOL(kallsyms_names);
+	VMCOREINFO_SYMBOL(kallsyms_num_syms);
+	VMCOREINFO_SYMBOL(kallsyms_token_table);
+	VMCOREINFO_SYMBOL(kallsyms_token_index);
+#ifdef CONFIG_KALLSYMS_BASE_RELATIVE
+	VMCOREINFO_SYMBOL(kallsyms_offsets);
+	VMCOREINFO_SYMBOL(kallsyms_relative_base);
+#else
+	VMCOREINFO_SYMBOL(kallsyms_addresses);
+#endif /* CONFIG_KALLSYMS_BASE_RELATIVE */
+#endif /* CONFIG_KALLSYMS */
+
+	arch_crash_save_vmcoreinfo();
+	update_vmcoreinfo_note();
+
+	return 0;
+}
+
+subsys_initcall(crash_save_vmcoreinfo_init);
diff --git a/lib/buildid.c b/lib/buildid.c
index e3a7acdeef0ed4..3e6868c86b45a8 100644
--- a/lib/buildid.c
+++ b/lib/buildid.c
@@ -174,7 +174,7 @@ int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size)
 	return parse_build_id_buf(build_id, NULL, buf, buf_size);
 }
 
-#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_CRASH_CORE)
+#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_VMCORE_INFO)
 unsigned char vmlinux_build_id[BUILD_ID_SIZE_MAX] __ro_after_init;
 
 /**

From 593b34d52d3b9dd4364eefcde8f52b9f1a915792 Mon Sep 17 00:00:00 2001
From: Yang Li <yang.lee@linux.alibaba.com>
Date: Fri, 26 Jan 2024 08:57:44 +0800
Subject: [PATCH 472/707] crash: remove duplicated include in vmcore_info.c

The header files kexec.h is included twice in vmcore_info.c,
so one inclusion can be removed.

Link: https://lkml.kernel.org/r/20240126005744.16561-1-yang.lee@linux.alibaba.com
Signed-off-by: Yang Li <yang.lee@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/vmcore_info.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c
index 8f48c0a42e2eed..8f77e238a54f54 100644
--- a/kernel/vmcore_info.c
+++ b/kernel/vmcore_info.c
@@ -13,7 +13,6 @@
 #include <linux/memory.h>
 #include <linux/cpuhotplug.h>
 #include <linux/memblock.h>
-#include <linux/kexec.h>
 #include <linux/kmemleak.h>
 
 #include <asm/page.h>

From 0a7afef35691c2c1a4dd5e7609f7de67d41713e7 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Wed, 24 Jan 2024 13:12:43 +0800
Subject: [PATCH 473/707] crash: remove dependency of FA_DUMP on CRASH_DUMP

In kdump kernel, /proc/vmcore is an elf file mapping the crashed kernel's
old memory content. Its elf header is constructed in 1st kernel and passed
to kdump kernel via elfcorehdr_addr. Config CRASH_DUMP enables the code
of 1st kernel's old memory accessing in different architectures.

Currently, config FA_DUMP has dependency on CRASH_DUMP because fadump
needs access global variable 'elfcorehdr_addr' to judge if it's in
kdump kernel within function is_kdump_kernel(). In the current
kernel/crash_dump.c, variable 'elfcorehdr_addr' is defined, and function
setup_elfcorehdr() used to parse kernel parameter to fetch the passed
value of elfcorehdr_addr. Only for accessing elfcorehdr_addr, FA_DUMP
really doesn't have to depends on CRASH_DUMP.

To remove the dependency of FA_DUMP on CRASH_DUMP to avoid confusion,
rename kernel/crash_dump.c to kernel/elfcorehdr.c, and build it when
CONFIG_VMCORE_INFO is ebabled. With this, FA_DUMP doesn't need to depend
on CRASH_DUMP.

Link: https://lkml.kernel.org/r/20240124051254.67105-4-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Hari Bathini <hbathini@linux.ibm.com>
Cc: Pingfan Liu <piliu@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/powerpc/Kconfig                  | 1 -
 kernel/Makefile                       | 3 +--
 kernel/{crash_dump.c => elfcorehdr.c} | 0
 kernel/kexec_internal.h               | 2 ++
 4 files changed, 3 insertions(+), 3 deletions(-)
 rename kernel/{crash_dump.c => elfcorehdr.c} (100%)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 495d197c9b2751..e66fd9923250ea 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -692,7 +692,6 @@ config FA_DUMP
 	depends on PPC64 && (PPC_RTAS || PPC_POWERNV)
 	select VMCORE_INFO
 	select CRASH_RESERVE
-	select CRASH_DUMP
 	help
 	  A robust mechanism to get reliable kernel crash dump with
 	  assistance from firmware. This approach does not use kexec,
diff --git a/kernel/Makefile b/kernel/Makefile
index 649272a1d6b9f8..35abc65e1f1ade 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -68,7 +68,7 @@ obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
 obj-$(CONFIG_KALLSYMS_SELFTEST) += kallsyms_selftest.o
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
-obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o
+obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o elfcorehdr.o
 obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o
 obj-$(CONFIG_KEXEC_CORE) += kexec_core.o crash_core.o
 obj-$(CONFIG_KEXEC) += kexec.o
@@ -121,7 +121,6 @@ obj-$(CONFIG_PERF_EVENTS) += events/
 
 obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
 obj-$(CONFIG_PADATA) += padata.o
-obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_JUMP_LABEL) += jump_label.o
 obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
 obj-$(CONFIG_TORTURE_TEST) += torture.o
diff --git a/kernel/crash_dump.c b/kernel/elfcorehdr.c
similarity index 100%
rename from kernel/crash_dump.c
rename to kernel/elfcorehdr.c
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index 74da1409cd14b5..2595defe8c0d92 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -4,6 +4,8 @@
 
 #include <linux/kexec.h>
 
+struct kexec_segment;
+
 struct kimage *do_kimage_alloc_init(void);
 int sanity_check_segment_list(struct kimage *image);
 void kimage_free_page_list(struct list_head *list);

From a0537ef1e32e78f1e5c1d8979bc8a95410f32747 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Wed, 24 Jan 2024 13:12:44 +0800
Subject: [PATCH 474/707] crash: split crash dumping code out from kexec_core.c

Currently, KEXEC_CORE select CRASH_CORE automatically because crash codes
need be built in to avoid compiling error when building kexec code even
though the crash dumping functionality is not enabled. E.g
--------------------
CONFIG_CRASH_CORE=y
CONFIG_KEXEC_CORE=y
CONFIG_KEXEC=y
CONFIG_KEXEC_FILE=y
---------------------

After splitting out crashkernel reservation code and vmcoreinfo exporting
code, there's only crash related code left in kernel/crash_core.c. Now
move crash related codes from kexec_core.c to crash_core.c and only build it
in when CONFIG_CRASH_DUMP=y.

And also wrap up crash codes inside CONFIG_CRASH_DUMP ifdeffery scope,
or replace inappropriate CONFIG_KEXEC_CORE ifdef with CONFIG_CRASH_DUMP
ifdef in generic kernel files.

With these changes, crash_core codes are abstracted from kexec codes and
can be disabled at all if only kexec reboot feature is wanted.

Link: https://lkml.kernel.org/r/20240124051254.67105-5-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Hari Bathini <hbathini@linux.ibm.com>
Cc: Pingfan Liu <piliu@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/base/cpu.c         |   6 +-
 include/linux/crash_core.h |  61 +++++++++
 include/linux/kexec.h      |  45 +------
 init/initramfs.c           |   2 +-
 kernel/Makefile            |   3 +-
 kernel/crash_core.c        | 256 +++++++++++++++++++++++++++++++++++++
 kernel/kexec.c             |  11 +-
 kernel/kexec_core.c        | 250 ++----------------------------------
 kernel/kexec_file.c        |  13 +-
 kernel/ksysfs.c            |   4 +
 10 files changed, 359 insertions(+), 292 deletions(-)

diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 47de0f140ba65e..b621a0fc75e15a 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -144,7 +144,7 @@ static DEVICE_ATTR(release, S_IWUSR, NULL, cpu_release_store);
 #endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */
 #endif /* CONFIG_HOTPLUG_CPU */
 
-#ifdef CONFIG_KEXEC_CORE
+#ifdef CONFIG_CRASH_DUMP
 #include <linux/kexec.h>
 
 static ssize_t crash_notes_show(struct device *dev,
@@ -189,14 +189,14 @@ static const struct attribute_group crash_note_cpu_attr_group = {
 #endif
 
 static const struct attribute_group *common_cpu_attr_groups[] = {
-#ifdef CONFIG_KEXEC_CORE
+#ifdef CONFIG_CRASH_DUMP
 	&crash_note_cpu_attr_group,
 #endif
 	NULL
 };
 
 static const struct attribute_group *hotplugable_cpu_attr_groups[] = {
-#ifdef CONFIG_KEXEC_CORE
+#ifdef CONFIG_CRASH_DUMP
 	&crash_note_cpu_attr_group,
 #endif
 	NULL
diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index 7f19f62018ef9c..23270b16e1dbf3 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -6,6 +6,48 @@
 #include <linux/elfcore.h>
 #include <linux/elf.h>
 
+struct kimage;
+
+#ifdef CONFIG_CRASH_DUMP
+
+int crash_shrink_memory(unsigned long new_size);
+ssize_t crash_get_memory_size(void);
+
+#ifndef arch_kexec_protect_crashkres
+/*
+ * Protection mechanism for crashkernel reserved memory after
+ * the kdump kernel is loaded.
+ *
+ * Provide an empty default implementation here -- architecture
+ * code may override this
+ */
+static inline void arch_kexec_protect_crashkres(void) { }
+#endif
+
+#ifndef arch_kexec_unprotect_crashkres
+static inline void arch_kexec_unprotect_crashkres(void) { }
+#endif
+
+
+
+#ifndef arch_crash_handle_hotplug_event
+static inline void arch_crash_handle_hotplug_event(struct kimage *image) { }
+#endif
+
+int crash_check_update_elfcorehdr(void);
+
+#ifndef crash_hotplug_cpu_support
+static inline int crash_hotplug_cpu_support(void) { return 0; }
+#endif
+
+#ifndef crash_hotplug_memory_support
+static inline int crash_hotplug_memory_support(void) { return 0; }
+#endif
+
+#ifndef crash_get_elfcorehdr_size
+static inline unsigned int crash_get_elfcorehdr_size(void) { return 0; }
+#endif
+
 /* Alignment required for elf header segment */
 #define ELF_CORE_HEADER_ALIGN   4096
 
@@ -31,4 +73,23 @@ struct kexec_segment;
 #define KEXEC_CRASH_HP_REMOVE_MEMORY		4
 #define KEXEC_CRASH_HP_INVALID_CPU		-1U
 
+extern void __crash_kexec(struct pt_regs *regs);
+extern void crash_kexec(struct pt_regs *regs);
+int kexec_should_crash(struct task_struct *p);
+int kexec_crash_loaded(void);
+void crash_save_cpu(struct pt_regs *regs, int cpu);
+extern int kimage_crash_copy_vmcoreinfo(struct kimage *image);
+
+#else /* !CONFIG_CRASH_DUMP*/
+struct pt_regs;
+struct task_struct;
+struct kimage;
+static inline void __crash_kexec(struct pt_regs *regs) { }
+static inline void crash_kexec(struct pt_regs *regs) { }
+static inline int kexec_should_crash(struct task_struct *p) { return 0; }
+static inline int kexec_crash_loaded(void) { return 0; }
+static inline void crash_save_cpu(struct pt_regs *regs, int cpu) {};
+static inline int kimage_crash_copy_vmcoreinfo(struct kimage *image) { return 0; };
+#endif /* CONFIG_CRASH_DUMP*/
+
 #endif /* LINUX_CRASH_CORE_H */
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 9c7bb8b56ed66d..060835bb82d52f 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -15,7 +15,6 @@
 
 #if !defined(__ASSEMBLY__)
 
-#include <linux/crash_core.h>
 #include <linux/vmcore_info.h>
 #include <linux/crash_reserve.h>
 #include <asm/io.h>
@@ -33,6 +32,7 @@ extern note_buf_t __percpu *crash_notes;
 #include <linux/module.h>
 #include <linux/highmem.h>
 #include <asm/kexec.h>
+#include <linux/crash_core.h>
 
 /* Verify architecture specific macros are defined */
 
@@ -380,13 +380,6 @@ extern struct page *kimage_alloc_control_pages(struct kimage *image,
 static inline int machine_kexec_post_load(struct kimage *image) { return 0; }
 #endif
 
-extern void __crash_kexec(struct pt_regs *);
-extern void crash_kexec(struct pt_regs *);
-int kexec_should_crash(struct task_struct *);
-int kexec_crash_loaded(void);
-void crash_save_cpu(struct pt_regs *regs, int cpu);
-extern int kimage_crash_copy_vmcoreinfo(struct kimage *image);
-
 extern struct kimage *kexec_image;
 extern struct kimage *kexec_crash_image;
 
@@ -410,24 +403,6 @@ bool kexec_load_permitted(int kexec_image_type);
 /* flag to track if kexec reboot is in progress */
 extern bool kexec_in_progress;
 
-int crash_shrink_memory(unsigned long new_size);
-ssize_t crash_get_memory_size(void);
-
-#ifndef arch_kexec_protect_crashkres
-/*
- * Protection mechanism for crashkernel reserved memory after
- * the kdump kernel is loaded.
- *
- * Provide an empty default implementation here -- architecture
- * code may override this
- */
-static inline void arch_kexec_protect_crashkres(void) { }
-#endif
-
-#ifndef arch_kexec_unprotect_crashkres
-static inline void arch_kexec_unprotect_crashkres(void) { }
-#endif
-
 #ifndef page_to_boot_pfn
 static inline unsigned long page_to_boot_pfn(struct page *page)
 {
@@ -484,24 +459,6 @@ static inline int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, g
 static inline void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) { }
 #endif
 
-#ifndef arch_crash_handle_hotplug_event
-static inline void arch_crash_handle_hotplug_event(struct kimage *image) { }
-#endif
-
-int crash_check_update_elfcorehdr(void);
-
-#ifndef crash_hotplug_cpu_support
-static inline int crash_hotplug_cpu_support(void) { return 0; }
-#endif
-
-#ifndef crash_hotplug_memory_support
-static inline int crash_hotplug_memory_support(void) { return 0; }
-#endif
-
-#ifndef crash_get_elfcorehdr_size
-static inline unsigned int crash_get_elfcorehdr_size(void) { return 0; }
-#endif
-
 extern bool kexec_file_dbg_print;
 
 #define kexec_dprintk(fmt, ...)					\
diff --git a/init/initramfs.c b/init/initramfs.c
index 76deb48c38cb16..6f095f54eec976 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -642,7 +642,7 @@ void __weak __init free_initrd_mem(unsigned long start, unsigned long end)
 			"initrd");
 }
 
-#ifdef CONFIG_KEXEC_CORE
+#ifdef CONFIG_CRASH_RESERVE
 static bool __init kexec_free_initrd(void)
 {
 	unsigned long crashk_start = (unsigned long)__va(crashk_res.start);
diff --git a/kernel/Makefile b/kernel/Makefile
index 35abc65e1f1ade..3c13240dfc9f09 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -70,7 +70,8 @@ obj-$(CONFIG_KALLSYMS_SELFTEST) += kallsyms_selftest.o
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o elfcorehdr.o
 obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o
-obj-$(CONFIG_KEXEC_CORE) += kexec_core.o crash_core.o
+obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
+obj-$(CONFIG_CRASH_DUMP) += crash_core.o
 obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
 obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 2f4df1fe6f7af5..78b5dc7cee3ab7 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -11,9 +11,14 @@
 #include <linux/sizes.h>
 #include <linux/kexec.h>
 #include <linux/memory.h>
+#include <linux/mm.h>
 #include <linux/cpuhotplug.h>
 #include <linux/memblock.h>
 #include <linux/kmemleak.h>
+#include <linux/crash_core.h>
+#include <linux/reboot.h>
+#include <linux/btf.h>
+#include <linux/objtool.h>
 
 #include <asm/page.h>
 #include <asm/sections.h>
@@ -26,6 +31,131 @@
 /* Per cpu memory for storing cpu states in case of system crash. */
 note_buf_t __percpu *crash_notes;
 
+#ifdef CONFIG_CRASH_DUMP
+
+int kimage_crash_copy_vmcoreinfo(struct kimage *image)
+{
+	struct page *vmcoreinfo_page;
+	void *safecopy;
+
+	if (!IS_ENABLED(CONFIG_CRASH_DUMP))
+		return 0;
+	if (image->type != KEXEC_TYPE_CRASH)
+		return 0;
+
+	/*
+	 * For kdump, allocate one vmcoreinfo safe copy from the
+	 * crash memory. as we have arch_kexec_protect_crashkres()
+	 * after kexec syscall, we naturally protect it from write
+	 * (even read) access under kernel direct mapping. But on
+	 * the other hand, we still need to operate it when crash
+	 * happens to generate vmcoreinfo note, hereby we rely on
+	 * vmap for this purpose.
+	 */
+	vmcoreinfo_page = kimage_alloc_control_pages(image, 0);
+	if (!vmcoreinfo_page) {
+		pr_warn("Could not allocate vmcoreinfo buffer\n");
+		return -ENOMEM;
+	}
+	safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL);
+	if (!safecopy) {
+		pr_warn("Could not vmap vmcoreinfo buffer\n");
+		return -ENOMEM;
+	}
+
+	image->vmcoreinfo_data_copy = safecopy;
+	crash_update_vmcoreinfo_safecopy(safecopy);
+
+	return 0;
+}
+
+
+
+int kexec_should_crash(struct task_struct *p)
+{
+	/*
+	 * If crash_kexec_post_notifiers is enabled, don't run
+	 * crash_kexec() here yet, which must be run after panic
+	 * notifiers in panic().
+	 */
+	if (crash_kexec_post_notifiers)
+		return 0;
+	/*
+	 * There are 4 panic() calls in make_task_dead() path, each of which
+	 * corresponds to each of these 4 conditions.
+	 */
+	if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
+		return 1;
+	return 0;
+}
+
+int kexec_crash_loaded(void)
+{
+	return !!kexec_crash_image;
+}
+EXPORT_SYMBOL_GPL(kexec_crash_loaded);
+
+/*
+ * No panic_cpu check version of crash_kexec().  This function is called
+ * only when panic_cpu holds the current CPU number; this is the only CPU
+ * which processes crash_kexec routines.
+ */
+void __noclone __crash_kexec(struct pt_regs *regs)
+{
+	/* Take the kexec_lock here to prevent sys_kexec_load
+	 * running on one cpu from replacing the crash kernel
+	 * we are using after a panic on a different cpu.
+	 *
+	 * If the crash kernel was not located in a fixed area
+	 * of memory the xchg(&kexec_crash_image) would be
+	 * sufficient.  But since I reuse the memory...
+	 */
+	if (kexec_trylock()) {
+		if (kexec_crash_image) {
+			struct pt_regs fixed_regs;
+
+			crash_setup_regs(&fixed_regs, regs);
+			crash_save_vmcoreinfo();
+			machine_crash_shutdown(&fixed_regs);
+			machine_kexec(kexec_crash_image);
+		}
+		kexec_unlock();
+	}
+}
+STACK_FRAME_NON_STANDARD(__crash_kexec);
+
+__bpf_kfunc void crash_kexec(struct pt_regs *regs)
+{
+	int old_cpu, this_cpu;
+
+	/*
+	 * Only one CPU is allowed to execute the crash_kexec() code as with
+	 * panic().  Otherwise parallel calls of panic() and crash_kexec()
+	 * may stop each other.  To exclude them, we use panic_cpu here too.
+	 */
+	old_cpu = PANIC_CPU_INVALID;
+	this_cpu = raw_smp_processor_id();
+
+	if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) {
+		/* This is the 1st CPU which comes here, so go ahead. */
+		__crash_kexec(regs);
+
+		/*
+		 * Reset panic_cpu to allow another panic()/crash_kexec()
+		 * call.
+		 */
+		atomic_set(&panic_cpu, PANIC_CPU_INVALID);
+	}
+}
+
+static inline resource_size_t crash_resource_size(const struct resource *res)
+{
+	return !res->end ? 0 : resource_size(res);
+}
+
+
+
+
 int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
 			  void **addr, unsigned long *sz)
 {
@@ -187,6 +317,130 @@ int crash_exclude_mem_range(struct crash_mem *mem,
 	return 0;
 }
 
+ssize_t crash_get_memory_size(void)
+{
+	ssize_t size = 0;
+
+	if (!kexec_trylock())
+		return -EBUSY;
+
+	size += crash_resource_size(&crashk_res);
+	size += crash_resource_size(&crashk_low_res);
+
+	kexec_unlock();
+	return size;
+}
+
+static int __crash_shrink_memory(struct resource *old_res,
+				 unsigned long new_size)
+{
+	struct resource *ram_res;
+
+	ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
+	if (!ram_res)
+		return -ENOMEM;
+
+	ram_res->start = old_res->start + new_size;
+	ram_res->end   = old_res->end;
+	ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
+	ram_res->name  = "System RAM";
+
+	if (!new_size) {
+		release_resource(old_res);
+		old_res->start = 0;
+		old_res->end   = 0;
+	} else {
+		crashk_res.end = ram_res->start - 1;
+	}
+
+	crash_free_reserved_phys_range(ram_res->start, ram_res->end);
+	insert_resource(&iomem_resource, ram_res);
+
+	return 0;
+}
+
+int crash_shrink_memory(unsigned long new_size)
+{
+	int ret = 0;
+	unsigned long old_size, low_size;
+
+	if (!kexec_trylock())
+		return -EBUSY;
+
+	if (kexec_crash_image) {
+		ret = -ENOENT;
+		goto unlock;
+	}
+
+	low_size = crash_resource_size(&crashk_low_res);
+	old_size = crash_resource_size(&crashk_res) + low_size;
+	new_size = roundup(new_size, KEXEC_CRASH_MEM_ALIGN);
+	if (new_size >= old_size) {
+		ret = (new_size == old_size) ? 0 : -EINVAL;
+		goto unlock;
+	}
+
+	/*
+	 * (low_size > new_size) implies that low_size is greater than zero.
+	 * This also means that if low_size is zero, the else branch is taken.
+	 *
+	 * If low_size is greater than 0, (low_size > new_size) indicates that
+	 * crashk_low_res also needs to be shrunken. Otherwise, only crashk_res
+	 * needs to be shrunken.
+	 */
+	if (low_size > new_size) {
+		ret = __crash_shrink_memory(&crashk_res, 0);
+		if (ret)
+			goto unlock;
+
+		ret = __crash_shrink_memory(&crashk_low_res, new_size);
+	} else {
+		ret = __crash_shrink_memory(&crashk_res, new_size - low_size);
+	}
+
+	/* Swap crashk_res and crashk_low_res if needed */
+	if (!crashk_res.end && crashk_low_res.end) {
+		crashk_res.start = crashk_low_res.start;
+		crashk_res.end   = crashk_low_res.end;
+		release_resource(&crashk_low_res);
+		crashk_low_res.start = 0;
+		crashk_low_res.end   = 0;
+		insert_resource(&iomem_resource, &crashk_res);
+	}
+
+unlock:
+	kexec_unlock();
+	return ret;
+}
+
+void crash_save_cpu(struct pt_regs *regs, int cpu)
+{
+	struct elf_prstatus prstatus;
+	u32 *buf;
+
+	if ((cpu < 0) || (cpu >= nr_cpu_ids))
+		return;
+
+	/* Using ELF notes here is opportunistic.
+	 * I need a well defined structure format
+	 * for the data I pass, and I need tags
+	 * on the data to indicate what information I have
+	 * squirrelled away.  ELF notes happen to provide
+	 * all of that, so there is no need to invent something new.
+	 */
+	buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
+	if (!buf)
+		return;
+	memset(&prstatus, 0, sizeof(prstatus));
+	prstatus.common.pr_pid = current->pid;
+	elf_core_copy_regs(&prstatus.pr_reg, regs);
+	buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
+			      &prstatus, sizeof(prstatus));
+	final_note(buf);
+}
+
+
+
 static int __init crash_notes_memory_init(void)
 {
 	/* Allocate memory for saving cpu registers. */
@@ -220,6 +474,8 @@ static int __init crash_notes_memory_init(void)
 }
 subsys_initcall(crash_notes_memory_init);
 
+#endif /*CONFIG_CRASH_DUMP*/
+
 #ifdef CONFIG_CRASH_HOTPLUG
 #undef pr_fmt
 #define pr_fmt(fmt) "crash hp: " fmt
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 8f35a5a42af852..bab542fc1463d2 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -28,12 +28,14 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
 	struct kimage *image;
 	bool kexec_on_panic = flags & KEXEC_ON_CRASH;
 
+#ifdef CONFIG_CRASH_DUMP
 	if (kexec_on_panic) {
 		/* Verify we have a valid entry point */
 		if ((entry < phys_to_boot_phys(crashk_res.start)) ||
 		    (entry > phys_to_boot_phys(crashk_res.end)))
 			return -EADDRNOTAVAIL;
 	}
+#endif
 
 	/* Allocate and initialize a controlling structure */
 	image = do_kimage_alloc_init();
@@ -44,11 +46,13 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
 	image->nr_segments = nr_segments;
 	memcpy(image->segment, segments, nr_segments * sizeof(*segments));
 
+#ifdef CONFIG_CRASH_DUMP
 	if (kexec_on_panic) {
 		/* Enable special crash kernel control page alloc policy. */
 		image->control_page = crashk_res.start;
 		image->type = KEXEC_TYPE_CRASH;
 	}
+#endif
 
 	ret = sanity_check_segment_list(image);
 	if (ret)
@@ -99,13 +103,14 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
 	if (!kexec_trylock())
 		return -EBUSY;
 
+#ifdef CONFIG_CRASH_DUMP
 	if (flags & KEXEC_ON_CRASH) {
 		dest_image = &kexec_crash_image;
 		if (kexec_crash_image)
 			arch_kexec_unprotect_crashkres();
-	} else {
+	} else
+#endif
 		dest_image = &kexec_image;
-	}
 
 	if (nr_segments == 0) {
 		/* Uninstall image */
@@ -162,8 +167,10 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
 	image = xchg(dest_image, image);
 
 out:
+#ifdef CONFIG_CRASH_DUMP
 	if ((flags & KEXEC_ON_CRASH) && kexec_crash_image)
 		arch_kexec_protect_crashkres();
+#endif
 
 	kimage_free(image);
 out_unlock:
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index d08fc7b5db9790..ce3429e7972ccd 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -54,30 +54,6 @@ bool kexec_in_progress = false;
 
 bool kexec_file_dbg_print;
 
-int kexec_should_crash(struct task_struct *p)
-{
-	/*
-	 * If crash_kexec_post_notifiers is enabled, don't run
-	 * crash_kexec() here yet, which must be run after panic
-	 * notifiers in panic().
-	 */
-	if (crash_kexec_post_notifiers)
-		return 0;
-	/*
-	 * There are 4 panic() calls in make_task_dead() path, each of which
-	 * corresponds to each of these 4 conditions.
-	 */
-	if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
-		return 1;
-	return 0;
-}
-
-int kexec_crash_loaded(void)
-{
-	return !!kexec_crash_image;
-}
-EXPORT_SYMBOL_GPL(kexec_crash_loaded);
-
 /*
  * When kexec transitions to the new kernel there is a one-to-one
  * mapping between physical and virtual addresses.  On processors
@@ -209,6 +185,7 @@ int sanity_check_segment_list(struct kimage *image)
 	if (total_pages > nr_pages / 2)
 		return -EINVAL;
 
+#ifdef CONFIG_CRASH_DUMP
 	/*
 	 * Verify we have good destination addresses.  Normally
 	 * the caller is responsible for making certain we don't
@@ -231,6 +208,7 @@ int sanity_check_segment_list(struct kimage *image)
 				return -EADDRNOTAVAIL;
 		}
 	}
+#endif
 
 	return 0;
 }
@@ -403,6 +381,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
 	return pages;
 }
 
+#ifdef CONFIG_CRASH_DUMP
 static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
 						      unsigned int order)
 {
@@ -468,6 +447,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
 
 	return pages;
 }
+#endif
 
 
 struct page *kimage_alloc_control_pages(struct kimage *image,
@@ -479,48 +459,16 @@ struct page *kimage_alloc_control_pages(struct kimage *image,
 	case KEXEC_TYPE_DEFAULT:
 		pages = kimage_alloc_normal_control_pages(image, order);
 		break;
+#ifdef CONFIG_CRASH_DUMP
 	case KEXEC_TYPE_CRASH:
 		pages = kimage_alloc_crash_control_pages(image, order);
 		break;
+#endif
 	}
 
 	return pages;
 }
 
-int kimage_crash_copy_vmcoreinfo(struct kimage *image)
-{
-	struct page *vmcoreinfo_page;
-	void *safecopy;
-
-	if (image->type != KEXEC_TYPE_CRASH)
-		return 0;
-
-	/*
-	 * For kdump, allocate one vmcoreinfo safe copy from the
-	 * crash memory. as we have arch_kexec_protect_crashkres()
-	 * after kexec syscall, we naturally protect it from write
-	 * (even read) access under kernel direct mapping. But on
-	 * the other hand, we still need to operate it when crash
-	 * happens to generate vmcoreinfo note, hereby we rely on
-	 * vmap for this purpose.
-	 */
-	vmcoreinfo_page = kimage_alloc_control_pages(image, 0);
-	if (!vmcoreinfo_page) {
-		pr_warn("Could not allocate vmcoreinfo buffer\n");
-		return -ENOMEM;
-	}
-	safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL);
-	if (!safecopy) {
-		pr_warn("Could not vmap vmcoreinfo buffer\n");
-		return -ENOMEM;
-	}
-
-	image->vmcoreinfo_data_copy = safecopy;
-	crash_update_vmcoreinfo_safecopy(safecopy);
-
-	return 0;
-}
-
 static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
 {
 	if (*image->entry != 0)
@@ -603,10 +551,12 @@ void kimage_free(struct kimage *image)
 	if (!image)
 		return;
 
+#ifdef CONFIG_CRASH_DUMP
 	if (image->vmcoreinfo_data_copy) {
 		crash_update_vmcoreinfo_safecopy(NULL);
 		vunmap(image->vmcoreinfo_data_copy);
 	}
+#endif
 
 	kimage_free_extra_pages(image);
 	for_each_kimage_entry(image, ptr, entry) {
@@ -824,6 +774,7 @@ static int kimage_load_normal_segment(struct kimage *image,
 	return result;
 }
 
+#ifdef CONFIG_CRASH_DUMP
 static int kimage_load_crash_segment(struct kimage *image,
 					struct kexec_segment *segment)
 {
@@ -891,6 +842,7 @@ static int kimage_load_crash_segment(struct kimage *image,
 out:
 	return result;
 }
+#endif
 
 int kimage_load_segment(struct kimage *image,
 				struct kexec_segment *segment)
@@ -901,9 +853,11 @@ int kimage_load_segment(struct kimage *image,
 	case KEXEC_TYPE_DEFAULT:
 		result = kimage_load_normal_segment(image, segment);
 		break;
+#ifdef CONFIG_CRASH_DUMP
 	case KEXEC_TYPE_CRASH:
 		result = kimage_load_crash_segment(image, segment);
 		break;
+#endif
 	}
 
 	return result;
@@ -1027,186 +981,6 @@ bool kexec_load_permitted(int kexec_image_type)
 	return true;
 }
 
-/*
- * No panic_cpu check version of crash_kexec().  This function is called
- * only when panic_cpu holds the current CPU number; this is the only CPU
- * which processes crash_kexec routines.
- */
-void __noclone __crash_kexec(struct pt_regs *regs)
-{
-	/* Take the kexec_lock here to prevent sys_kexec_load
-	 * running on one cpu from replacing the crash kernel
-	 * we are using after a panic on a different cpu.
-	 *
-	 * If the crash kernel was not located in a fixed area
-	 * of memory the xchg(&kexec_crash_image) would be
-	 * sufficient.  But since I reuse the memory...
-	 */
-	if (kexec_trylock()) {
-		if (kexec_crash_image) {
-			struct pt_regs fixed_regs;
-
-			crash_setup_regs(&fixed_regs, regs);
-			crash_save_vmcoreinfo();
-			machine_crash_shutdown(&fixed_regs);
-			machine_kexec(kexec_crash_image);
-		}
-		kexec_unlock();
-	}
-}
-STACK_FRAME_NON_STANDARD(__crash_kexec);
-
-__bpf_kfunc void crash_kexec(struct pt_regs *regs)
-{
-	int old_cpu, this_cpu;
-
-	/*
-	 * Only one CPU is allowed to execute the crash_kexec() code as with
-	 * panic().  Otherwise parallel calls of panic() and crash_kexec()
-	 * may stop each other.  To exclude them, we use panic_cpu here too.
-	 */
-	old_cpu = PANIC_CPU_INVALID;
-	this_cpu = raw_smp_processor_id();
-
-	if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) {
-		/* This is the 1st CPU which comes here, so go ahead. */
-		__crash_kexec(regs);
-
-		/*
-		 * Reset panic_cpu to allow another panic()/crash_kexec()
-		 * call.
-		 */
-		atomic_set(&panic_cpu, PANIC_CPU_INVALID);
-	}
-}
-
-static inline resource_size_t crash_resource_size(const struct resource *res)
-{
-	return !res->end ? 0 : resource_size(res);
-}
-
-ssize_t crash_get_memory_size(void)
-{
-	ssize_t size = 0;
-
-	if (!kexec_trylock())
-		return -EBUSY;
-
-	size += crash_resource_size(&crashk_res);
-	size += crash_resource_size(&crashk_low_res);
-
-	kexec_unlock();
-	return size;
-}
-
-static int __crash_shrink_memory(struct resource *old_res,
-				 unsigned long new_size)
-{
-	struct resource *ram_res;
-
-	ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
-	if (!ram_res)
-		return -ENOMEM;
-
-	ram_res->start = old_res->start + new_size;
-	ram_res->end   = old_res->end;
-	ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
-	ram_res->name  = "System RAM";
-
-	if (!new_size) {
-		release_resource(old_res);
-		old_res->start = 0;
-		old_res->end   = 0;
-	} else {
-		crashk_res.end = ram_res->start - 1;
-	}
-
-	crash_free_reserved_phys_range(ram_res->start, ram_res->end);
-	insert_resource(&iomem_resource, ram_res);
-
-	return 0;
-}
-
-int crash_shrink_memory(unsigned long new_size)
-{
-	int ret = 0;
-	unsigned long old_size, low_size;
-
-	if (!kexec_trylock())
-		return -EBUSY;
-
-	if (kexec_crash_image) {
-		ret = -ENOENT;
-		goto unlock;
-	}
-
-	low_size = crash_resource_size(&crashk_low_res);
-	old_size = crash_resource_size(&crashk_res) + low_size;
-	new_size = roundup(new_size, KEXEC_CRASH_MEM_ALIGN);
-	if (new_size >= old_size) {
-		ret = (new_size == old_size) ? 0 : -EINVAL;
-		goto unlock;
-	}
-
-	/*
-	 * (low_size > new_size) implies that low_size is greater than zero.
-	 * This also means that if low_size is zero, the else branch is taken.
-	 *
-	 * If low_size is greater than 0, (low_size > new_size) indicates that
-	 * crashk_low_res also needs to be shrunken. Otherwise, only crashk_res
-	 * needs to be shrunken.
-	 */
-	if (low_size > new_size) {
-		ret = __crash_shrink_memory(&crashk_res, 0);
-		if (ret)
-			goto unlock;
-
-		ret = __crash_shrink_memory(&crashk_low_res, new_size);
-	} else {
-		ret = __crash_shrink_memory(&crashk_res, new_size - low_size);
-	}
-
-	/* Swap crashk_res and crashk_low_res if needed */
-	if (!crashk_res.end && crashk_low_res.end) {
-		crashk_res.start = crashk_low_res.start;
-		crashk_res.end   = crashk_low_res.end;
-		release_resource(&crashk_low_res);
-		crashk_low_res.start = 0;
-		crashk_low_res.end   = 0;
-		insert_resource(&iomem_resource, &crashk_res);
-	}
-
-unlock:
-	kexec_unlock();
-	return ret;
-}
-
-void crash_save_cpu(struct pt_regs *regs, int cpu)
-{
-	struct elf_prstatus prstatus;
-	u32 *buf;
-
-	if ((cpu < 0) || (cpu >= nr_cpu_ids))
-		return;
-
-	/* Using ELF notes here is opportunistic.
-	 * I need a well defined structure format
-	 * for the data I pass, and I need tags
-	 * on the data to indicate what information I have
-	 * squirrelled away.  ELF notes happen to provide
-	 * all of that, so there is no need to invent something new.
-	 */
-	buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
-	if (!buf)
-		return;
-	memset(&prstatus, 0, sizeof(prstatus));
-	prstatus.common.pr_pid = current->pid;
-	elf_core_copy_regs(&prstatus.pr_reg, regs);
-	buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
-			      &prstatus, sizeof(prstatus));
-	final_note(buf);
-}
-
 /*
  * Move into place and start executing a preloaded standalone
  * executable.  If nothing was preloaded return an error.
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index bef2f6f2571b42..ce7ce2ae27cdfe 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -285,11 +285,13 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
 	kexec_file_dbg_print = !!(flags & KEXEC_FILE_DEBUG);
 	image->file_mode = 1;
 
+#ifdef CONFIG_CRASH_DUMP
 	if (kexec_on_panic) {
 		/* Enable special crash kernel control page alloc policy. */
 		image->control_page = crashk_res.start;
 		image->type = KEXEC_TYPE_CRASH;
 	}
+#endif
 
 	ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
 					   cmdline_ptr, cmdline_len, flags);
@@ -349,13 +351,14 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
 	if (!kexec_trylock())
 		return -EBUSY;
 
+#ifdef CONFIG_CRASH_DUMP
 	if (image_type == KEXEC_TYPE_CRASH) {
 		dest_image = &kexec_crash_image;
 		if (kexec_crash_image)
 			arch_kexec_unprotect_crashkres();
-	} else {
+	} else
+#endif
 		dest_image = &kexec_image;
-	}
 
 	if (flags & KEXEC_FILE_UNLOAD)
 		goto exchange;
@@ -419,8 +422,10 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
 exchange:
 	image = xchg(dest_image, image);
 out:
+#ifdef CONFIG_CRASH_DUMP
 	if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image)
 		arch_kexec_protect_crashkres();
+#endif
 
 	kexec_unlock();
 	kimage_free(image);
@@ -595,12 +600,14 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf,
 static int kexec_walk_resources(struct kexec_buf *kbuf,
 				int (*func)(struct resource *, void *))
 {
+#ifdef CONFIG_CRASH_DUMP
 	if (kbuf->image->type == KEXEC_TYPE_CRASH)
 		return walk_iomem_res_desc(crashk_res.desc,
 					   IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
 					   crashk_res.start, crashk_res.end,
 					   kbuf, func);
-	else if (kbuf->top_down)
+#endif
+	if (kbuf->top_down)
 		return walk_system_ram_res_rev(0, ULONG_MAX, kbuf, func);
 	else
 		return walk_system_ram_res(0, ULONG_MAX, kbuf, func);
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 11526fc42bc24c..fe7a517fc4abbf 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -120,6 +120,7 @@ static ssize_t kexec_loaded_show(struct kobject *kobj,
 }
 KERNEL_ATTR_RO(kexec_loaded);
 
+#ifdef CONFIG_CRASH_DUMP
 static ssize_t kexec_crash_loaded_show(struct kobject *kobj,
 				       struct kobj_attribute *attr, char *buf)
 {
@@ -152,6 +153,7 @@ static ssize_t kexec_crash_size_store(struct kobject *kobj,
 }
 KERNEL_ATTR_RW(kexec_crash_size);
 
+#endif /* CONFIG_CRASH_DUMP*/
 #endif /* CONFIG_KEXEC_CORE */
 
 #ifdef CONFIG_VMCORE_INFO
@@ -262,9 +264,11 @@ static struct attribute * kernel_attrs[] = {
 #endif
 #ifdef CONFIG_KEXEC_CORE
 	&kexec_loaded_attr.attr,
+#ifdef CONFIG_CRASH_DUMP
 	&kexec_crash_loaded_attr.attr,
 	&kexec_crash_size_attr.attr,
 #endif
+#endif
 #ifdef CONFIG_VMCORE_INFO
 	&vmcoreinfo_attr.attr,
 #ifdef CONFIG_CRASH_HOTPLUG

From 945e488db1666b3a765c7ad65dee061a3d2099e1 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Wed, 24 Jan 2024 13:12:45 +0800
Subject: [PATCH 475/707] crash: clean up kdump related config items

By splitting CRASH_RESERVE and VMCORE_INFO out from CRASH_CORE, cleaning
up the dependency of FA_DMUMP on CRASH_DUMP, and moving crash codes from
kexec_core.c to crash_core.c, now we can rearrange CRASH_DUMP to
depend on KEXEC_CORE, and make CRASH_DUMP select CRASH_RESERVE and
VMCORE_INFO.

KEXEC_CORE won't select CRASH_RESERVE and VMCORE_INFO any more because
KEXEC_CORE enables codes which allocate control pages, copy
kexec/kdump segments, and prepare for switching. These codes are shared
by both kexec reboot and crash dumping.

Doing this makes codes and the corresponding config items more
logical (the right item depends on or is selected by the left item).

PROC_KCORE -----------> VMCORE_INFO

           |----------> VMCORE_INFO
FA_DUMP----|
           |----------> CRASH_RESERVE

                                                ---->VMCORE_INFO
                                               /
                                               |---->CRASH_RESERVE
KEXEC      --|                                /|
             |--> KEXEC_CORE--> CRASH_DUMP-->/-|---->PROC_VMCORE
KEXEC_FILE --|                               \ |
                                               \---->CRASH_HOTPLUG

KEXEC      --|
             |--> KEXEC_CORE--> kexec reboot
KEXEC_FILE --|

Link: https://lkml.kernel.org/r/20240124051254.67105-6-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Hari Bathini <hbathini@linux.ibm.com>
Cc: Pingfan Liu <piliu@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/Kconfig.kexec | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
index 8faf27043432fe..6c34e63c88ff4c 100644
--- a/kernel/Kconfig.kexec
+++ b/kernel/Kconfig.kexec
@@ -9,8 +9,6 @@ config VMCORE_INFO
 	bool
 
 config KEXEC_CORE
-	select VMCORE_INFO
-	select CRASH_RESERVE
 	bool
 
 config KEXEC_ELF
@@ -99,8 +97,11 @@ config KEXEC_JUMP
 
 config CRASH_DUMP
 	bool "kernel crash dumps"
+	default y
 	depends on ARCH_SUPPORTS_CRASH_DUMP
-	select KEXEC_CORE
+	depends on KEXEC_CORE
+	select VMCORE_INFO
+	select CRASH_RESERVE
 	help
 	  Generate crash dump after being started by kexec.
 	  This should be normally only set in special crash dump kernels

From 0d59e53ed6c3b33199d7a9f916278806e8b06661 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Wed, 24 Jan 2024 13:12:46 +0800
Subject: [PATCH 476/707] x86, crash: wrap crash dumping code into crash
 related ifdefs

Now crash codes under kernel/ folder has been split out from kexec
code, crash dumping can be separated from kexec reboot in config
items on x86 with some adjustments.

Here, also change some ifdefs or IS_ENABLED() check to more appropriate
ones, e,g
 - #ifdef CONFIG_KEXEC_CORE -> #ifdef CONFIG_CRASH_DUMP
 - (!IS_ENABLED(CONFIG_KEXEC_CORE)) - > (!IS_ENABLED(CONFIG_CRASH_RESERVE))

Link: https://lkml.kernel.org/r/20240124051254.67105-7-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Hari Bathini <hbathini@linux.ibm.com>
Cc: Pingfan Liu <piliu@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/x86/kernel/Makefile           | 4 ++--
 arch/x86/kernel/cpu/mshyperv.c     | 4 ++++
 arch/x86/kernel/kexec-bzimage64.c  | 4 ++++
 arch/x86/kernel/kvm.c              | 4 ++--
 arch/x86/kernel/machine_kexec_64.c | 3 +++
 arch/x86/kernel/reboot.c           | 2 +-
 arch/x86/kernel/setup.c            | 2 +-
 arch/x86/kernel/smp.c              | 2 +-
 arch/x86/xen/enlighten_hvm.c       | 4 ++++
 9 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 913d4022131eba..3668b1edef2d28 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -100,9 +100,9 @@ obj-$(CONFIG_TRACING)		+= trace.o
 obj-$(CONFIG_RETHOOK)		+= rethook.o
 obj-$(CONFIG_VMCORE_INFO)	+= vmcore_info_$(BITS).o
 obj-$(CONFIG_KEXEC_CORE)	+= machine_kexec_$(BITS).o
-obj-$(CONFIG_KEXEC_CORE)	+= relocate_kernel_$(BITS).o crash.o
+obj-$(CONFIG_KEXEC_CORE)	+= relocate_kernel_$(BITS).o
 obj-$(CONFIG_KEXEC_FILE)	+= kexec-bzimage64.o
-obj-$(CONFIG_CRASH_DUMP)	+= crash_dump_$(BITS).o
+obj-$(CONFIG_CRASH_DUMP)	+= crash_dump_$(BITS).o crash.o
 obj-y				+= kprobes/
 obj-$(CONFIG_MODULES)		+= module.o
 obj-$(CONFIG_X86_32)		+= doublefault_32.o
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 01fa06dd06b66c..f8163a59026ba5 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -210,6 +210,7 @@ static void hv_machine_shutdown(void)
 		hyperv_cleanup();
 }
 
+#ifdef CONFIG_CRASH_DUMP
 static void hv_machine_crash_shutdown(struct pt_regs *regs)
 {
 	if (hv_crash_handler)
@@ -221,6 +222,7 @@ static void hv_machine_crash_shutdown(struct pt_regs *regs)
 	/* Disable the hypercall page when there is only 1 active CPU. */
 	hyperv_cleanup();
 }
+#endif
 #endif /* CONFIG_KEXEC_CORE */
 #endif /* CONFIG_HYPERV */
 
@@ -497,7 +499,9 @@ static void __init ms_hyperv_init_platform(void)
 
 #if IS_ENABLED(CONFIG_HYPERV) && defined(CONFIG_KEXEC_CORE)
 	machine_ops.shutdown = hv_machine_shutdown;
+#ifdef CONFIG_CRASH_DUMP
 	machine_ops.crash_shutdown = hv_machine_crash_shutdown;
+#endif
 #endif
 	if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) {
 		/*
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 2a422e00ed4b42..b55737b83a841a 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -263,11 +263,13 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params,
 	memset(&params->hd0_info, 0, sizeof(params->hd0_info));
 	memset(&params->hd1_info, 0, sizeof(params->hd1_info));
 
+#ifdef CONFIG_CRASH_DUMP
 	if (image->type == KEXEC_TYPE_CRASH) {
 		ret = crash_setup_memmap_entries(image, params);
 		if (ret)
 			return ret;
 	} else
+#endif
 		setup_e820_entries(params);
 
 	nr_e820_entries = params->e820_entries;
@@ -433,12 +435,14 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
 		return ERR_PTR(-EINVAL);
 	}
 
+#ifdef CONFIG_CRASH_DUMP
 	/* Allocate and load backup region */
 	if (image->type == KEXEC_TYPE_CRASH) {
 		ret = crash_load_segments(image);
 		if (ret)
 			return ERR_PTR(ret);
 	}
+#endif
 
 	/*
 	 * Load purgatory. For 64bit entry point, purgatory  code can be
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index dfe9945b9becee..acfc2d3183bce6 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -769,7 +769,7 @@ static struct notifier_block kvm_pv_reboot_nb = {
  * won't be valid. In cases like kexec, in which you install a new kernel, this
  * means a random memory location will be kept being written.
  */
-#ifdef CONFIG_KEXEC_CORE
+#ifdef CONFIG_CRASH_DUMP
 static void kvm_crash_shutdown(struct pt_regs *regs)
 {
 	kvm_guest_cpu_offline(true);
@@ -852,7 +852,7 @@ static void __init kvm_guest_init(void)
 	kvm_guest_cpu_init();
 #endif
 
-#ifdef CONFIG_KEXEC_CORE
+#ifdef CONFIG_CRASH_DUMP
 	machine_ops.crash_shutdown = kvm_crash_shutdown;
 #endif
 
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index bc0a5348b4a627..b180d8e497c317 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -508,6 +508,8 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image)
 }
 #endif /* CONFIG_KEXEC_FILE */
 
+#ifdef CONFIG_CRASH_DUMP
+
 static int
 kexec_mark_range(unsigned long start, unsigned long end, bool protect)
 {
@@ -552,6 +554,7 @@ void arch_kexec_unprotect_crashkres(void)
 {
 	kexec_mark_crashkres(false);
 }
+#endif
 
 /*
  * During a traditional boot under SME, SME will encrypt the kernel,
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 830425e6d38e2f..1287b0d5962f7f 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -796,7 +796,7 @@ struct machine_ops machine_ops __ro_after_init = {
 	.emergency_restart = native_machine_emergency_restart,
 	.restart = native_machine_restart,
 	.halt = native_machine_halt,
-#ifdef CONFIG_KEXEC_CORE
+#ifdef CONFIG_CRASH_DUMP
 	.crash_shutdown = native_machine_crash_shutdown,
 #endif
 };
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 84201071dfacd1..899d839a2954a7 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -471,7 +471,7 @@ static void __init arch_reserve_crashkernel(void)
 	bool high = false;
 	int ret;
 
-	if (!IS_ENABLED(CONFIG_KEXEC_CORE))
+	if (!IS_ENABLED(CONFIG_CRASH_RESERVE))
 		return;
 
 	ret = parse_crashkernel(cmdline, memblock_phys_mem_size(),
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 96a771f9f930a6..52c3823b721191 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -282,7 +282,7 @@ struct smp_ops smp_ops = {
 	.smp_cpus_done		= native_smp_cpus_done,
 
 	.stop_other_cpus	= native_stop_other_cpus,
-#if defined(CONFIG_KEXEC_CORE)
+#if defined(CONFIG_CRASH_DUMP)
 	.crash_stop_other_cpus	= kdump_nmi_shootdown_cpus,
 #endif
 	.smp_send_reschedule	= native_smp_send_reschedule,
diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
index 3f8c34707c5001..09e3db7ff99066 100644
--- a/arch/x86/xen/enlighten_hvm.c
+++ b/arch/x86/xen/enlighten_hvm.c
@@ -149,12 +149,14 @@ static void xen_hvm_shutdown(void)
 		xen_reboot(SHUTDOWN_soft_reset);
 }
 
+#ifdef CONFIG_CRASH_DUMP
 static void xen_hvm_crash_shutdown(struct pt_regs *regs)
 {
 	native_machine_crash_shutdown(regs);
 	xen_reboot(SHUTDOWN_soft_reset);
 }
 #endif
+#endif
 
 static int xen_cpu_up_prepare_hvm(unsigned int cpu)
 {
@@ -236,8 +238,10 @@ static void __init xen_hvm_guest_init(void)
 
 #ifdef CONFIG_KEXEC_CORE
 	machine_ops.shutdown = xen_hvm_shutdown;
+#ifdef CONFIG_CRASH_DUMP
 	machine_ops.crash_shutdown = xen_hvm_crash_shutdown;
 #endif
+#endif
 }
 
 static __init int xen_parse_nopv(char *arg)

From 674314edd59dba87224bf477afd901abf3c6d441 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Wed, 24 Jan 2024 13:12:47 +0800
Subject: [PATCH 477/707] arm64, crash: wrap crash dumping code into crash
 related ifdefs

Now crash codes under kernel/ folder has been split out from kexec
code, crash dumping can be separated from kexec reboot in config
items on arm64 with some adjustments.

Here wrap up crash dumping codes with CONFIG_CRASH_DUMP ifdeffery.

Link: https://lkml.kernel.org/r/20240124051254.67105-8-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Hari Bathini <hbathini@linux.ibm.com>
Cc: Pingfan Liu <piliu@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm64/include/asm/kexec.h         |  2 +-
 arch/arm64/kernel/machine_kexec.c      |  2 +-
 arch/arm64/kernel/machine_kexec_file.c | 10 ++++++++--
 arch/arm64/mm/init.c                   |  2 +-
 4 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h
index 9ac9572a3bbee2..4d9cc7a76d9ca1 100644
--- a/arch/arm64/include/asm/kexec.h
+++ b/arch/arm64/include/asm/kexec.h
@@ -80,7 +80,7 @@ static inline void crash_setup_regs(struct pt_regs *newregs,
 	}
 }
 
-#if defined(CONFIG_KEXEC_CORE) && defined(CONFIG_HIBERNATION)
+#if defined(CONFIG_CRASH_DUMP) && defined(CONFIG_HIBERNATION)
 extern bool crash_is_nosave(unsigned long pfn);
 extern void crash_prepare_suspend(void);
 extern void crash_post_resume(void);
diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
index b38aae5b488d07..82e2203d86a31f 100644
--- a/arch/arm64/kernel/machine_kexec.c
+++ b/arch/arm64/kernel/machine_kexec.c
@@ -255,7 +255,7 @@ void machine_crash_shutdown(struct pt_regs *regs)
 	pr_info("Starting crashdump kernel...\n");
 }
 
-#ifdef CONFIG_HIBERNATION
+#if defined(CONFIG_CRASH_DUMP) && defined(CONFIG_HIBERNATION)
 /*
  * To preserve the crash dump kernel image, the relevant memory segments
  * should be mapped again around the hibernation.
diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c
index 0e017358f4ba64..af1ca875c52ce2 100644
--- a/arch/arm64/kernel/machine_kexec_file.c
+++ b/arch/arm64/kernel/machine_kexec_file.c
@@ -39,6 +39,7 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image)
 	return kexec_image_post_load_cleanup_default(image);
 }
 
+#ifdef CONFIG_CRASH_DUMP
 static int prepare_elf_headers(void **addr, unsigned long *sz)
 {
 	struct crash_mem *cmem;
@@ -80,6 +81,7 @@ static int prepare_elf_headers(void **addr, unsigned long *sz)
 	kfree(cmem);
 	return ret;
 }
+#endif
 
 /*
  * Tries to add the initrd and DTB to the image. If it is not possible to find
@@ -93,8 +95,8 @@ int load_other_segments(struct kimage *image,
 			char *cmdline)
 {
 	struct kexec_buf kbuf;
-	void *headers, *dtb = NULL;
-	unsigned long headers_sz, initrd_load_addr = 0, dtb_len,
+	void *dtb = NULL;
+	unsigned long initrd_load_addr = 0, dtb_len,
 		      orig_segments = image->nr_segments;
 	int ret = 0;
 
@@ -102,7 +104,10 @@ int load_other_segments(struct kimage *image,
 	/* not allocate anything below the kernel */
 	kbuf.buf_min = kernel_load_addr + kernel_size;
 
+#ifdef CONFIG_CRASH_DUMP
 	/* load elf core header */
+	void *headers;
+	unsigned long headers_sz;
 	if (image->type == KEXEC_TYPE_CRASH) {
 		ret = prepare_elf_headers(&headers, &headers_sz);
 		if (ret) {
@@ -130,6 +135,7 @@ int load_other_segments(struct kimage *image,
 		kexec_dprintk("Loaded elf core header at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
 			      image->elf_load_addr, kbuf.bufsz, kbuf.memsz);
 	}
+#endif
 
 	/* load initrd */
 	if (initrd) {
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 74c1db8ce271d8..c1f6213e77f328 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -100,7 +100,7 @@ static void __init arch_reserve_crashkernel(void)
 	bool high = false;
 	int ret;
 
-	if (!IS_ENABLED(CONFIG_KEXEC_CORE))
+	if (!IS_ENABLED(CONFIG_CRASH_RESERVE))
 		return;
 
 	ret = parse_crashkernel(cmdline, memblock_phys_mem_size(),

From ed4ab93b8dca4d638afd18dab2a1d3d930f1c882 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Wed, 24 Jan 2024 13:12:48 +0800
Subject: [PATCH 478/707] ppc, crash: enforce KEXEC and KEXEC_FILE to select
 CRASH_DUMP

In PowerPC, the crash dumping and kexec reboot share code in
arch_kexec_locate_mem_hole(), in which struct crash_mem is used.

Here enfoce enforce KEXEC and KEXEC_FILE to select CRASH_DUMP for now.

Link: https://lkml.kernel.org/r/20240124051254.67105-9-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Hari Bathini <hbathini@linux.ibm.com>
Cc: Pingfan Liu <piliu@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/powerpc/Kconfig | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index e66fd9923250ea..31f013e636e397 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -608,6 +608,10 @@ config PPC64_SUPPORTS_MEMORY_FAILURE
 config ARCH_SUPPORTS_KEXEC
 	def_bool PPC_BOOK3S || PPC_E500 || (44x && !SMP)
 
+config ARCH_SELECTS_KEXEC
+	def_bool y
+	select CRASH_DUMP
+
 config ARCH_SUPPORTS_KEXEC_FILE
 	def_bool PPC64
 
@@ -618,6 +622,7 @@ config ARCH_SELECTS_KEXEC_FILE
 	def_bool y
 	depends on KEXEC_FILE
 	select KEXEC_ELF
+	select CRASH_DUMP
 	select HAVE_IMA_KEXEC if IMA
 
 config PPC64_BIG_ENDIAN_ELF_ABI_V2

From 34847c970d7feb3460c577e2bc9b5d54dca8c53b Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Thu, 25 Jan 2024 22:29:07 +0800
Subject: [PATCH 479/707] 
 ppc-crash-enforce-kexec-and-kexec_file-to-select-crash_dump-fix

I reproduced the failure with allnoconfig on ppc, and found below change
can fix it too. And the change makes ARCH_SELECTS_KEXEC consistent with
ARCH_SELECTS_KEXEC_FILE on the dependency. What do you think?

Link: https://lkml.kernel.org/r/ZbJwMyCpz4HDySoo@MiWiFi-R3L-srv
Signed-off-by: Baoquan He <bhe@redhat.com>
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/powerpc/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 31f013e636e397..79f98cd5f2c907 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -610,6 +610,7 @@ config ARCH_SUPPORTS_KEXEC
 
 config ARCH_SELECTS_KEXEC
 	def_bool y
+	depends on KEXEC
 	select CRASH_DUMP
 
 config ARCH_SUPPORTS_KEXEC_FILE

From 1819823eed348ec15d25f82ff2afeeaf73d15430 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Wed, 24 Jan 2024 13:12:49 +0800
Subject: [PATCH 480/707] s390, crash: wrap crash dumping code into crash
 related ifdefs

Now crash codes under kernel/ folder has been split out from kexec
code, crash dumping can be separated from kexec reboot in config
items on s390 with some adjustments.

Here wrap up crash dumping codes with CONFIG_CRASH_DUMP ifdeffery.

Link: https://lkml.kernel.org/r/20240124051254.67105-10-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Hari Bathini <hbathini@linux.ibm.com>
Cc: Pingfan Liu <piliu@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/s390/kernel/kexec_elf.c          |  2 ++
 arch/s390/kernel/kexec_image.c        |  2 ++
 arch/s390/kernel/machine_kexec_file.c | 10 ++++++++++
 3 files changed, 14 insertions(+)

diff --git a/arch/s390/kernel/kexec_elf.c b/arch/s390/kernel/kexec_elf.c
index 9da6fa30c44749..4d364de4379921 100644
--- a/arch/s390/kernel/kexec_elf.c
+++ b/arch/s390/kernel/kexec_elf.c
@@ -40,8 +40,10 @@ static int kexec_file_add_kernel_elf(struct kimage *image,
 		buf.bufsz = phdr->p_filesz;
 
 		buf.mem = ALIGN(phdr->p_paddr, phdr->p_align);
+#ifdef CONFIG_CRASH_DUMP
 		if (image->type == KEXEC_TYPE_CRASH)
 			buf.mem += crashk_res.start;
+#endif
 		buf.memsz = phdr->p_memsz;
 		data->memsz = ALIGN(data->memsz, phdr->p_align) + buf.memsz;
 
diff --git a/arch/s390/kernel/kexec_image.c b/arch/s390/kernel/kexec_image.c
index af23eff5774dba..a32ce8bea745cf 100644
--- a/arch/s390/kernel/kexec_image.c
+++ b/arch/s390/kernel/kexec_image.c
@@ -24,8 +24,10 @@ static int kexec_file_add_kernel_image(struct kimage *image,
 	buf.bufsz = image->kernel_buf_len;
 
 	buf.mem = 0;
+#ifdef CONFIG_CRASH_DUMP
 	if (image->type == KEXEC_TYPE_CRASH)
 		buf.mem += crashk_res.start;
+#endif
 	buf.memsz = buf.bufsz;
 
 	data->kernel_buf = image->kernel_buf;
diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c
index 8d207b82d9fedd..c2bac14dd668ae 100644
--- a/arch/s390/kernel/machine_kexec_file.c
+++ b/arch/s390/kernel/machine_kexec_file.c
@@ -105,6 +105,7 @@ static int kexec_file_update_purgatory(struct kimage *image,
 	if (ret)
 		return ret;
 
+#ifdef CONFIG_CRASH_DUMP
 	if (image->type == KEXEC_TYPE_CRASH) {
 		u64 crash_size;
 
@@ -121,6 +122,7 @@ static int kexec_file_update_purgatory(struct kimage *image,
 						     sizeof(crash_size),
 						     false);
 	}
+#endif
 	return ret;
 }
 
@@ -134,8 +136,10 @@ static int kexec_file_add_purgatory(struct kimage *image,
 
 	data->memsz = ALIGN(data->memsz, PAGE_SIZE);
 	buf.mem = data->memsz;
+#ifdef CONFIG_CRASH_DUMP
 	if (image->type == KEXEC_TYPE_CRASH)
 		buf.mem += crashk_res.start;
+#endif
 
 	ret = kexec_load_purgatory(image, &buf);
 	if (ret)
@@ -158,8 +162,10 @@ static int kexec_file_add_initrd(struct kimage *image,
 
 	data->memsz = ALIGN(data->memsz, PAGE_SIZE);
 	buf.mem = data->memsz;
+#ifdef CONFIG_CRASH_DUMP
 	if (image->type == KEXEC_TYPE_CRASH)
 		buf.mem += crashk_res.start;
+#endif
 	buf.memsz = buf.bufsz;
 
 	data->parm->initrd_start = data->memsz;
@@ -223,8 +229,10 @@ static int kexec_file_add_ipl_report(struct kimage *image,
 		data->kernel_buf + offsetof(struct lowcore, ipl_parmblock_ptr);
 	*lc_ipl_parmblock_ptr = (__u32)buf.mem;
 
+#ifdef CONFIG_CRASH_DUMP
 	if (image->type == KEXEC_TYPE_CRASH)
 		buf.mem += crashk_res.start;
+#endif
 
 	ret = kexec_add_buffer(&buf);
 out:
@@ -268,10 +276,12 @@ void *kexec_file_add_components(struct kimage *image,
 	memcpy(data.parm->command_line, image->cmdline_buf,
 	       image->cmdline_buf_len);
 
+#ifdef CONFIG_CRASH_DUMP
 	if (image->type == KEXEC_TYPE_CRASH) {
 		data.parm->oldmem_base = crashk_res.start;
 		data.parm->oldmem_size = crashk_res.end - crashk_res.start + 1;
 	}
+#endif
 
 	if (image->initrd_buf) {
 		ret = kexec_file_add_initrd(image, &data);

From b4a07e34e15bf9b796a270932902f671cf0b4507 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Wed, 24 Jan 2024 13:12:50 +0800
Subject: [PATCH 481/707] sh, crash: wrap crash dumping code into crash related
 ifdefs

Now crash codes under kernel/ folder has been split out from kexec
code, crash dumping can be separated from kexec reboot in config
items on SuperH with some adjustments.

Wrap up crash dumping codes with CONFIG_CRASH_DUMP ifdeffery, and use
IS_ENABLED(CONFIG_CRASH_RESERVE) check to decide if compiling in the
crashkernel reservation code.

Link: https://lkml.kernel.org/r/20240124051254.67105-11-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Hari Bathini <hbathini@linux.ibm.com>
Cc: Pingfan Liu <piliu@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/sh/kernel/machine_kexec.c | 3 +++
 arch/sh/kernel/setup.c         | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/sh/kernel/machine_kexec.c b/arch/sh/kernel/machine_kexec.c
index fa3a7b36190a2a..8daa8a6e6fa683 100644
--- a/arch/sh/kernel/machine_kexec.c
+++ b/arch/sh/kernel/machine_kexec.c
@@ -153,6 +153,9 @@ void __init reserve_crashkernel(void)
 	unsigned long long crash_size, crash_base;
 	int ret;
 
+	if (!IS_ENABLED(CONFIG_CRASH_RESERVE))
+		return;
+
 	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
 			&crash_size, &crash_base, NULL, NULL);
 	if (ret == 0 && crash_size > 0) {
diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c
index d3175f09b3aad9..620e5cf8ae1e74 100644
--- a/arch/sh/kernel/setup.c
+++ b/arch/sh/kernel/setup.c
@@ -220,7 +220,7 @@ void __init __add_active_range(unsigned int nid, unsigned long start_pfn,
 	request_resource(res, &code_resource);
 	request_resource(res, &data_resource);
 	request_resource(res, &bss_resource);
-#ifdef CONFIG_KEXEC_CORE
+#ifdef CONFIG_CRASH_RESERVE
 	request_resource(res, &crashk_res);
 #endif
 

From cda487c0a74a9160c1477e14d857b339c79732c5 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Wed, 24 Jan 2024 13:12:51 +0800
Subject: [PATCH 482/707] mips, crash: wrap crash dumping code into crash
 related ifdefs

Now crash codes under kernel/ folder has been split out from kexec
code, crash dumping can be separated from kexec reboot in config
items on mips with some adjustments.

Here use IS_ENABLED(CONFIG_CRASH_RESERVE) check to decide if compiling
in the crashkernel reservation code.

Link: https://lkml.kernel.org/r/20240124051254.67105-12-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Hari Bathini <hbathini@linux.ibm.com>
Cc: Pingfan Liu <piliu@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/mips/kernel/setup.c | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c
index 9c30de15159761..12a1a4ffb60211 100644
--- a/arch/mips/kernel/setup.c
+++ b/arch/mips/kernel/setup.c
@@ -442,8 +442,6 @@ static void __init mips_reserve_vmcore(void)
 #endif
 }
 
-#ifdef CONFIG_KEXEC
-
 /* 64M alignment for crash kernel regions */
 #define CRASH_ALIGN	SZ_64M
 #define CRASH_ADDR_MAX	SZ_512M
@@ -454,6 +452,9 @@ static void __init mips_parse_crashkernel(void)
 	unsigned long long crash_size, crash_base;
 	int ret;
 
+	if (!IS_ENABLED(CONFIG_CRASH_RESERVE))
+		return;
+
 	total_mem = memblock_phys_mem_size();
 	ret = parse_crashkernel(boot_command_line, total_mem,
 				&crash_size, &crash_base,
@@ -489,6 +490,9 @@ static void __init request_crashkernel(struct resource *res)
 {
 	int ret;
 
+	if (!IS_ENABLED(CONFIG_CRASH_RESERVE))
+		return;
+
 	if (crashk_res.start == crashk_res.end)
 		return;
 
@@ -498,15 +502,6 @@ static void __init request_crashkernel(struct resource *res)
 			(unsigned long)(resource_size(&crashk_res) >> 20),
 			(unsigned long)(crashk_res.start  >> 20));
 }
-#else /* !defined(CONFIG_KEXEC)		*/
-static void __init mips_parse_crashkernel(void)
-{
-}
-
-static void __init request_crashkernel(struct resource *res)
-{
-}
-#endif /* !defined(CONFIG_KEXEC)  */
 
 static void __init check_kernel_sections_mem(void)
 {

From 37281a1a4916e709401e728b5e7291d56b28e1fe Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Wed, 24 Jan 2024 13:12:52 +0800
Subject: [PATCH 483/707] riscv, crash: wrap crash dumping code into crash
 related ifdefs

Now crash codes under kernel/ folder has been split out from kexec
code, crash dumping can be separated from kexec reboot in config
items on risc-v with some adjustments.

Here wrap up crash dumping codes with CONFIG_CRASH_DUMP ifdeffery, and
use IS_ENABLED(CONFIG_CRASH_RESERVE) check to decide if compiling
in the crashkernel reservation code.

Link: https://lkml.kernel.org/r/20240124051254.67105-13-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Hari Bathini <hbathini@linux.ibm.com>
Cc: Pingfan Liu <piliu@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/riscv/kernel/elf_kexec.c | 9 +++++++--
 arch/riscv/mm/init.c          | 2 +-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/arch/riscv/kernel/elf_kexec.c b/arch/riscv/kernel/elf_kexec.c
index 5bd1ec3341fe9c..54260c16f9912a 100644
--- a/arch/riscv/kernel/elf_kexec.c
+++ b/arch/riscv/kernel/elf_kexec.c
@@ -117,6 +117,7 @@ static int elf_find_pbase(struct kimage *image, unsigned long kernel_len,
 	return ret;
 }
 
+#ifdef CONFIG_CRASH_DUMP
 static int get_nr_ram_ranges_callback(struct resource *res, void *arg)
 {
 	unsigned int *nr_ranges = arg;
@@ -189,6 +190,7 @@ static char *setup_kdump_cmdline(struct kimage *image, char *cmdline,
 	cmdline_ptr[COMMAND_LINE_SIZE - 1] = '\0';
 	return cmdline_ptr;
 }
+#endif
 
 static void *elf_kexec_load(struct kimage *image, char *kernel_buf,
 			    unsigned long kernel_len, char *initrd,
@@ -196,12 +198,11 @@ static void *elf_kexec_load(struct kimage *image, char *kernel_buf,
 			    unsigned long cmdline_len)
 {
 	int ret;
+	void *fdt;
 	unsigned long old_kernel_pbase = ULONG_MAX;
 	unsigned long new_kernel_pbase = 0UL;
 	unsigned long initrd_pbase = 0UL;
-	unsigned long headers_sz;
 	unsigned long kernel_start;
-	void *fdt, *headers;
 	struct elfhdr ehdr;
 	struct kexec_buf kbuf;
 	struct kexec_elf_info elf_info;
@@ -227,8 +228,11 @@ static void *elf_kexec_load(struct kimage *image, char *kernel_buf,
 	kbuf.buf_min = new_kernel_pbase + kernel_len;
 	kbuf.buf_max = ULONG_MAX;
 
+#ifdef CONFIG_CRASH_DUMP
 	/* Add elfcorehdr */
 	if (image->type == KEXEC_TYPE_CRASH) {
+		void *headers;
+		unsigned long headers_sz;
 		ret = prepare_elf_headers(&headers, &headers_sz);
 		if (ret) {
 			pr_err("Preparing elf core header failed\n");
@@ -264,6 +268,7 @@ static void *elf_kexec_load(struct kimage *image, char *kernel_buf,
 		}
 		cmdline = modified_cmdline;
 	}
+#endif
 
 #ifdef CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY
 	/* Add purgatory to the image */
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 32cad6a65ccd23..245919dda91043 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -1358,7 +1358,7 @@ static void __init arch_reserve_crashkernel(void)
 	bool high = false;
 	int ret;
 
-	if (!IS_ENABLED(CONFIG_KEXEC_CORE))
+	if (!IS_ENABLED(CONFIG_CRASH_RESERVE))
 		return;
 
 	ret = parse_crashkernel(cmdline, memblock_phys_mem_size(),

From 41c5e0bec007b5910043563ca49d66f7a188d464 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Wed, 24 Jan 2024 13:12:53 +0800
Subject: [PATCH 484/707] arm, crash: wrap crash dumping code into crash
 related ifdefs

Now crash codes under kernel/ folder has been split out from kexec
code, crash dumping can be separated from kexec reboot in config
items on arm with some adjustments.

Here use CONFIG_CRASH_RESERVE ifdef to replace CONFIG_KEXEC ifdef.

Link: https://lkml.kernel.org/r/20240124051254.67105-14-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Hari Bathini <hbathini@linux.ibm.com>
Cc: Pingfan Liu <piliu@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm/kernel/setup.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index ff2299ce1ad7a3..7b33b157fca0dc 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -979,7 +979,7 @@ static int __init init_machine_late(void)
 }
 late_initcall(init_machine_late);
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_CRASH_RESERVE
 /*
  * The crash region must be aligned to 128MB to avoid
  * zImage relocating below the reserved region.
@@ -1066,7 +1066,7 @@ static void __init reserve_crashkernel(void)
 }
 #else
 static inline void reserve_crashkernel(void) {}
-#endif /* CONFIG_KEXEC */
+#endif /* CONFIG_CRASH_RESERVE*/
 
 void __init hyp_mode_check(void)
 {

From 57d6736e7a6eb47939977444770f74408a4f5732 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Wed, 24 Jan 2024 13:12:54 +0800
Subject: [PATCH 485/707] loongarch, crash: wrap crash dumping code into crash
 related ifdefs

Now crash codes under kernel/ folder has been split out from kexec
code, crash dumping can be separated from kexec reboot in config
items on loongarch with some adjustments.

Here use IS_ENABLED(CONFIG_CRASH_RESERVE) check to decide if compiling
in the crashkernel reservation code.

Link: https://lkml.kernel.org/r/20240124051254.67105-15-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Hari Bathini <hbathini@linux.ibm.com>
Cc: Pingfan Liu <piliu@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/loongarch/kernel/setup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c
index edf2bba8013067..57d37dd9f964d3 100644
--- a/arch/loongarch/kernel/setup.c
+++ b/arch/loongarch/kernel/setup.c
@@ -260,7 +260,7 @@ static void __init arch_reserve_crashkernel(void)
 	char *cmdline = boot_command_line;
 	bool high = false;
 
-	if (!IS_ENABLED(CONFIG_KEXEC_CORE))
+	if (!IS_ENABLED(CONFIG_CRASH_RESERVE))
 		return;
 
 	ret = parse_crashkernel(cmdline, memblock_phys_mem_size(),

From f73c7a5997426280402956c330928eef970a6571 Mon Sep 17 00:00:00 2001
From: Chengming Zhou <zhouchengming@bytedance.com>
Date: Fri, 19 Jan 2024 11:22:22 +0000
Subject: [PATCH 486/707] mm/zswap: make sure each swapfile always have zswap
 rb-tree

Patch series "mm/zswap: optimize the scalability of zswap rb-tree", v2.

When testing the zswap performance by using kernel build -j32 in a tmpfs
directory, I found the scalability of zswap rb-tree is not good, which is
protected by the only spinlock.  That would cause heavy lock contention if
multiple tasks zswap_store/load concurrently.

So a simple solution is to split the only one zswap rb-tree into multiple
rb-trees, each corresponds to SWAP_ADDRESS_SPACE_PAGES (64M).  This idea
is from the commit 4b3ef9daa4fc ("mm/swap: split swap cache into 64MB
trunks").

Although this method can't solve the spinlock contention completely, it
can mitigate much of that contention.  Below is the results of kernel
build in tmpfs with zswap shrinker enabled:

     linux-next  zswap-lock-optimize
real 1m9.181s    1m3.820s
user 17m44.036s  17m40.100s
sys  7m37.297s   4m54.622s

So there are clearly improvements.  And it's complementary with the
ongoing zswap xarray conversion by Chris.  Anyway, I think we can also
merge this first, it's complementary IMHO.  So I just refresh and resend
this for further discussion.


This patch (of 2):

Not all zswap interfaces can handle the absence of the zswap rb-tree,
actually only zswap_store() has handled it for now.

To make things simple, we make sure each swapfile always have the zswap
rb-tree prepared before being enabled and used.  The preparation is
unlikely to fail in practice, this patch just make it explicit.

Link: https://lkml.kernel.org/r/20240117-b4-zswap-lock-optimize-v2-0-b5cc55479090@bytedance.com
Link: https://lkml.kernel.org/r/20240117-b4-zswap-lock-optimize-v2-1-b5cc55479090@bytedance.com
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Acked-by: Nhat Pham <nphamcs@gmail.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Yosry Ahmed <yosryahmed@google.com>
Cc: Chris Li <chriscli@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/zswap.h |  7 +++++--
 mm/swapfile.c         | 10 +++++++---
 mm/zswap.c            |  7 ++++---
 3 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/include/linux/zswap.h b/include/linux/zswap.h
index 0b709f5bc65fac..eca388229d9a76 100644
--- a/include/linux/zswap.h
+++ b/include/linux/zswap.h
@@ -30,7 +30,7 @@ struct zswap_lruvec_state {
 bool zswap_store(struct folio *folio);
 bool zswap_load(struct folio *folio);
 void zswap_invalidate(int type, pgoff_t offset);
-void zswap_swapon(int type);
+int zswap_swapon(int type);
 void zswap_swapoff(int type);
 void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg);
 void zswap_lruvec_state_init(struct lruvec *lruvec);
@@ -51,7 +51,10 @@ static inline bool zswap_load(struct folio *folio)
 }
 
 static inline void zswap_invalidate(int type, pgoff_t offset) {}
-static inline void zswap_swapon(int type) {}
+static inline int zswap_swapon(int type)
+{
+	return 0;
+}
 static inline void zswap_swapoff(int type) {}
 static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {}
 static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 556ff7347d5f04..726a2c4d185217 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2348,8 +2348,6 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
 				unsigned char *swap_map,
 				struct swap_cluster_info *cluster_info)
 {
-	zswap_swapon(p->type);
-
 	spin_lock(&swap_lock);
 	spin_lock(&p->lock);
 	setup_swap_info(p, prio, swap_map, cluster_info);
@@ -3167,6 +3165,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 	if (error)
 		goto bad_swap_unlock_inode;
 
+	error = zswap_swapon(p->type);
+	if (error)
+		goto free_swap_address_space;
+
 	/*
 	 * Flush any pending IO and dirty mappings before we start using this
 	 * swap device.
@@ -3175,7 +3177,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 	error = inode_drain_writes(inode);
 	if (error) {
 		inode->i_flags &= ~S_SWAPFILE;
-		goto free_swap_address_space;
+		goto free_swap_zswap;
 	}
 
 	mutex_lock(&swapon_mutex);
@@ -3199,6 +3201,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 
 	error = 0;
 	goto out;
+free_swap_zswap:
+	zswap_swapoff(p->type);
 free_swap_address_space:
 	exit_swap_address_space(p->type);
 bad_swap_unlock_inode:
diff --git a/mm/zswap.c b/mm/zswap.c
index e7b38aefb9afe7..3901a2445b1578 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1519,7 +1519,7 @@ bool zswap_store(struct folio *folio)
 	if (folio_test_large(folio))
 		return false;
 
-	if (!zswap_enabled || !tree)
+	if (!zswap_enabled)
 		return false;
 
 	/*
@@ -1772,19 +1772,20 @@ void zswap_invalidate(int type, pgoff_t offset)
 	spin_unlock(&tree->lock);
 }
 
-void zswap_swapon(int type)
+int zswap_swapon(int type)
 {
 	struct zswap_tree *tree;
 
 	tree = kzalloc(sizeof(*tree), GFP_KERNEL);
 	if (!tree) {
 		pr_err("alloc failed, zswap disabled for swap type %d\n", type);
-		return;
+		return -ENOMEM;
 	}
 
 	tree->rbroot = RB_ROOT;
 	spin_lock_init(&tree->lock);
 	zswap_trees[type] = tree;
+	return 0;
 }
 
 void zswap_swapoff(int type)

From 4d6262db26280c70b122008f5da894342786ca14 Mon Sep 17 00:00:00 2001
From: Chengming Zhou <zhouchengming@bytedance.com>
Date: Fri, 19 Jan 2024 11:22:23 +0000
Subject: [PATCH 487/707] mm/zswap: split zswap rb-tree

Each swapfile has one rb-tree to search the mapping of swp_entry_t to
zswap_entry, that use a spinlock to protect, which can cause heavy lock
contention if multiple tasks zswap_store/load concurrently.

Optimize the scalability problem by splitting the zswap rb-tree into
multiple rb-trees, each corresponds to SWAP_ADDRESS_SPACE_PAGES (64M),
just like we did in the swap cache address_space splitting.

Although this method can't solve the spinlock contention completely, it
can mitigate much of that contention.  Below is the results of kernel
build in tmpfs with zswap shrinker enabled:

     linux-next  zswap-lock-optimize
real 1m9.181s    1m3.820s
user 17m44.036s  17m40.100s
sys  7m37.297s   4m54.622s

So there are clearly improvements.

Link: https://lkml.kernel.org/r/20240117-b4-zswap-lock-optimize-v2-2-b5cc55479090@bytedance.com
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Nhat Pham <nphamcs@gmail.com>
Acked-by: Yosry Ahmed <yosryahmed@google.com>
Cc: Chris Li <chriscli@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/zswap.h |  4 +--
 mm/swapfile.c         |  2 +-
 mm/zswap.c            | 71 ++++++++++++++++++++++++++++---------------
 3 files changed, 49 insertions(+), 28 deletions(-)

diff --git a/include/linux/zswap.h b/include/linux/zswap.h
index eca388229d9a76..91895ce1fdbc4f 100644
--- a/include/linux/zswap.h
+++ b/include/linux/zswap.h
@@ -30,7 +30,7 @@ struct zswap_lruvec_state {
 bool zswap_store(struct folio *folio);
 bool zswap_load(struct folio *folio);
 void zswap_invalidate(int type, pgoff_t offset);
-int zswap_swapon(int type);
+int zswap_swapon(int type, unsigned long nr_pages);
 void zswap_swapoff(int type);
 void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg);
 void zswap_lruvec_state_init(struct lruvec *lruvec);
@@ -51,7 +51,7 @@ static inline bool zswap_load(struct folio *folio)
 }
 
 static inline void zswap_invalidate(int type, pgoff_t offset) {}
-static inline int zswap_swapon(int type)
+static inline int zswap_swapon(int type, unsigned long nr_pages)
 {
 	return 0;
 }
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 726a2c4d185217..b11b6057d8b5fb 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3165,7 +3165,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 	if (error)
 		goto bad_swap_unlock_inode;
 
-	error = zswap_swapon(p->type);
+	error = zswap_swapon(p->type, maxpages);
 	if (error)
 		goto free_swap_address_space;
 
diff --git a/mm/zswap.c b/mm/zswap.c
index 3901a2445b1578..1a864c8bc081bc 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -239,6 +239,7 @@ struct zswap_tree {
 };
 
 static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
+static unsigned int nr_zswap_trees[MAX_SWAPFILES];
 
 /* RCU-protected iteration */
 static LIST_HEAD(zswap_pools);
@@ -265,6 +266,12 @@ static bool zswap_has_pool;
 * helpers and fwd declarations
 **********************************/
 
+static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp)
+{
+	return &zswap_trees[swp_type(swp)][swp_offset(swp)
+		>> SWAP_ADDRESS_SPACE_SHIFT];
+}
+
 #define zswap_pool_debug(msg, p)				\
 	pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,		\
 		 zpool_get_type((p)->zpools[0]))
@@ -865,7 +872,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
 	 * until the entry is verified to still be alive in the tree.
 	 */
 	swpoffset = swp_offset(entry->swpentry);
-	tree = zswap_trees[swp_type(entry->swpentry)];
+	tree = swap_zswap_tree(entry->swpentry);
 	list_lru_isolate(l, item);
 	/*
 	 * It's safe to drop the lock here because we return either
@@ -1494,10 +1501,9 @@ static void zswap_fill_page(void *ptr, unsigned long value)
 bool zswap_store(struct folio *folio)
 {
 	swp_entry_t swp = folio->swap;
-	int type = swp_type(swp);
 	pgoff_t offset = swp_offset(swp);
 	struct page *page = &folio->page;
-	struct zswap_tree *tree = zswap_trees[type];
+	struct zswap_tree *tree = swap_zswap_tree(swp);
 	struct zswap_entry *entry, *dupentry;
 	struct scatterlist input, output;
 	struct crypto_acomp_ctx *acomp_ctx;
@@ -1569,7 +1575,7 @@ bool zswap_store(struct folio *folio)
 		src = kmap_local_page(page);
 		if (zswap_is_page_same_filled(src, &value)) {
 			kunmap_local(src);
-			entry->swpentry = swp_entry(type, offset);
+			entry->swpentry = swp;
 			entry->length = 0;
 			entry->value = value;
 			atomic_inc(&zswap_same_filled_pages);
@@ -1651,7 +1657,7 @@ bool zswap_store(struct folio *folio)
 	mutex_unlock(&acomp_ctx->mutex);
 
 	/* populate entry */
-	entry->swpentry = swp_entry(type, offset);
+	entry->swpentry = swp;
 	entry->handle = handle;
 	entry->length = dlen;
 
@@ -1711,10 +1717,9 @@ bool zswap_store(struct folio *folio)
 bool zswap_load(struct folio *folio)
 {
 	swp_entry_t swp = folio->swap;
-	int type = swp_type(swp);
 	pgoff_t offset = swp_offset(swp);
 	struct page *page = &folio->page;
-	struct zswap_tree *tree = zswap_trees[type];
+	struct zswap_tree *tree = swap_zswap_tree(swp);
 	struct zswap_entry *entry;
 	u8 *dst;
 
@@ -1757,7 +1762,7 @@ bool zswap_load(struct folio *folio)
 
 void zswap_invalidate(int type, pgoff_t offset)
 {
-	struct zswap_tree *tree = zswap_trees[type];
+	struct zswap_tree *tree = swap_zswap_tree(swp_entry(type, offset));
 	struct zswap_entry *entry;
 
 	/* find */
@@ -1772,37 +1777,53 @@ void zswap_invalidate(int type, pgoff_t offset)
 	spin_unlock(&tree->lock);
 }
 
-int zswap_swapon(int type)
+int zswap_swapon(int type, unsigned long nr_pages)
 {
-	struct zswap_tree *tree;
+	struct zswap_tree *trees, *tree;
+	unsigned int nr, i;
 
-	tree = kzalloc(sizeof(*tree), GFP_KERNEL);
-	if (!tree) {
+	nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
+	trees = kvcalloc(nr, sizeof(*tree), GFP_KERNEL);
+	if (!trees) {
 		pr_err("alloc failed, zswap disabled for swap type %d\n", type);
 		return -ENOMEM;
 	}
 
-	tree->rbroot = RB_ROOT;
-	spin_lock_init(&tree->lock);
-	zswap_trees[type] = tree;
+	for (i = 0; i < nr; i++) {
+		tree = trees + i;
+		tree->rbroot = RB_ROOT;
+		spin_lock_init(&tree->lock);
+	}
+
+	nr_zswap_trees[type] = nr;
+	zswap_trees[type] = trees;
 	return 0;
 }
 
 void zswap_swapoff(int type)
 {
-	struct zswap_tree *tree = zswap_trees[type];
-	struct zswap_entry *entry, *n;
+	struct zswap_tree *trees = zswap_trees[type];
+	unsigned int i;
 
-	if (!tree)
+	if (!trees)
 		return;
 
-	/* walk the tree and free everything */
-	spin_lock(&tree->lock);
-	rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
-		zswap_free_entry(entry);
-	tree->rbroot = RB_ROOT;
-	spin_unlock(&tree->lock);
-	kfree(tree);
+	for (i = 0; i < nr_zswap_trees[type]; i++) {
+		struct zswap_tree *tree = trees + i;
+		struct zswap_entry *entry, *n;
+
+		/* walk the tree and free everything */
+		spin_lock(&tree->lock);
+		rbtree_postorder_for_each_entry_safe(entry, n,
+						     &tree->rbroot,
+						     rbnode)
+			zswap_free_entry(entry);
+		tree->rbroot = RB_ROOT;
+		spin_unlock(&tree->lock);
+	}
+
+	kvfree(trees);
+	nr_zswap_trees[type] = 0;
 	zswap_trees[type] = NULL;
 }
 

From e6a1ca66392f9af26cafb428123fe066c826b06c Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosryahmed@google.com>
Date: Wed, 24 Jan 2024 04:51:11 +0000
Subject: [PATCH 488/707] mm: swap: enforce updating inuse_pages at the end of
 swap_range_free()

Patch series "mm: zswap: simplify zswap_swapoff()", v2.

These patches aim to simplify zswap_swapoff() by removing the unnecessary
trees cleanup code.  Patch 1 makes sure that the order of operations
during swapoff is enforced correctly, making sure the simplification in
patch 2 is correct in a future-proof manner.


This patch (of 2):

In swap_range_free(), we update inuse_pages then do some cleanups (arch
invalidation, zswap invalidation, swap cache cleanups, etc).  During
swapoff, try_to_unuse() checks that inuse_pages is 0 to make sure all swap
entries are freed.  Make sure we only update inuse_pages after we are done
with the cleanups in swap_range_free(), and use the proper memory barriers
to enforce it.  This makes sure that code following try_to_unuse() can
safely assume that swap_range_free() ran for all entries in thr swapfile
(e.g.  swap cache cleanup, zswap_swapoff()).

In practice, this currently isn't a problem because swap_range_free() is
called with the swap info lock held, and the swapoff code happens to spin
for that after try_to_unuse().  However, this seems fragile and
unintentional, so make it more relable and future-proof.  This also
facilitates a following simplification of zswap_swapoff().

Link: https://lkml.kernel.org/r/20240124045113.415378-1-yosryahmed@google.com
Link: https://lkml.kernel.org/r/20240124045113.415378-2-yosryahmed@google.com
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Chengming Zhou <zhouchengming@bytedance.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Nhat Pham <nphamcs@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/swapfile.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index b11b6057d8b5fb..0580bb3e34d773 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -737,8 +737,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
 		if (was_full && (si->flags & SWP_WRITEOK))
 			add_to_avail_list(si);
 	}
-	atomic_long_add(nr_entries, &nr_swap_pages);
-	WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries);
 	if (si->flags & SWP_BLKDEV)
 		swap_slot_free_notify =
 			si->bdev->bd_disk->fops->swap_slot_free_notify;
@@ -752,6 +750,14 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
 		offset++;
 	}
 	clear_shadow_from_swap_cache(si->type, begin, end);
+
+	/*
+	 * Make sure that try_to_unuse() observes si->inuse_pages reaching 0
+	 * only after the above cleanups are done.
+	 */
+	smp_wmb();
+	atomic_long_add(nr_entries, &nr_swap_pages);
+	WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries);
 }
 
 static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
@@ -2049,7 +2055,7 @@ static int try_to_unuse(unsigned int type)
 	unsigned int i;
 
 	if (!READ_ONCE(si->inuse_pages))
-		return 0;
+		goto success;
 
 retry:
 	retval = shmem_unuse(type);
@@ -2130,6 +2136,12 @@ static int try_to_unuse(unsigned int type)
 		return -EINTR;
 	}
 
+success:
+	/*
+	 * Make sure that further cleanups after try_to_unuse() returns happen
+	 * after swap_range_free() reduces si->inuse_pages to 0.
+	 */
+	smp_mb();
 	return 0;
 }
 

From 008a70daf58254538a14b42d6c144997c980ad90 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosryahmed@google.com>
Date: Wed, 24 Jan 2024 04:51:12 +0000
Subject: [PATCH 489/707] mm: zswap: remove unnecessary trees cleanups in
 zswap_swapoff()

During swapoff, try_to_unuse() makes sure that zswap_invalidate() is
called for all swap entries before zswap_swapoff() is called.  This means
that all zswap entries should already be removed from the tree.  Simplify
zswap_swapoff() by removing the trees cleanup code, and leave an assertion
in its place.

Link: https://lkml.kernel.org/r/20240124045113.415378-3-yosryahmed@google.com
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: Chengming Zhou <zhouchengming@bytedance.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Nhat Pham <nphamcs@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/zswap.c | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index 1a864c8bc081bc..b5638fb39ff622 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1808,19 +1808,9 @@ void zswap_swapoff(int type)
 	if (!trees)
 		return;
 
-	for (i = 0; i < nr_zswap_trees[type]; i++) {
-		struct zswap_tree *tree = trees + i;
-		struct zswap_entry *entry, *n;
-
-		/* walk the tree and free everything */
-		spin_lock(&tree->lock);
-		rbtree_postorder_for_each_entry_safe(entry, n,
-						     &tree->rbroot,
-						     rbnode)
-			zswap_free_entry(entry);
-		tree->rbroot = RB_ROOT;
-		spin_unlock(&tree->lock);
-	}
+	/* try_to_unuse() invalidated all the entries already */
+	for (i = 0; i < nr_zswap_trees[type]; i++)
+		WARN_ON_ONCE(!RB_EMPTY_ROOT(&trees[i].rbroot));
 
 	kvfree(trees);
 	nr_zswap_trees[type] = 0;

From ce3883bf8e64d3bfded2c41e9bfbfba8236e9ffa Mon Sep 17 00:00:00 2001
From: Yajun Deng <yajun.deng@linux.dev>
Date: Wed, 24 Jan 2024 11:57:19 +0800
Subject: [PATCH 490/707] mm/mmap: introduce vma_set_range()

There is a lot of code needs to set the range of vma in mmap.c, introduce
vma_set_range() to simplify the code.

Link: https://lkml.kernel.org/r/20240124035719.3685193-1-yajun.deng@linux.dev
Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/internal.h |  9 +++++++++
 mm/mmap.c     | 29 +++++++----------------------
 2 files changed, 16 insertions(+), 22 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index f309a010d50fb6..1e29c5821a1dde 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1114,6 +1114,15 @@ static inline bool gup_must_unshare(struct vm_area_struct *vma,
 extern bool mirrored_kernelcore;
 extern bool memblock_has_mirror(void);
 
+static __always_inline void vma_set_range(struct vm_area_struct *vma,
+					  unsigned long start, unsigned long end,
+					  pgoff_t pgoff)
+{
+	vma->vm_start = start;
+	vma->vm_end = end;
+	vma->vm_pgoff = pgoff;
+}
+
 static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma)
 {
 	/*
diff --git a/mm/mmap.c b/mm/mmap.c
index 66f534ec90a55e..476de5daf598d1 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -663,9 +663,7 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
 
 	vma_prepare(&vp);
 	vma_adjust_trans_huge(vma, start, end, 0);
-	vma->vm_start = start;
-	vma->vm_end = end;
-	vma->vm_pgoff = pgoff;
+	vma_set_range(vma, start, end, pgoff);
 	vma_iter_store(vmi, vma);
 
 	vma_complete(&vp, vmi, vma->vm_mm);
@@ -708,9 +706,7 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
 	vma_adjust_trans_huge(vma, start, end, 0);
 
 	vma_iter_clear(vmi);
-	vma->vm_start = start;
-	vma->vm_end = end;
-	vma->vm_pgoff = pgoff;
+	vma_set_range(vma, start, end, pgoff);
 	vma_complete(&vp, vmi, vma->vm_mm);
 	return 0;
 }
@@ -1015,10 +1011,7 @@ static struct vm_area_struct
 
 	vma_prepare(&vp);
 	vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start);
-
-	vma->vm_start = vma_start;
-	vma->vm_end = vma_end;
-	vma->vm_pgoff = vma_pgoff;
+	vma_set_range(vma, vma_start, vma_end, vma_pgoff);
 
 	if (vma_expanded)
 		vma_iter_store(vmi, vma);
@@ -2811,11 +2804,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 	}
 
 	vma_iter_config(&vmi, addr, end);
-	vma->vm_start = addr;
-	vma->vm_end = end;
+	vma_set_range(vma, addr, end, pgoff);
 	vm_flags_init(vma, vm_flags);
 	vma->vm_page_prot = vm_get_page_prot(vm_flags);
-	vma->vm_pgoff = pgoff;
 
 	if (file) {
 		vma->vm_file = get_file(file);
@@ -3165,9 +3156,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
 		goto unacct_fail;
 
 	vma_set_anonymous(vma);
-	vma->vm_start = addr;
-	vma->vm_end = addr + len;
-	vma->vm_pgoff = addr >> PAGE_SHIFT;
+	vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT);
 	vm_flags_init(vma, flags);
 	vma->vm_page_prot = vm_get_page_prot(flags);
 	vma_start_write(vma);
@@ -3404,9 +3393,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 		new_vma = vm_area_dup(vma);
 		if (!new_vma)
 			goto out;
-		new_vma->vm_start = addr;
-		new_vma->vm_end = addr + len;
-		new_vma->vm_pgoff = pgoff;
+		vma_set_range(new_vma, addr, addr + len, pgoff);
 		if (vma_dup_policy(vma, new_vma))
 			goto out_free_vma;
 		if (anon_vma_clone(new_vma, vma))
@@ -3574,9 +3561,7 @@ static struct vm_area_struct *__install_special_mapping(
 	if (unlikely(vma == NULL))
 		return ERR_PTR(-ENOMEM);
 
-	vma->vm_start = addr;
-	vma->vm_end = addr + len;
-
+	vma_set_range(vma, addr, addr + len, 0);
 	vm_flags_init(vma, (vm_flags | mm->def_flags |
 		      VM_DONTEXPAND | VM_SOFTDIRTY) & ~VM_LOCKED_MASK);
 	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);

From 88889777af485e0d0cccb6e84713d7ec997cc1eb Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosryahmed@google.com>
Date: Thu, 25 Jan 2024 08:14:23 +0000
Subject: [PATCH 491/707] mm: zswap: remove unused tree argument in
 zswap_entry_put()

Commit 7310895779624 ("mm: zswap: tighten up entry invalidation") removed
the usage of tree argument, delete it.

Link: https://lkml.kernel.org/r/20240125081423.1200336-1-yosryahmed@google.com
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: Chengming Zhou <zhouchengming@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/zswap.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index b5638fb39ff622..50a4af3f185607 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -569,8 +569,7 @@ static void zswap_entry_get(struct zswap_entry *entry)
 /* caller must hold the tree lock
 * remove from the tree and free it, if nobody reference the entry
 */
-static void zswap_entry_put(struct zswap_tree *tree,
-			struct zswap_entry *entry)
+static void zswap_entry_put(struct zswap_entry *entry)
 {
 	int refcount = --entry->refcount;
 
@@ -853,7 +852,7 @@ static void zswap_invalidate_entry(struct zswap_tree *tree,
 				   struct zswap_entry *entry)
 {
 	if (zswap_rb_erase(&tree->rbroot, entry))
-		zswap_entry_put(tree, entry);
+		zswap_entry_put(entry);
 }
 
 static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
@@ -922,7 +921,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
 
 put_unlock:
 	/* Drop local reference */
-	zswap_entry_put(tree, entry);
+	zswap_entry_put(entry);
 unlock:
 	spin_unlock(&tree->lock);
 	spin_lock(lock);
@@ -1754,7 +1753,7 @@ bool zswap_load(struct folio *folio)
 		zswap_lru_del(&entry->pool->list_lru, entry);
 		zswap_lru_add(&entry->pool->list_lru, entry);
 	}
-	zswap_entry_put(tree, entry);
+	zswap_entry_put(entry);
 	spin_unlock(&tree->lock);
 
 	return true;

From edcadaf023d516cf087d065b9921a4098459b9cb Mon Sep 17 00:00:00 2001
From: Vishal Verma <vishal.l.verma@intel.com>
Date: Wed, 24 Jan 2024 12:03:46 -0800
Subject: [PATCH 492/707] dax/bus.c: replace driver-core lock usage by a local
 rwsem

Patch series "Add DAX ABI for memmap_on_memory", v7.

This series adds sysfs ABI to control memmap_on_memory behavior for DAX
devices.

Patch 1 replaces incorrect device_lock() usage with a local rwsem - this
was identified during review.

Patch 2 is also a preparatory patch that replaces sprintf() for sysfs
operations with sysfs_emit()

Patch 3 adds the missing documentation for the sysfs ABI for DAX regions
and Dax devices.

Patch 4 exports mhp_supports_memmap_on_memory().

Patch 5 adds the new ABI for toggling memmap_on_memory semantics for dax
devices.


This patch (of 5):

The dax driver incorrectly used driver-core device locks to protect
internal dax region and dax device configuration structures.  Replace the
device lock usage with a local rwsem, one each for dax region
configuration and dax device configuration.  As a result of this
conversion, no device_lock() usage remains in dax/bus.c.

Link: https://lkml.kernel.org/r/20240124-vv-dax_abi-v7-0-20d16cb8d23d@intel.com
Link: https://lkml.kernel.org/r/20240124-vv-dax_abi-v7-1-20d16cb8d23d@intel.com
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
Reported-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Alison Schofield <alison.schofield@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Li Zhijian <lizhijian@fujitsu.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/dax/bus.c | 218 +++++++++++++++++++++++++++++++++-------------
 1 file changed, 156 insertions(+), 62 deletions(-)

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index 1ff1ab5fa105a6..cb148f74ceda67 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -12,6 +12,18 @@
 
 static DEFINE_MUTEX(dax_bus_lock);
 
+/*
+ * All changes to the dax region configuration occur with this lock held
+ * for write.
+ */
+DECLARE_RWSEM(dax_region_rwsem);
+
+/*
+ * All changes to the dax device configuration occur with this lock held
+ * for write.
+ */
+DECLARE_RWSEM(dax_dev_rwsem);
+
 #define DAX_NAME_LEN 30
 struct dax_id {
 	struct list_head list;
@@ -180,7 +192,7 @@ static u64 dev_dax_size(struct dev_dax *dev_dax)
 	u64 size = 0;
 	int i;
 
-	device_lock_assert(&dev_dax->dev);
+	WARN_ON_ONCE(!rwsem_is_locked(&dax_dev_rwsem));
 
 	for (i = 0; i < dev_dax->nr_range; i++)
 		size += range_len(&dev_dax->ranges[i].range);
@@ -194,8 +206,15 @@ static int dax_bus_probe(struct device *dev)
 	struct dev_dax *dev_dax = to_dev_dax(dev);
 	struct dax_region *dax_region = dev_dax->region;
 	int rc;
+	u64 size;
 
-	if (dev_dax_size(dev_dax) == 0 || dev_dax->id < 0)
+	rc = down_read_interruptible(&dax_dev_rwsem);
+	if (rc)
+		return rc;
+	size = dev_dax_size(dev_dax);
+	up_read(&dax_dev_rwsem);
+
+	if (size == 0 || dev_dax->id < 0)
 		return -ENXIO;
 
 	rc = dax_drv->probe(dev_dax);
@@ -283,7 +302,7 @@ static unsigned long long dax_region_avail_size(struct dax_region *dax_region)
 	resource_size_t size = resource_size(&dax_region->res);
 	struct resource *res;
 
-	device_lock_assert(dax_region->dev);
+	WARN_ON_ONCE(!rwsem_is_locked(&dax_region_rwsem));
 
 	for_each_dax_region_resource(dax_region, res)
 		size -= resource_size(res);
@@ -295,10 +314,13 @@ static ssize_t available_size_show(struct device *dev,
 {
 	struct dax_region *dax_region = dev_get_drvdata(dev);
 	unsigned long long size;
+	int rc;
 
-	device_lock(dev);
+	rc = down_read_interruptible(&dax_region_rwsem);
+	if (rc)
+		return rc;
 	size = dax_region_avail_size(dax_region);
-	device_unlock(dev);
+	up_read(&dax_region_rwsem);
 
 	return sprintf(buf, "%llu\n", size);
 }
@@ -314,10 +336,12 @@ static ssize_t seed_show(struct device *dev,
 	if (is_static(dax_region))
 		return -EINVAL;
 
-	device_lock(dev);
+	rc = down_read_interruptible(&dax_region_rwsem);
+	if (rc)
+		return rc;
 	seed = dax_region->seed;
 	rc = sprintf(buf, "%s\n", seed ? dev_name(seed) : "");
-	device_unlock(dev);
+	up_read(&dax_region_rwsem);
 
 	return rc;
 }
@@ -333,14 +357,18 @@ static ssize_t create_show(struct device *dev,
 	if (is_static(dax_region))
 		return -EINVAL;
 
-	device_lock(dev);
+	rc = down_read_interruptible(&dax_region_rwsem);
+	if (rc)
+		return rc;
 	youngest = dax_region->youngest;
 	rc = sprintf(buf, "%s\n", youngest ? dev_name(youngest) : "");
-	device_unlock(dev);
+	up_read(&dax_region_rwsem);
 
 	return rc;
 }
 
+static struct dev_dax *__devm_create_dev_dax(struct dev_dax_data *data);
+
 static ssize_t create_store(struct device *dev, struct device_attribute *attr,
 		const char *buf, size_t len)
 {
@@ -358,7 +386,9 @@ static ssize_t create_store(struct device *dev, struct device_attribute *attr,
 	if (val != 1)
 		return -EINVAL;
 
-	device_lock(dev);
+	rc = down_write_killable(&dax_region_rwsem);
+	if (rc)
+		return rc;
 	avail = dax_region_avail_size(dax_region);
 	if (avail == 0)
 		rc = -ENOSPC;
@@ -369,7 +399,7 @@ static ssize_t create_store(struct device *dev, struct device_attribute *attr,
 			.id = -1,
 			.memmap_on_memory = false,
 		};
-		struct dev_dax *dev_dax = devm_create_dev_dax(&data);
+		struct dev_dax *dev_dax = __devm_create_dev_dax(&data);
 
 		if (IS_ERR(dev_dax))
 			rc = PTR_ERR(dev_dax);
@@ -387,7 +417,7 @@ static ssize_t create_store(struct device *dev, struct device_attribute *attr,
 			rc = len;
 		}
 	}
-	device_unlock(dev);
+	up_write(&dax_region_rwsem);
 
 	return rc;
 }
@@ -417,7 +447,7 @@ static void trim_dev_dax_range(struct dev_dax *dev_dax)
 	struct range *range = &dev_dax->ranges[i].range;
 	struct dax_region *dax_region = dev_dax->region;
 
-	device_lock_assert(dax_region->dev);
+	WARN_ON_ONCE(!rwsem_is_locked(&dax_region_rwsem));
 	dev_dbg(&dev_dax->dev, "delete range[%d]: %#llx:%#llx\n", i,
 		(unsigned long long)range->start,
 		(unsigned long long)range->end);
@@ -435,7 +465,7 @@ static void free_dev_dax_ranges(struct dev_dax *dev_dax)
 		trim_dev_dax_range(dev_dax);
 }
 
-static void unregister_dev_dax(void *dev)
+static void __unregister_dev_dax(void *dev)
 {
 	struct dev_dax *dev_dax = to_dev_dax(dev);
 
@@ -447,6 +477,17 @@ static void unregister_dev_dax(void *dev)
 	put_device(dev);
 }
 
+static void unregister_dev_dax(void *dev)
+{
+	if (rwsem_is_locked(&dax_region_rwsem))
+		return __unregister_dev_dax(dev);
+
+	if (WARN_ON_ONCE(down_write_killable(&dax_region_rwsem) != 0))
+		return;
+	__unregister_dev_dax(dev);
+	up_write(&dax_region_rwsem);
+}
+
 static void dax_region_free(struct kref *kref)
 {
 	struct dax_region *dax_region;
@@ -463,11 +504,10 @@ static void dax_region_put(struct dax_region *dax_region)
 /* a return value >= 0 indicates this invocation invalidated the id */
 static int __free_dev_dax_id(struct dev_dax *dev_dax)
 {
-	struct device *dev = &dev_dax->dev;
 	struct dax_region *dax_region;
 	int rc = dev_dax->id;
 
-	device_lock_assert(dev);
+	WARN_ON_ONCE(!rwsem_is_locked(&dax_dev_rwsem));
 
 	if (!dev_dax->dyn_id || dev_dax->id < 0)
 		return -1;
@@ -480,12 +520,13 @@ static int __free_dev_dax_id(struct dev_dax *dev_dax)
 
 static int free_dev_dax_id(struct dev_dax *dev_dax)
 {
-	struct device *dev = &dev_dax->dev;
 	int rc;
 
-	device_lock(dev);
+	rc = down_write_killable(&dax_dev_rwsem);
+	if (rc)
+		return rc;
 	rc = __free_dev_dax_id(dev_dax);
-	device_unlock(dev);
+	up_write(&dax_dev_rwsem);
 	return rc;
 }
 
@@ -519,8 +560,14 @@ static ssize_t delete_store(struct device *dev, struct device_attribute *attr,
 	if (!victim)
 		return -ENXIO;
 
-	device_lock(dev);
-	device_lock(victim);
+	rc = down_write_killable(&dax_region_rwsem);
+	if (rc)
+		return rc;
+	rc = down_write_killable(&dax_dev_rwsem);
+	if (rc) {
+		up_write(&dax_region_rwsem);
+		return rc;
+	}
 	dev_dax = to_dev_dax(victim);
 	if (victim->driver || dev_dax_size(dev_dax))
 		rc = -EBUSY;
@@ -541,12 +588,12 @@ static ssize_t delete_store(struct device *dev, struct device_attribute *attr,
 		} else
 			rc = -EBUSY;
 	}
-	device_unlock(victim);
+	up_write(&dax_dev_rwsem);
 
 	/* won the race to invalidate the device, clean it up */
 	if (do_del)
 		devm_release_action(dev, unregister_dev_dax, victim);
-	device_unlock(dev);
+	up_write(&dax_region_rwsem);
 	put_device(victim);
 
 	return rc;
@@ -658,16 +705,15 @@ static void dax_mapping_release(struct device *dev)
 	put_device(parent);
 }
 
-static void unregister_dax_mapping(void *data)
+static void __unregister_dax_mapping(void *data)
 {
 	struct device *dev = data;
 	struct dax_mapping *mapping = to_dax_mapping(dev);
 	struct dev_dax *dev_dax = to_dev_dax(dev->parent);
-	struct dax_region *dax_region = dev_dax->region;
 
 	dev_dbg(dev, "%s\n", __func__);
 
-	device_lock_assert(dax_region->dev);
+	WARN_ON_ONCE(!rwsem_is_locked(&dax_region_rwsem));
 
 	dev_dax->ranges[mapping->range_id].mapping = NULL;
 	mapping->range_id = -1;
@@ -675,28 +721,37 @@ static void unregister_dax_mapping(void *data)
 	device_unregister(dev);
 }
 
+static void unregister_dax_mapping(void *data)
+{
+	if (rwsem_is_locked(&dax_region_rwsem))
+		return __unregister_dax_mapping(data);
+
+	if (WARN_ON_ONCE(down_write_killable(&dax_region_rwsem) != 0))
+		return;
+	__unregister_dax_mapping(data);
+	up_write(&dax_region_rwsem);
+}
+
 static struct dev_dax_range *get_dax_range(struct device *dev)
 {
 	struct dax_mapping *mapping = to_dax_mapping(dev);
 	struct dev_dax *dev_dax = to_dev_dax(dev->parent);
-	struct dax_region *dax_region = dev_dax->region;
+	int rc;
 
-	device_lock(dax_region->dev);
+	rc = down_write_killable(&dax_region_rwsem);
+	if (rc)
+		return NULL;
 	if (mapping->range_id < 0) {
-		device_unlock(dax_region->dev);
+		up_write(&dax_region_rwsem);
 		return NULL;
 	}
 
 	return &dev_dax->ranges[mapping->range_id];
 }
 
-static void put_dax_range(struct dev_dax_range *dax_range)
+static void put_dax_range(void)
 {
-	struct dax_mapping *mapping = dax_range->mapping;
-	struct dev_dax *dev_dax = to_dev_dax(mapping->dev.parent);
-	struct dax_region *dax_region = dev_dax->region;
-
-	device_unlock(dax_region->dev);
+	up_write(&dax_region_rwsem);
 }
 
 static ssize_t start_show(struct device *dev,
@@ -709,7 +764,7 @@ static ssize_t start_show(struct device *dev,
 	if (!dax_range)
 		return -ENXIO;
 	rc = sprintf(buf, "%#llx\n", dax_range->range.start);
-	put_dax_range(dax_range);
+	put_dax_range();
 
 	return rc;
 }
@@ -725,7 +780,7 @@ static ssize_t end_show(struct device *dev,
 	if (!dax_range)
 		return -ENXIO;
 	rc = sprintf(buf, "%#llx\n", dax_range->range.end);
-	put_dax_range(dax_range);
+	put_dax_range();
 
 	return rc;
 }
@@ -741,7 +796,7 @@ static ssize_t pgoff_show(struct device *dev,
 	if (!dax_range)
 		return -ENXIO;
 	rc = sprintf(buf, "%#lx\n", dax_range->pgoff);
-	put_dax_range(dax_range);
+	put_dax_range();
 
 	return rc;
 }
@@ -775,7 +830,7 @@ static int devm_register_dax_mapping(struct dev_dax *dev_dax, int range_id)
 	struct device *dev;
 	int rc;
 
-	device_lock_assert(dax_region->dev);
+	WARN_ON_ONCE(!rwsem_is_locked(&dax_region_rwsem));
 
 	if (dev_WARN_ONCE(&dev_dax->dev, !dax_region->dev->driver,
 				"region disabled\n"))
@@ -821,7 +876,7 @@ static int alloc_dev_dax_range(struct dev_dax *dev_dax, u64 start,
 	struct resource *alloc;
 	int i, rc;
 
-	device_lock_assert(dax_region->dev);
+	WARN_ON_ONCE(!rwsem_is_locked(&dax_region_rwsem));
 
 	/* handle the seed alloc special case */
 	if (!size) {
@@ -875,13 +930,12 @@ static int adjust_dev_dax_range(struct dev_dax *dev_dax, struct resource *res, r
 {
 	int last_range = dev_dax->nr_range - 1;
 	struct dev_dax_range *dax_range = &dev_dax->ranges[last_range];
-	struct dax_region *dax_region = dev_dax->region;
 	bool is_shrink = resource_size(res) > size;
 	struct range *range = &dax_range->range;
 	struct device *dev = &dev_dax->dev;
 	int rc;
 
-	device_lock_assert(dax_region->dev);
+	WARN_ON_ONCE(!rwsem_is_locked(&dax_region_rwsem));
 
 	if (dev_WARN_ONCE(dev, !size, "deletion is handled by dev_dax_shrink\n"))
 		return -EINVAL;
@@ -907,10 +961,13 @@ static ssize_t size_show(struct device *dev,
 {
 	struct dev_dax *dev_dax = to_dev_dax(dev);
 	unsigned long long size;
+	int rc;
 
-	device_lock(dev);
+	rc = down_write_killable(&dax_dev_rwsem);
+	if (rc)
+		return rc;
 	size = dev_dax_size(dev_dax);
-	device_unlock(dev);
+	up_write(&dax_dev_rwsem);
 
 	return sprintf(buf, "%llu\n", size);
 }
@@ -1080,17 +1137,27 @@ static ssize_t size_store(struct device *dev, struct device_attribute *attr,
 		return -EINVAL;
 	}
 
-	device_lock(dax_region->dev);
+	rc = down_write_killable(&dax_region_rwsem);
+	if (rc)
+		return rc;
 	if (!dax_region->dev->driver) {
-		device_unlock(dax_region->dev);
-		return -ENXIO;
+		rc = -ENXIO;
+		goto err_region;
 	}
-	device_lock(dev);
+	rc = down_write_killable(&dax_dev_rwsem);
+	if (rc)
+		goto err_dev;
+
 	rc = dev_dax_resize(dax_region, dev_dax, val);
-	device_unlock(dev);
-	device_unlock(dax_region->dev);
 
-	return rc == 0 ? len : rc;
+err_dev:
+	up_write(&dax_dev_rwsem);
+err_region:
+	up_write(&dax_region_rwsem);
+
+	if (rc == 0)
+		return len;
+	return rc;
 }
 static DEVICE_ATTR_RW(size);
 
@@ -1138,18 +1205,24 @@ static ssize_t mapping_store(struct device *dev, struct device_attribute *attr,
 		return rc;
 
 	rc = -ENXIO;
-	device_lock(dax_region->dev);
+	rc = down_write_killable(&dax_region_rwsem);
+	if (rc)
+		return rc;
 	if (!dax_region->dev->driver) {
-		device_unlock(dax_region->dev);
+		up_write(&dax_region_rwsem);
+		return rc;
+	}
+	rc = down_write_killable(&dax_dev_rwsem);
+	if (rc) {
+		up_write(&dax_region_rwsem);
 		return rc;
 	}
-	device_lock(dev);
 
 	to_alloc = range_len(&r);
 	if (alloc_is_aligned(dev_dax, to_alloc))
 		rc = alloc_dev_dax_range(dev_dax, r.start, to_alloc);
-	device_unlock(dev);
-	device_unlock(dax_region->dev);
+	up_write(&dax_dev_rwsem);
+	up_write(&dax_region_rwsem);
 
 	return rc == 0 ? len : rc;
 }
@@ -1196,13 +1269,19 @@ static ssize_t align_store(struct device *dev, struct device_attribute *attr,
 	if (!dax_align_valid(val))
 		return -EINVAL;
 
-	device_lock(dax_region->dev);
+	rc = down_write_killable(&dax_region_rwsem);
+	if (rc)
+		return rc;
 	if (!dax_region->dev->driver) {
-		device_unlock(dax_region->dev);
+		up_write(&dax_region_rwsem);
 		return -ENXIO;
 	}
 
-	device_lock(dev);
+	rc = down_write_killable(&dax_dev_rwsem);
+	if (rc) {
+		up_write(&dax_region_rwsem);
+		return rc;
+	}
 	if (dev->driver) {
 		rc = -EBUSY;
 		goto out_unlock;
@@ -1214,8 +1293,8 @@ static ssize_t align_store(struct device *dev, struct device_attribute *attr,
 	if (rc)
 		dev_dax->align = align_save;
 out_unlock:
-	device_unlock(dev);
-	device_unlock(dax_region->dev);
+	up_write(&dax_dev_rwsem);
+	up_write(&dax_region_rwsem);
 	return rc == 0 ? len : rc;
 }
 static DEVICE_ATTR_RW(align);
@@ -1325,7 +1404,7 @@ static const struct device_type dev_dax_type = {
 	.groups = dax_attribute_groups,
 };
 
-struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data)
+static struct dev_dax *__devm_create_dev_dax(struct dev_dax_data *data)
 {
 	struct dax_region *dax_region = data->dax_region;
 	struct device *parent = dax_region->dev;
@@ -1440,6 +1519,21 @@ struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data)
 
 	return ERR_PTR(rc);
 }
+
+struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data)
+{
+	struct dev_dax *dev_dax;
+	int rc;
+
+	rc = down_write_killable(&dax_region_rwsem);
+	if (rc)
+		return ERR_PTR(rc);
+
+	dev_dax = __devm_create_dev_dax(data);
+	up_write(&dax_region_rwsem);
+
+	return dev_dax;
+}
 EXPORT_SYMBOL_GPL(devm_create_dev_dax);
 
 int __dax_driver_register(struct dax_device_driver *dax_drv,

From 1e0308bf31f3c788a922e49da69ec79d7684ba4d Mon Sep 17 00:00:00 2001
From: Vishal Verma <vishal.l.verma@intel.com>
Date: Wed, 24 Jan 2024 12:03:47 -0800
Subject: [PATCH 493/707] dax/bus.c: replace several sprintf() with
 sysfs_emit()

There were several places where drivers/dax/bus.c uses 'sprintf' to print
sysfs data.  Since a sysfs_emit() helper is available specifically for
this purpose, replace all the sprintf() usage for sysfs with sysfs_emit()
in this file.

Link: https://lkml.kernel.org/r/20240124-vv-dax_abi-v7-2-20d16cb8d23d@intel.com
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
Reported-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Alison Schofield <alison.schofield@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Li Zhijian <lizhijian@fujitsu.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/dax/bus.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index cb148f74ceda67..0fd948a4443e38 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -269,7 +269,7 @@ static ssize_t id_show(struct device *dev,
 {
 	struct dax_region *dax_region = dev_get_drvdata(dev);
 
-	return sprintf(buf, "%d\n", dax_region->id);
+	return sysfs_emit(buf, "%d\n", dax_region->id);
 }
 static DEVICE_ATTR_RO(id);
 
@@ -278,8 +278,8 @@ static ssize_t region_size_show(struct device *dev,
 {
 	struct dax_region *dax_region = dev_get_drvdata(dev);
 
-	return sprintf(buf, "%llu\n", (unsigned long long)
-			resource_size(&dax_region->res));
+	return sysfs_emit(buf, "%llu\n",
+			  (unsigned long long)resource_size(&dax_region->res));
 }
 static struct device_attribute dev_attr_region_size = __ATTR(size, 0444,
 		region_size_show, NULL);
@@ -289,7 +289,7 @@ static ssize_t region_align_show(struct device *dev,
 {
 	struct dax_region *dax_region = dev_get_drvdata(dev);
 
-	return sprintf(buf, "%u\n", dax_region->align);
+	return sysfs_emit(buf, "%u\n", dax_region->align);
 }
 static struct device_attribute dev_attr_region_align =
 		__ATTR(align, 0400, region_align_show, NULL);
@@ -322,7 +322,7 @@ static ssize_t available_size_show(struct device *dev,
 	size = dax_region_avail_size(dax_region);
 	up_read(&dax_region_rwsem);
 
-	return sprintf(buf, "%llu\n", size);
+	return sysfs_emit(buf, "%llu\n", size);
 }
 static DEVICE_ATTR_RO(available_size);
 
@@ -340,7 +340,7 @@ static ssize_t seed_show(struct device *dev,
 	if (rc)
 		return rc;
 	seed = dax_region->seed;
-	rc = sprintf(buf, "%s\n", seed ? dev_name(seed) : "");
+	rc = sysfs_emit(buf, "%s\n", seed ? dev_name(seed) : "");
 	up_read(&dax_region_rwsem);
 
 	return rc;
@@ -361,7 +361,7 @@ static ssize_t create_show(struct device *dev,
 	if (rc)
 		return rc;
 	youngest = dax_region->youngest;
-	rc = sprintf(buf, "%s\n", youngest ? dev_name(youngest) : "");
+	rc = sysfs_emit(buf, "%s\n", youngest ? dev_name(youngest) : "");
 	up_read(&dax_region_rwsem);
 
 	return rc;
@@ -763,7 +763,7 @@ static ssize_t start_show(struct device *dev,
 	dax_range = get_dax_range(dev);
 	if (!dax_range)
 		return -ENXIO;
-	rc = sprintf(buf, "%#llx\n", dax_range->range.start);
+	rc = sysfs_emit(buf, "%#llx\n", dax_range->range.start);
 	put_dax_range();
 
 	return rc;
@@ -779,7 +779,7 @@ static ssize_t end_show(struct device *dev,
 	dax_range = get_dax_range(dev);
 	if (!dax_range)
 		return -ENXIO;
-	rc = sprintf(buf, "%#llx\n", dax_range->range.end);
+	rc = sysfs_emit(buf, "%#llx\n", dax_range->range.end);
 	put_dax_range();
 
 	return rc;
@@ -795,7 +795,7 @@ static ssize_t pgoff_show(struct device *dev,
 	dax_range = get_dax_range(dev);
 	if (!dax_range)
 		return -ENXIO;
-	rc = sprintf(buf, "%#lx\n", dax_range->pgoff);
+	rc = sysfs_emit(buf, "%#lx\n", dax_range->pgoff);
 	put_dax_range();
 
 	return rc;
@@ -969,7 +969,7 @@ static ssize_t size_show(struct device *dev,
 	size = dev_dax_size(dev_dax);
 	up_write(&dax_dev_rwsem);
 
-	return sprintf(buf, "%llu\n", size);
+	return sysfs_emit(buf, "%llu\n", size);
 }
 
 static bool alloc_is_aligned(struct dev_dax *dev_dax, resource_size_t size)
@@ -1233,7 +1233,7 @@ static ssize_t align_show(struct device *dev,
 {
 	struct dev_dax *dev_dax = to_dev_dax(dev);
 
-	return sprintf(buf, "%d\n", dev_dax->align);
+	return sysfs_emit(buf, "%d\n", dev_dax->align);
 }
 
 static ssize_t dev_dax_validate_align(struct dev_dax *dev_dax)
@@ -1311,7 +1311,7 @@ static ssize_t target_node_show(struct device *dev,
 {
 	struct dev_dax *dev_dax = to_dev_dax(dev);
 
-	return sprintf(buf, "%d\n", dev_dax_target_node(dev_dax));
+	return sysfs_emit(buf, "%d\n", dev_dax_target_node(dev_dax));
 }
 static DEVICE_ATTR_RO(target_node);
 
@@ -1327,7 +1327,7 @@ static ssize_t resource_show(struct device *dev,
 	else
 		start = dev_dax->ranges[0].range.start;
 
-	return sprintf(buf, "%#llx\n", start);
+	return sysfs_emit(buf, "%#llx\n", start);
 }
 static DEVICE_ATTR(resource, 0400, resource_show, NULL);
 
@@ -1338,14 +1338,14 @@ static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
 	 * We only ever expect to handle device-dax instances, i.e. the
 	 * @type argument to MODULE_ALIAS_DAX_DEVICE() is always zero
 	 */
-	return sprintf(buf, DAX_DEVICE_MODALIAS_FMT "\n", 0);
+	return sysfs_emit(buf, DAX_DEVICE_MODALIAS_FMT "\n", 0);
 }
 static DEVICE_ATTR_RO(modalias);
 
 static ssize_t numa_node_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
-	return sprintf(buf, "%d\n", dev_to_node(dev));
+	return sysfs_emit(buf, "%d\n", dev_to_node(dev));
 }
 static DEVICE_ATTR_RO(numa_node);
 

From 7c04d26dcbe7be05c740796b5bc1f513937b6cd9 Mon Sep 17 00:00:00 2001
From: Vishal Verma <vishal.l.verma@intel.com>
Date: Wed, 24 Jan 2024 12:03:48 -0800
Subject: [PATCH 494/707] Documentatiion/ABI: add ABI documentation for
 sys-bus-dax

Add the missing sysfs ABI documentation for the device DAX subsystem.
Various ABI attributes under this have been present since v5.1, and more
have been added over time. In preparation for adding a new attribute,
add this file with the historical details.

Link: https://lkml.kernel.org/r/20240124-vv-dax_abi-v7-3-20d16cb8d23d@intel.com
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Li Zhijian <lizhijian@fujitsu.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/ABI/testing/sysfs-bus-dax | 136 ++++++++++++++++++++++++
 1 file changed, 136 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-bus-dax

diff --git a/Documentation/ABI/testing/sysfs-bus-dax b/Documentation/ABI/testing/sysfs-bus-dax
new file mode 100644
index 00000000000000..6359f7bc9bf430
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-dax
@@ -0,0 +1,136 @@
+What:		/sys/bus/dax/devices/daxX.Y/align
+Date:		October, 2020
+KernelVersion:	v5.10
+Contact:	nvdimm@lists.linux.dev
+Description:
+		(RW) Provides a way to specify an alignment for a dax device.
+		Values allowed are constrained by the physical address ranges
+		that back the dax device, and also by arch requirements.
+
+What:		/sys/bus/dax/devices/daxX.Y/mapping
+Date:		October, 2020
+KernelVersion:	v5.10
+Contact:	nvdimm@lists.linux.dev
+Description:
+		(WO) Provides a way to allocate a mapping range under a dax
+		device. Specified in the format <start>-<end>.
+
+What:		/sys/bus/dax/devices/daxX.Y/mapping[0..N]/start
+What:		/sys/bus/dax/devices/daxX.Y/mapping[0..N]/end
+What:		/sys/bus/dax/devices/daxX.Y/mapping[0..N]/page_offset
+Date:		October, 2020
+KernelVersion:	v5.10
+Contact:	nvdimm@lists.linux.dev
+Description:
+		(RO) A dax device may have multiple constituent discontiguous
+		address ranges. These are represented by the different
+		'mappingX' subdirectories. The 'start' attribute indicates the
+		start physical address for the given range. The 'end' attribute
+		indicates the end physical address for the given range. The
+		'page_offset' attribute indicates the offset of the current
+		range in the dax device.
+
+What:		/sys/bus/dax/devices/daxX.Y/resource
+Date:		June, 2019
+KernelVersion:	v5.3
+Contact:	nvdimm@lists.linux.dev
+Description:
+		(RO) The resource attribute indicates the starting physical
+		address of a dax device. In case of a device with multiple
+		constituent ranges, it indicates the starting address of the
+		first range.
+
+What:		/sys/bus/dax/devices/daxX.Y/size
+Date:		October, 2020
+KernelVersion:	v5.10
+Contact:	nvdimm@lists.linux.dev
+Description:
+		(RW) The size attribute indicates the total size of a dax
+		device. For creating subdivided dax devices, or for resizing
+		an existing device, the new size can be written to this as
+		part of the reconfiguration process.
+
+What:		/sys/bus/dax/devices/daxX.Y/numa_node
+Date:		November, 2019
+KernelVersion:	v5.5
+Contact:	nvdimm@lists.linux.dev
+Description:
+		(RO) If NUMA is enabled and the platform has affinitized the
+		backing device for this dax device, emit the CPU node
+		affinity for this device.
+
+What:		/sys/bus/dax/devices/daxX.Y/target_node
+Date:		February, 2019
+KernelVersion:	v5.1
+Contact:	nvdimm@lists.linux.dev
+Description:
+		(RO) The target-node attribute is the Linux numa-node that a
+		device-dax instance may create when it is online. Prior to
+		being online the device's 'numa_node' property reflects the
+		closest online cpu node which is the typical expectation of a
+		device 'numa_node'. Once it is online it becomes its own
+		distinct numa node.
+
+What:		$(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/available_size
+Date:		October, 2020
+KernelVersion:	v5.10
+Contact:	nvdimm@lists.linux.dev
+Description:
+		(RO) The available_size attribute tracks available dax region
+		capacity. This only applies to volatile hmem devices, not pmem
+		devices, since pmem devices are defined by nvdimm namespace
+		boundaries.
+
+What:		$(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/size
+Date:		July, 2017
+KernelVersion:	v5.1
+Contact:	nvdimm@lists.linux.dev
+Description:
+		(RO) The size attribute indicates the size of a given dax region
+		in bytes.
+
+What:		$(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/align
+Date:		October, 2020
+KernelVersion:	v5.10
+Contact:	nvdimm@lists.linux.dev
+Description:
+		(RO) The align attribute indicates alignment of the dax region.
+		Changes on align may not always be valid, when say certain
+		mappings were created with 2M and then we switch to 1G. This
+		validates all ranges against the new value being attempted, post
+		resizing.
+
+What:		$(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/seed
+Date:		October, 2020
+KernelVersion:	v5.10
+Contact:	nvdimm@lists.linux.dev
+Description:
+		(RO) The seed device is a concept for dynamic dax regions to be
+		able to split the region amongst multiple sub-instances.  The
+		seed device, similar to libnvdimm seed devices, is a device
+		that starts with zero capacity allocated and unbound to a
+		driver.
+
+What:		$(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/create
+Date:		October, 2020
+KernelVersion:	v5.10
+Contact:	nvdimm@lists.linux.dev
+Description:
+		(RW) The create interface to the dax region provides a way to
+		create a new unconfigured dax device under the given region, which
+		can then be configured (with a size etc.) and then probed.
+
+What:		$(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/delete
+Date:		October, 2020
+KernelVersion:	v5.10
+Contact:	nvdimm@lists.linux.dev
+Description:
+		(WO) The delete interface for a dax region provides for deletion
+		of any 0-sized and idle dax devices.
+
+What:		$(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/id
+Date:		July, 2017
+KernelVersion:	v5.1
+Contact:	nvdimm@lists.linux.dev
+Description:
+		(RO) The id attribute indicates the region id of a dax region.

From 2705e47b92409155c868c12f0db02e32ccbb32d3 Mon Sep 17 00:00:00 2001
From: Vishal Verma <vishal.l.verma@intel.com>
Date: Wed, 24 Jan 2024 12:03:49 -0800
Subject: [PATCH 495/707] mm/memory_hotplug: export
 mhp_supports_memmap_on_memory()

In preparation for adding sysfs ABI to toggle memmap_on_memory semantics
for drivers adding memory, export the mhp_supports_memmap_on_memory()
helper. This allows drivers to check if memmap_on_memory support is
available before trying to request it, and display an appropriate
message if it isn't available. As part of this, remove the size argument
to this - with recent updates to allow memmap_on_memory for larger
ranges, and the internal splitting of altmaps into respective memory
blocks, the size argument is meaningless.

Link: https://lkml.kernel.org/r/20240124-vv-dax_abi-v7-4-20d16cb8d23d@intel.com
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
Acked-by: David Hildenbrand <david@redhat.com>
Suggested-by: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Li Zhijian <lizhijian@fujitsu.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Huang Ying <ying.huang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memory_hotplug.h |  6 ++++++
 mm/memory_hotplug.c            | 17 ++++++-----------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index ee00015575aab3..70aadb2009a08c 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -137,6 +137,7 @@ struct mhp_params {
 
 bool mhp_range_allowed(u64 start, u64 size, bool need_mapping);
 struct range mhp_get_pluggable_range(bool need_mapping);
+bool mhp_supports_memmap_on_memory(void);
 
 /*
  * Zone resizing functions
@@ -278,6 +279,11 @@ static inline bool movable_node_is_enabled(void)
 	return false;
 }
 
+static bool mhp_supports_memmap_on_memory(void)
+{
+	return false;
+}
+
 static inline void pgdat_kswapd_lock(pg_data_t *pgdat) {}
 static inline void pgdat_kswapd_unlock(pg_data_t *pgdat) {}
 static inline void pgdat_kswapd_lock_init(pg_data_t *pgdat) {}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 707027f691503f..a444e2d7dd2bff 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1337,7 +1337,7 @@ static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size)
 }
 #endif
 
-static bool mhp_supports_memmap_on_memory(unsigned long size)
+bool mhp_supports_memmap_on_memory(void)
 {
 	unsigned long vmemmap_size = memory_block_memmap_size();
 	unsigned long memmap_pages = memory_block_memmap_on_memory_pages();
@@ -1346,17 +1346,11 @@ static bool mhp_supports_memmap_on_memory(unsigned long size)
 	 * Besides having arch support and the feature enabled at runtime, we
 	 * need a few more assumptions to hold true:
 	 *
-	 * a) We span a single memory block: memory onlining/offlinin;g happens
-	 *    in memory block granularity. We don't want the vmemmap of online
-	 *    memory blocks to reside on offline memory blocks. In the future,
-	 *    we might want to support variable-sized memory blocks to make the
-	 *    feature more versatile.
-	 *
-	 * b) The vmemmap pages span complete PMDs: We don't want vmemmap code
+	 * a) The vmemmap pages span complete PMDs: We don't want vmemmap code
 	 *    to populate memory from the altmap for unrelated parts (i.e.,
 	 *    other memory blocks)
 	 *
-	 * c) The vmemmap pages (and thereby the pages that will be exposed to
+	 * b) The vmemmap pages (and thereby the pages that will be exposed to
 	 *    the buddy) have to cover full pageblocks: memory onlining/offlining
 	 *    code requires applicable ranges to be page-aligned, for example, to
 	 *    set the migratetypes properly.
@@ -1368,7 +1362,7 @@ static bool mhp_supports_memmap_on_memory(unsigned long size)
 	 *       altmap as an alternative source of memory, and we do not exactly
 	 *       populate a single PMD.
 	 */
-	if (!mhp_memmap_on_memory() || size != memory_block_size_bytes())
+	if (!mhp_memmap_on_memory())
 		return false;
 
 	/*
@@ -1391,6 +1385,7 @@ static bool mhp_supports_memmap_on_memory(unsigned long size)
 
 	return arch_supports_memmap_on_memory(vmemmap_size);
 }
+EXPORT_SYMBOL_GPL(mhp_supports_memmap_on_memory);
 
 static void __ref remove_memory_blocks_and_altmaps(u64 start, u64 size)
 {
@@ -1526,7 +1521,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 	 * Self hosted memmap array
 	 */
 	if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) &&
-	    mhp_supports_memmap_on_memory(memory_block_size_bytes())) {
+	    mhp_supports_memmap_on_memory()) {
 		ret = create_altmaps_and_memory_blocks(nid, group, start, size, mhp_flags);
 		if (ret)
 			goto error;

From 7e70ae812236a784564f59320ee7e94a5e5b5caf Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Sat, 27 Jan 2024 19:13:46 -0800
Subject: [PATCH 496/707] 
 mm-memory_hotplug-export-mhp_supports_memmap_on_memory-fix

fix build

In file included from ./include/linux/mmzone.h:1425,
                 from ./include/linux/gfp.h:7,
                 from ./include/linux/slab.h:16,
                 from ./include/linux/crypto.h:17,
                 from arch/x86/kernel/asm-offsets.c:9:
./include/linux/memory_hotplug.h:282:13: warning: 'mhp_supports_memmap_on_memory' defined but not used [-Wunused-function]
  282 | static bool mhp_supports_memmap_on_memory(void)
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Cc: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memory_hotplug.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 70aadb2009a08c..7a9ff464608d70 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -279,7 +279,7 @@ static inline bool movable_node_is_enabled(void)
 	return false;
 }
 
-static bool mhp_supports_memmap_on_memory(void)
+static inline bool mhp_supports_memmap_on_memory(void)
 {
 	return false;
 }

From 6e024d502dd3631dfee6997d970ca51be60769d1 Mon Sep 17 00:00:00 2001
From: Vishal Verma <vishal.l.verma@intel.com>
Date: Wed, 24 Jan 2024 12:03:50 -0800
Subject: [PATCH 497/707] dax: add a sysfs knob to control memmap_on_memory
 behavior

Add a sysfs knob for dax devices to control the memmap_on_memory setting
if the dax device were to be hotplugged as system memory.

The default memmap_on_memory setting for dax devices originating via pmem
or hmem is set to 'false' - i.e.  no memmap_on_memory semantics, to
preserve legacy behavior.  For dax devices via CXL, the default is on.
The sysfs control allows the administrator to override the above defaults
if needed.

Link: https://lkml.kernel.org/r/20240124-vv-dax_abi-v7-5-20d16cb8d23d@intel.com
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
Tested-by: Li Zhijian <lizhijian@fujitsu.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Huang, Ying <ying.huang@intel.com>
Reviewed-by: Alison Schofield <alison.schofield@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/ABI/testing/sysfs-bus-dax | 17 ++++++++++
 drivers/dax/bus.c                       | 43 +++++++++++++++++++++++++
 2 files changed, 60 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-bus-dax b/Documentation/ABI/testing/sysfs-bus-dax
index 6359f7bc9bf430..b34266bfae49ae 100644
--- a/Documentation/ABI/testing/sysfs-bus-dax
+++ b/Documentation/ABI/testing/sysfs-bus-dax
@@ -134,3 +134,20 @@ KernelVersion:	v5.1
 Contact:	nvdimm@lists.linux.dev
 Description:
 		(RO) The id attribute indicates the region id of a dax region.
+
+What:		/sys/bus/dax/devices/daxX.Y/memmap_on_memory
+Date:		January, 2024
+KernelVersion:	v6.8
+Contact:	nvdimm@lists.linux.dev
+Description:
+		(RW) Control the memmap_on_memory setting if the dax device
+		were to be hotplugged as system memory. This determines whether
+		the 'altmap' for the hotplugged memory will be placed on the
+		device being hotplugged (memmap_on_memory=1) or if it will be
+		placed on regular memory (memmap_on_memory=0). This attribute
+		must be set before the device is handed over to the 'kmem'
+		driver (i.e.  hotplugged into system-ram). Additionally, this
+		depends on CONFIG_MHP_MEMMAP_ON_MEMORY, and a globally enabled
+		memmap_on_memory parameter for memory_hotplug. This is
+		typically set on the kernel command line -
+		memory_hotplug.memmap_on_memory set to 'true' or 'force'."
diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index 0fd948a4443e38..27c86d0ca7118d 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -1349,6 +1349,48 @@ static ssize_t numa_node_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(numa_node);
 
+static ssize_t memmap_on_memory_show(struct device *dev,
+				     struct device_attribute *attr, char *buf)
+{
+	struct dev_dax *dev_dax = to_dev_dax(dev);
+
+	return sysfs_emit(buf, "%d\n", dev_dax->memmap_on_memory);
+}
+
+static ssize_t memmap_on_memory_store(struct device *dev,
+				      struct device_attribute *attr,
+				      const char *buf, size_t len)
+{
+	struct dev_dax *dev_dax = to_dev_dax(dev);
+	bool val;
+	int rc;
+
+	rc = kstrtobool(buf, &val);
+	if (rc)
+		return rc;
+
+	if (val == true && !mhp_supports_memmap_on_memory()) {
+		dev_dbg(dev, "memmap_on_memory is not available\n");
+		return -EOPNOTSUPP;
+	}
+
+	rc = down_write_killable(&dax_dev_rwsem);
+	if (rc)
+		return rc;
+
+	if (dev_dax->memmap_on_memory != val && dev->driver &&
+	    to_dax_drv(dev->driver)->type == DAXDRV_KMEM_TYPE) {
+		up_write(&dax_dev_rwsem);
+		return -EBUSY;
+	}
+
+	dev_dax->memmap_on_memory = val;
+	up_write(&dax_dev_rwsem);
+
+	return len;
+}
+static DEVICE_ATTR_RW(memmap_on_memory);
+
 static umode_t dev_dax_visible(struct kobject *kobj, struct attribute *a, int n)
 {
 	struct device *dev = container_of(kobj, struct device, kobj);
@@ -1375,6 +1417,7 @@ static struct attribute *dev_dax_attributes[] = {
 	&dev_attr_align.attr,
 	&dev_attr_resource.attr,
 	&dev_attr_numa_node.attr,
+	&dev_attr_memmap_on_memory.attr,
 	NULL,
 };
 

From 056484c1a19b104991c5276ddfb1841c44f76d04 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 24 Jan 2024 18:12:15 +0000
Subject: [PATCH 498/707] highmem: add kernel-doc for memcpy_*_folio()

This was inadvertently skipped when adding the new functions.

Link: https://lkml.kernel.org/r/20240124181217.1761674-1-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/highmem.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 451c1dff0e873c..00341b56d2910d 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -439,6 +439,13 @@ static inline void memzero_page(struct page *page, size_t offset, size_t len)
 	kunmap_local(addr);
 }
 
+/**
+ * memcpy_from_folio - Copy a range of bytes from a folio.
+ * @to: The memory to copy to.
+ * @folio: The folio to read from.
+ * @offset: The first byte in the folio to read.
+ * @len: The number of bytes to copy.
+ */
 static inline void memcpy_from_folio(char *to, struct folio *folio,
 		size_t offset, size_t len)
 {
@@ -460,6 +467,13 @@ static inline void memcpy_from_folio(char *to, struct folio *folio,
 	} while (len > 0);
 }
 
+/**
+ * memcpy_to_folio - Copy a range of bytes to a folio.
+ * @folio: The folio to write to.
+ * @offset: The first byte in the folio to store to.
+ * @from: The memory to copy from.
+ * @len: The number of bytes to copy.
+ */
 static inline void memcpy_to_folio(struct folio *folio, size_t offset,
 		const char *from, size_t len)
 {

From bd8ef4803f72d2ace7cadcb5a21474d8caf1a079 Mon Sep 17 00:00:00 2001
From: Alexander Potapenko <glider@google.com>
Date: Wed, 24 Jan 2024 18:31:34 +0100
Subject: [PATCH 499/707] mm: kmsan: remove runtime checks from
 kmsan_unpoison_memory()

Similarly to what's been done in commit 85716a80c16d ("kmsan: allow using
__msan_instrument_asm_store() inside runtime"), it should be safe to call
kmsan_unpoison_memory() from within the runtime, as it does not allocate
memory or take locks.  Remove the redundant runtime checks.

This should fix false positives seen with CONFIG_DEBUG_LIST=y when
the non-instrumented lib/stackdepot.c failed to unpoison the memory
chunks later checked by the instrumented lib/list_debug.c

Also replace the implementation of kmsan_unpoison_entry_regs() with
a call to kmsan_unpoison_memory().

Link: https://lkml.kernel.org/r/20240124173134.1165747-1-glider@google.com
Fixes: f80be4571b19b9 ("kmsan: add KMSAN runtime core")
Signed-off-by: Alexander Potapenko <glider@google.com>
Tested-by: Marco Elver <elver@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Ilya Leoshkevich <iii@linux.ibm.com>
Cc: Nicholas Miehlbradt <nicholas@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/kmsan/hooks.c | 36 +++++++++++++-----------------------
 1 file changed, 13 insertions(+), 23 deletions(-)

diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c
index 5d6e2dee5692a3..0b09daa188ef6c 100644
--- a/mm/kmsan/hooks.c
+++ b/mm/kmsan/hooks.c
@@ -359,6 +359,12 @@ void kmsan_handle_dma_sg(struct scatterlist *sg, int nents,
 }
 
 /* Functions from kmsan-checks.h follow. */
+
+/*
+ * To create an origin, kmsan_poison_memory() unwinds the stacks and stores it
+ * into the stack depot. This may cause deadlocks if done from within KMSAN
+ * runtime, therefore we bail out if kmsan_in_runtime().
+ */
 void kmsan_poison_memory(const void *address, size_t size, gfp_t flags)
 {
 	if (!kmsan_enabled || kmsan_in_runtime())
@@ -371,47 +377,31 @@ void kmsan_poison_memory(const void *address, size_t size, gfp_t flags)
 }
 EXPORT_SYMBOL(kmsan_poison_memory);
 
+/*
+ * Unlike kmsan_poison_memory(), this function can be used from within KMSAN
+ * runtime, because it does not trigger allocations or call instrumented code.
+ */
 void kmsan_unpoison_memory(const void *address, size_t size)
 {
 	unsigned long ua_flags;
 
-	if (!kmsan_enabled || kmsan_in_runtime())
+	if (!kmsan_enabled)
 		return;
 
 	ua_flags = user_access_save();
-	kmsan_enter_runtime();
 	/* The users may want to poison/unpoison random memory. */
 	kmsan_internal_unpoison_memory((void *)address, size,
 				       KMSAN_POISON_NOCHECK);
-	kmsan_leave_runtime();
 	user_access_restore(ua_flags);
 }
 EXPORT_SYMBOL(kmsan_unpoison_memory);
 
 /*
- * Version of kmsan_unpoison_memory() that can be called from within the KMSAN
- * runtime.
- *
- * Non-instrumented IRQ entry functions receive struct pt_regs from assembly
- * code. Those regs need to be unpoisoned, otherwise using them will result in
- * false positives.
- * Using kmsan_unpoison_memory() is not an option in entry code, because the
- * return value of in_task() is inconsistent - as a result, certain calls to
- * kmsan_unpoison_memory() are ignored. kmsan_unpoison_entry_regs() ensures that
- * the registers are unpoisoned even if kmsan_in_runtime() is true in the early
- * entry code.
+ * Version of kmsan_unpoison_memory() called from IRQ entry functions.
  */
 void kmsan_unpoison_entry_regs(const struct pt_regs *regs)
 {
-	unsigned long ua_flags;
-
-	if (!kmsan_enabled)
-		return;
-
-	ua_flags = user_access_save();
-	kmsan_internal_unpoison_memory((void *)regs, sizeof(*regs),
-				       KMSAN_POISON_NOCHECK);
-	user_access_restore(ua_flags);
+	kmsan_unpoison_memory((void *)regs, sizeof(*regs));
 }
 
 void kmsan_check_memory(const void *addr, size_t size)

From 02ebb4652e3749bb4b139aaaebf593ca836d5e71 Mon Sep 17 00:00:00 2001
From: Zi Yan <ziy@nvidia.com>
Date: Mon, 22 Jan 2024 22:46:33 -0500
Subject: [PATCH 500/707] mm/compaction: enable compacting >0 order folios.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "Enable >0 order folio memory compaction", v2.

This patchset enables >0 order folio memory compaction, which is one of
the prerequisitions for large folio support[1].

I am aware that split free pages is necessary for folio migration in
compaction, since if >0 order free pages are never split and no order-0
free page is scanned, compaction will end prematurely due to migration
returns -ENOMEM.  Free page split becomes a must instead of an
optimization.

Some applications from vm-scalability show different performance trends on
default LRU and CONFIG_LRU_GEN from patch 1 (split folio during
compaction), to patch 2 (folio migration during compaction), to patch 3
(folio migration during compaction with free page split).  I am looking
into it.

lkp ncompare results (with >5% delta) for default LRU and CONFIG_LRU_GEN
are shown at the bottom (on a 8-CPU (Intel Xeon E5-2650 v4 @ 2.20GHz) 16G
VM).

Overview
========

To support >0 order folio compaction, the patchset changes how free pages
used for migration are kept during compaction.  Free pages used to be
split into order-0 pages that are post allocation processed (i.e.,
PageBuddy flag cleared, page order stored in page->private is zeroed, and
page reference is set to 1).  Now all free pages are kept in a MAX_ORDER+1
array of page lists based on their order without post allocation process.
When migrate_pages() asks for a new page, one of the free pages, based on
the requested page order, is then processed and given out.


[1] https://lore.kernel.org/linux-mm/20230912162815.440749-1-zi.yan@sent.com/
[2] https://lore.kernel.org/linux-mm/20231113170157.280181-1-zi.yan@sent.com/

vm-scalability results on CONFIG_LRU_GEN
===

=========================================================================================
compiler/kconfig/rootfs/runtime/tbox_group/test/testcase:
  gcc-13/defconfig/debian/300s/qemu-vm/small-allocs/vm-scalability

commit:
  6.7.0-rc4+
  6.7.0-rc4-split-folio-in-compaction+
  6.7.0-rc4-folio-migration-in-compaction+
  6.7.0-rc4-folio-migration-free-page-split+

      6.7.0-rc4+ 6.7.0-rc4-split-folio-in-co 6.7.0-rc4-folio-migration-i 6.7.0-rc4-folio-migration-f
---------------- --------------------------- --------------------------- ---------------------------
         %stddev     %change         %stddev     %change         %stddev     %change         %stddev
             \          |                \          |                \          |                \
   2024326           +35.5%    2743772 ± 41%    +364.0%    9392198 ± 35%     +31.0%    2651634        vm-scalability.throughput

=========================================================================================
compiler/kconfig/rootfs/runtime/tbox_group/test/testcase:
  gcc-13/defconfig/debian/300s/qemu-vm/small-allocs-mt/vm-scalability

commit:
  6.7.0-rc4+
  6.7.0-rc4-split-folio-in-compaction+
  6.7.0-rc4-folio-migration-in-compaction+
  6.7.0-rc4-folio-migration-free-page-split+

      6.7.0-rc4+ 6.7.0-rc4-split-folio-in-co 6.7.0-rc4-folio-migration-i 6.7.0-rc4-folio-migration-f
---------------- --------------------------- --------------------------- ---------------------------
         %stddev     %change         %stddev     %change         %stddev     %change         %stddev
             \          |                \          |                \          |                \
   1450189            +0.9%    1463418           +30.4%    1891610 ± 22%      +0.3%    1454100        vm-scalability.throughput

=========================================================================================
compiler/kconfig/rootfs/runtime/tbox_group/test/testcase:
  gcc-13/defconfig/debian/300s/qemu-vm/mmap-xread-seq-mt/vm-scalability

commit:
  6.7.0-rc4+
  6.7.0-rc4-split-folio-in-compaction+
  6.7.0-rc4-folio-migration-in-compaction+
  6.7.0-rc4-folio-migration-free-page-split+

      6.7.0-rc4+ 6.7.0-rc4-split-folio-in-co 6.7.0-rc4-folio-migration-i 6.7.0-rc4-folio-migration-f
---------------- --------------------------- --------------------------- ---------------------------
         %stddev     %change         %stddev     %change         %stddev     %change         %stddev
             \          |                \          |                \          |                \
  14428848 ± 27%     -51.7%    6963308 ± 73%     +13.5%   16372621           +11.2%   16046511        vm-scalability.throughput

=========================================================================================
compiler/kconfig/rootfs/runtime/tbox_group/test/testcase:
  gcc-13/defconfig/debian/300s/qemu-vm/mmap-pread-seq/vm-scalability

commit:
  6.7.0-rc4+
  6.7.0-rc4-split-folio-in-compaction+
  6.7.0-rc4-folio-migration-in-compaction+
  6.7.0-rc4-folio-migration-free-page-split+

      6.7.0-rc4+ 6.7.0-rc4-split-folio-in-co 6.7.0-rc4-folio-migration-i 6.7.0-rc4-folio-migration-f
---------------- --------------------------- --------------------------- ---------------------------
         %stddev     %change         %stddev     %change         %stddev     %change         %stddev
             \          |                \          |                \          |                \
  13569502 ± 24%     -45.9%    7340064 ± 59%     +12.3%   15240531           +10.4%   14983705        vm-scalability.throughput

=========================================================================================
compiler/kconfig/rootfs/runtime/tbox_group/test/testcase:
  gcc-13/defconfig/debian/300s/qemu-vm/mmap-pread-seq-mt/vm-scalability

commit:
  6.7.0-rc4+
  6.7.0-rc4-split-folio-in-compaction+
  6.7.0-rc4-folio-migration-in-compaction+
  6.7.0-rc4-folio-migration-free-page-split+

      6.7.0-rc4+ 6.7.0-rc4-split-folio-in-co 6.7.0-rc4-folio-migration-i 6.7.0-rc4-folio-migration-f
---------------- --------------------------- --------------------------- ---------------------------
         %stddev     %change         %stddev     %change         %stddev     %change         %stddev
             \          |                \          |                \          |                \
  13305823 ± 24%     -45.1%    7299664 ± 56%     +12.5%   14974725           +10.4%   14695963        vm-scalability.throughput

=========================================================================================
compiler/kconfig/rootfs/runtime/tbox_group/test/testcase:
  gcc-13/defconfig/debian/300s/qemu-vm/lru-file-readtwice/vm-scalability

commit:
  6.7.0-rc4+
  6.7.0-rc4-split-folio-in-compaction+
  6.7.0-rc4-folio-migration-in-compaction+
  6.7.0-rc4-folio-migration-free-page-split+

      6.7.0-rc4+ 6.7.0-rc4-split-folio-in-co 6.7.0-rc4-folio-migration-i 6.7.0-rc4-folio-migration-f
---------------- --------------------------- --------------------------- ---------------------------
         %stddev     %change         %stddev     %change         %stddev     %change         %stddev
             \          |                \          |                \          |                \
  13244376 ± 28%     +54.2%   20425838 ± 23%      -4.4%   12660113 ±  3%      -9.0%   12045809 ±  3%  vm-scalability.throughput

=========================================================================================
compiler/kconfig/rootfs/runtime/tbox_group/test/testcase:
  gcc-13/defconfig/debian/300s/qemu-vm/lru-file-mmap-read/vm-scalability

commit:
  6.7.0-rc4+
  6.7.0-rc4-split-folio-in-compaction+
  6.7.0-rc4-folio-migration-in-compaction+
  6.7.0-rc4-folio-migration-free-page-split+

      6.7.0-rc4+ 6.7.0-rc4-split-folio-in-co 6.7.0-rc4-folio-migration-i 6.7.0-rc4-folio-migration-f
---------------- --------------------------- --------------------------- ---------------------------
         %stddev     %change         %stddev     %change         %stddev     %change         %stddev
             \          |                \          |                \          |                \
   7021425 ± 11%     -20.9%    5556751 ± 19%     +14.8%    8057811 ±  3%      +9.4%    7678613 ±  4%  vm-scalability.throughput

=========================================================================================
compiler/kconfig/rootfs/runtime/size/tbox_group/test/testcase:
  gcc-13/defconfig/debian/300s/256G/qemu-vm/msync/vm-scalability

commit:
  6.7.0-rc4+
  6.7.0-rc4-split-folio-in-compaction+
  6.7.0-rc4-folio-migration-in-compaction+
  6.7.0-rc4-folio-migration-free-page-split+

      6.7.0-rc4+ 6.7.0-rc4-split-folio-in-co 6.7.0-rc4-folio-migration-i 6.7.0-rc4-folio-migration-f
---------------- --------------------------- --------------------------- ---------------------------
         %stddev     %change         %stddev     %change         %stddev     %change         %stddev
             \          |                \          |                \          |                \
   1208994 ±137%    +263.5%    4394683 ± 49%     -49.4%     611204 ±  6%     -48.1%     627937 ± 13%  vm-scalability.throughput


vm-scalability results on default LRU (with -no-mglru suffix)
===

=========================================================================================
compiler/kconfig/rootfs/runtime/tbox_group/test/testcase:
  gcc-13/defconfig/debian/300s/qemu-vm/lru-file-readtwice/vm-scalability

commit:
  6.7.0-rc4-no-mglru+
  6.7.0-rc4-split-folio-in-compaction-no-mglru+
  6.7.0-rc4-folio-migration-in-compaction-no-mglru+
  6.7.0-rc4-folio-migration-free-page-split-no-mglru+

6.7.0-rc4-no-mgl 6.7.0-rc4-split-folio-in-co 6.7.0-rc4-folio-migration-i 6.7.0-rc4-folio-migration-f
---------------- --------------------------- --------------------------- ---------------------------
         %stddev     %change         %stddev     %change         %stddev     %change         %stddev
             \          |                \          |                \          |                \
   8412072 ±  3%     +32.1%   11114537 ± 41%      +3.5%    8703491 ±  3%      +1.5%    8536343 ±  3%  vm-scalability.throughput

=========================================================================================
compiler/kconfig/rootfs/runtime/tbox_group/test/testcase:
  gcc-13/defconfig/debian/300s/qemu-vm/lru-file-mmap-read/vm-scalability

commit:
  6.7.0-rc4-no-mglru+
  6.7.0-rc4-split-folio-in-compaction-no-mglru+
  6.7.0-rc4-folio-migration-in-compaction-no-mglru+
  6.7.0-rc4-folio-migration-free-page-split-no-mglru+

6.7.0-rc4-no-mgl 6.7.0-rc4-split-folio-in-co 6.7.0-rc4-folio-migration-i 6.7.0-rc4-folio-migration-f
---------------- --------------------------- --------------------------- ---------------------------
         %stddev     %change         %stddev     %change         %stddev     %change         %stddev
             \          |                \          |                \          |                \
   7095358           +10.8%    7863635 ± 16%      +5.5%    7484110            +1.5%    7200666 ±  4%  vm-scalability.throughput


This patch (of 3):

migrate_pages() supports >0 order folio migration and during compaction,
even if compaction_alloc() cannot provide >0 order free pages,
migrate_pages() can split the source page and try to migrate the base
pages from the split.  It can be a baseline and start point for adding
support for compacting >0 order folios.

Link: https://lkml.kernel.org/r/20240123034636.1095672-1-zi.yan@sent.com
Link: https://lkml.kernel.org/r/20240123034636.1095672-2-zi.yan@sent.com
Signed-off-by: Zi Yan <ziy@nvidia.com>
Suggested-by: Huang Ying <ying.huang@intel.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Adam Manzanares <a.manzanares@samsung.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/compaction.c | 43 +++++++++++++++++++++++++++++++++++--------
 1 file changed, 35 insertions(+), 8 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index 4add68d40e8d99..e43e898d2c77f9 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -816,6 +816,21 @@ static bool too_many_isolated(struct compact_control *cc)
 	return too_many;
 }
 
+/*
+ * 1. if the page order is larger than or equal to target_order (i.e.,
+ * cc->order and when it is not -1 for global compaction), skip it since
+ * target_order already indicates no free page with larger than target_order
+ * exists and later migrating it will most likely fail;
+ *
+ * 2. compacting > pageblock_order pages does not improve memory fragmentation,
+ * skip them;
+ */
+static bool skip_isolation_on_order(int order, int target_order)
+{
+	return (target_order != -1 && order >= target_order) ||
+		order >= pageblock_order;
+}
+
 /**
  * isolate_migratepages_block() - isolate all migrate-able pages within
  *				  a single pageblock
@@ -1010,7 +1025,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 		/*
 		 * Regardless of being on LRU, compound pages such as THP and
 		 * hugetlbfs are not to be compacted unless we are attempting
-		 * an allocation much larger than the huge page size (eg CMA).
+		 * an allocation larger than the compound page size.
 		 * We can potentially save a lot of iterations if we skip them
 		 * at once. The check is racy, but we can consider only valid
 		 * values and the only danger is skipping too much.
@@ -1018,11 +1033,18 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 		if (PageCompound(page) && !cc->alloc_contig) {
 			const unsigned int order = compound_order(page);
 
-			if (likely(order <= MAX_PAGE_ORDER)) {
-				low_pfn += (1UL << order) - 1;
-				nr_scanned += (1UL << order) - 1;
+			/*
+			 * Skip based on page order and compaction target order
+			 * and skip hugetlbfs pages.
+			 */
+			if (skip_isolation_on_order(order, cc->order) ||
+			    PageHuge(page)) {
+				if (order <= MAX_PAGE_ORDER) {
+					low_pfn += (1UL << order) - 1;
+					nr_scanned += (1UL << order) - 1;
+				}
+				goto isolate_fail;
 			}
-			goto isolate_fail;
 		}
 
 		/*
@@ -1165,10 +1187,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 			}
 
 			/*
-			 * folio become large since the non-locked check,
-			 * and it's on LRU.
+			 * Check LRU folio order under the lock
 			 */
-			if (unlikely(folio_test_large(folio) && !cc->alloc_contig)) {
+			if (unlikely(skip_isolation_on_order(folio_order(folio),
+							     cc->order) &&
+				     !cc->alloc_contig)) {
 				low_pfn += folio_nr_pages(folio) - 1;
 				nr_scanned += folio_nr_pages(folio) - 1;
 				folio_set_lru(folio);
@@ -1786,6 +1809,10 @@ static struct folio *compaction_alloc(struct folio *src, unsigned long data)
 	struct compact_control *cc = (struct compact_control *)data;
 	struct folio *dst;
 
+	/* this makes migrate_pages() split the source page and retry */
+	if (folio_test_large(src) > 0)
+		return NULL;
+
 	if (list_empty(&cc->freepages)) {
 		isolate_freepages(cc);
 

From b6feb42e379bdb8da2567fdf3bb9fcb3cf84fe10 Mon Sep 17 00:00:00 2001
From: Zi Yan <ziy@nvidia.com>
Date: Mon, 22 Jan 2024 22:46:34 -0500
Subject: [PATCH 501/707] mm/compaction: add support for >0 order folio memory
 compaction.

Before last commit, memory compaction only migrates order-0 folios and
skips >0 order folios.  Last commit splits all >0 order folios during
compaction.  This commit migrates >0 order folios during compaction by
keeping isolated free pages at their original size without splitting them
into order-0 pages and using them directly during migration process.

What is different from the prior implementation:
1. All isolated free pages are kept in a NR_PAGE_ORDERS array of page
   lists, where each page list stores free pages in the same order.
2. All free pages are not post_alloc_hook() processed nor buddy pages,
   although their orders are stored in first page's private like buddy
   pages.
3. During migration, in new page allocation time (i.e., in
   compaction_alloc()), free pages are then processed by post_alloc_hook().
   When migration fails and a new page is returned (i.e., in
   compaction_free()), free pages are restored by reversing the
   post_alloc_hook() operations using newly added
   free_pages_prepare_fpi_none().

Step 3 is done for a latter optimization that splitting and/or merging
free pages during compaction becomes easier.

Note: without splitting free pages, compaction can end prematurely due to
migration will return -ENOMEM even if there is free pages.  This happens
when no order-0 free page exist and compaction_alloc() return NULL.

Link: https://lkml.kernel.org/r/20240123034636.1095672-3-zi.yan@sent.com
Signed-off-by: Zi Yan <ziy@nvidia.com>
Cc: Adam Manzanares <a.manzanares@samsung.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/compaction.c | 148 +++++++++++++++++++++++++++++-------------------
 mm/internal.h   |   9 ++-
 mm/page_alloc.c |   6 ++
 3 files changed, 103 insertions(+), 60 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index e43e898d2c77f9..7465a24288c165 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -66,45 +66,67 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
 #define COMPACTION_HPAGE_ORDER	(PMD_SHIFT - PAGE_SHIFT)
 #endif
 
-static unsigned long release_freepages(struct list_head *freelist)
+static void init_page_list(struct page_list *p)
 {
-	struct page *page, *next;
-	unsigned long high_pfn = 0;
-
-	list_for_each_entry_safe(page, next, freelist, lru) {
-		unsigned long pfn = page_to_pfn(page);
-		list_del(&page->lru);
-		__free_page(page);
-		if (pfn > high_pfn)
-			high_pfn = pfn;
-	}
-
-	return high_pfn;
+	INIT_LIST_HEAD(&p->pages);
+	p->nr_pages = 0;
 }
 
-static void split_map_pages(struct list_head *list)
+static void split_map_pages(struct page_list *freepages)
 {
-	unsigned int i, order, nr_pages;
+	unsigned int i, order, total_nr_pages;
 	struct page *page, *next;
 	LIST_HEAD(tmp_list);
 
-	list_for_each_entry_safe(page, next, list, lru) {
-		list_del(&page->lru);
+	for (order = 0; order < NR_PAGE_ORDERS; order++) {
+		total_nr_pages = freepages[order].nr_pages * (1 << order);
+		freepages[order].nr_pages = 0;
+
+		list_for_each_entry_safe(page, next, &freepages[order].pages, lru) {
+			unsigned int nr_pages;
+
+			list_del(&page->lru);
 
-		order = page_private(page);
-		nr_pages = 1 << order;
+			nr_pages = 1 << order;
 
-		post_alloc_hook(page, order, __GFP_MOVABLE);
-		if (order)
-			split_page(page, order);
+			post_alloc_hook(page, order, __GFP_MOVABLE);
+			if (order)
+				split_page(page, order);
 
-		for (i = 0; i < nr_pages; i++) {
-			list_add(&page->lru, &tmp_list);
-			page++;
+			for (i = 0; i < nr_pages; i++) {
+				list_add(&page->lru, &tmp_list);
+				page++;
+			}
 		}
+		freepages[0].nr_pages += total_nr_pages;
+		list_splice_init(&tmp_list, &freepages[0].pages);
 	}
+}
 
-	list_splice(&tmp_list, list);
+static unsigned long release_free_list(struct page_list *freepages)
+{
+	int order;
+	unsigned long high_pfn = 0;
+
+	for (order = 0; order < NR_PAGE_ORDERS; order++) {
+		struct page *page, *next;
+
+		list_for_each_entry_safe(page, next, &freepages[order].pages, lru) {
+			unsigned long pfn = page_to_pfn(page);
+
+			list_del(&page->lru);
+			/*
+			 * Convert free pages into post allocation pages, so
+			 * that we can free them via __free_page.
+			 */
+			post_alloc_hook(page, order, __GFP_MOVABLE);
+			__free_pages(page, order);
+			if (pfn > high_pfn)
+				high_pfn = pfn;
+		}
+		freepages[order].nr_pages = 0;
+	}
+	return high_pfn;
 }
 
 #ifdef CONFIG_COMPACTION
@@ -583,7 +605,7 @@ static bool compact_unlock_should_abort(spinlock_t *lock,
 static unsigned long isolate_freepages_block(struct compact_control *cc,
 				unsigned long *start_pfn,
 				unsigned long end_pfn,
-				struct list_head *freelist,
+				struct page_list *freelist,
 				unsigned int stride,
 				bool strict)
 {
@@ -657,7 +679,8 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 		nr_scanned += isolated - 1;
 		total_isolated += isolated;
 		cc->nr_freepages += isolated;
-		list_add_tail(&page->lru, freelist);
+		list_add_tail(&page->lru, &freelist[order].pages);
+		freelist[order].nr_pages++;
 
 		if (!strict && cc->nr_migratepages <= cc->nr_freepages) {
 			blockpfn += isolated;
@@ -722,7 +745,11 @@ isolate_freepages_range(struct compact_control *cc,
 			unsigned long start_pfn, unsigned long end_pfn)
 {
 	unsigned long isolated, pfn, block_start_pfn, block_end_pfn;
-	LIST_HEAD(freelist);
+	int order;
+	struct page_list tmp_freepages[NR_PAGE_ORDERS];
+
+	for (order = 0; order < NR_PAGE_ORDERS; order++)
+		init_page_list(&tmp_freepages[order]);
 
 	pfn = start_pfn;
 	block_start_pfn = pageblock_start_pfn(pfn);
@@ -753,7 +780,7 @@ isolate_freepages_range(struct compact_control *cc,
 			break;
 
 		isolated = isolate_freepages_block(cc, &isolate_start_pfn,
-					block_end_pfn, &freelist, 0, true);
+					block_end_pfn, tmp_freepages, 0, true);
 
 		/*
 		 * In strict mode, isolate_freepages_block() returns 0 if
@@ -770,15 +797,15 @@ isolate_freepages_range(struct compact_control *cc,
 		 */
 	}
 
-	/* __isolate_free_page() does not map the pages */
-	split_map_pages(&freelist);
-
 	if (pfn < end_pfn) {
 		/* Loop terminated early, cleanup. */
-		release_freepages(&freelist);
+		release_free_list(tmp_freepages);
 		return 0;
 	}
 
+	/* __isolate_free_page() does not map the pages */
+	split_map_pages(tmp_freepages);
+
 	/* We don't use freelists for anything. */
 	return pfn;
 }
@@ -1481,7 +1508,7 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn)
 	if (!page)
 		return;
 
-	isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false);
+	isolate_freepages_block(cc, &start_pfn, end_pfn, cc->freepages, 1, false);
 
 	/* Skip this pageblock in the future as it's full or nearly full */
 	if (start_pfn == end_pfn && !cc->no_set_skip_hint)
@@ -1610,7 +1637,7 @@ static void fast_isolate_freepages(struct compact_control *cc)
 				nr_scanned += nr_isolated - 1;
 				total_isolated += nr_isolated;
 				cc->nr_freepages += nr_isolated;
-				list_add_tail(&page->lru, &cc->freepages);
+				list_add_tail(&page->lru, &cc->freepages[order].pages);
 				count_compact_events(COMPACTISOLATED, nr_isolated);
 			} else {
 				/* If isolation fails, abort the search */
@@ -1687,13 +1714,12 @@ static void isolate_freepages(struct compact_control *cc)
 	unsigned long isolate_start_pfn; /* exact pfn we start at */
 	unsigned long block_end_pfn;	/* end of current pageblock */
 	unsigned long low_pfn;	     /* lowest pfn scanner is able to scan */
-	struct list_head *freelist = &cc->freepages;
 	unsigned int stride;
 
 	/* Try a small search of the free lists for a candidate */
 	fast_isolate_freepages(cc);
 	if (cc->nr_freepages)
-		goto splitmap;
+		return;
 
 	/*
 	 * Initialise the free scanner. The starting point is where we last
@@ -1753,7 +1779,7 @@ static void isolate_freepages(struct compact_control *cc)
 
 		/* Found a block suitable for isolating free pages from. */
 		nr_isolated = isolate_freepages_block(cc, &isolate_start_pfn,
-					block_end_pfn, freelist, stride, false);
+					block_end_pfn, cc->freepages, stride, false);
 
 		/* Update the skip hint if the full pageblock was scanned */
 		if (isolate_start_pfn == block_end_pfn)
@@ -1794,10 +1820,6 @@ static void isolate_freepages(struct compact_control *cc)
 	 * and the loop terminated due to isolate_start_pfn < low_pfn
 	 */
 	cc->free_pfn = isolate_start_pfn;
-
-splitmap:
-	/* __isolate_free_page() does not map the pages */
-	split_map_pages(freelist);
 }
 
 /*
@@ -1808,23 +1830,22 @@ static struct folio *compaction_alloc(struct folio *src, unsigned long data)
 {
 	struct compact_control *cc = (struct compact_control *)data;
 	struct folio *dst;
+	int order = folio_order(src);
 
-	/* this makes migrate_pages() split the source page and retry */
-	if (folio_test_large(src) > 0)
-		return NULL;
-
-	if (list_empty(&cc->freepages)) {
+	if (!cc->freepages[order].nr_pages) {
 		isolate_freepages(cc);
-
-		if (list_empty(&cc->freepages))
+		if (!cc->freepages[order].nr_pages)
 			return NULL;
 	}
 
-	dst = list_entry(cc->freepages.next, struct folio, lru);
+	dst = list_first_entry(&cc->freepages[order].pages, struct folio, lru);
+	cc->freepages[order].nr_pages--;
 	list_del(&dst->lru);
-	cc->nr_freepages--;
-
-	return dst;
+	post_alloc_hook(&dst->page, order, __GFP_MOVABLE);
+	if (order)
+		prep_compound_page(&dst->page, order);
+	cc->nr_freepages -= 1 << order;
+	return page_rmappable_folio(&dst->page);
 }
 
 /*
@@ -1835,9 +1856,17 @@ static struct folio *compaction_alloc(struct folio *src, unsigned long data)
 static void compaction_free(struct folio *dst, unsigned long data)
 {
 	struct compact_control *cc = (struct compact_control *)data;
+	int order = folio_order(dst);
+	struct page *page = &dst->page;
+
+	folio_set_count(dst, 0);
+	free_pages_prepare_fpi_none(page, order);
 
-	list_add(&dst->lru, &cc->freepages);
-	cc->nr_freepages++;
+	INIT_LIST_HEAD(&dst->lru);
+
+	list_add(&dst->lru, &cc->freepages[order].pages);
+	cc->freepages[order].nr_pages++;
+	cc->nr_freepages += 1 << order;
 }
 
 /* possible outcome of isolate_migratepages */
@@ -2461,6 +2490,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
 	const bool sync = cc->mode != MIGRATE_ASYNC;
 	bool update_cached;
 	unsigned int nr_succeeded = 0;
+	int order;
 
 	/*
 	 * These counters track activities during zone compaction.  Initialize
@@ -2470,7 +2500,8 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
 	cc->total_free_scanned = 0;
 	cc->nr_migratepages = 0;
 	cc->nr_freepages = 0;
-	INIT_LIST_HEAD(&cc->freepages);
+	for (order = 0; order < NR_PAGE_ORDERS; order++)
+		init_page_list(&cc->freepages[order]);
 	INIT_LIST_HEAD(&cc->migratepages);
 
 	cc->migratetype = gfp_migratetype(cc->gfp_mask);
@@ -2656,7 +2687,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
 	 * so we don't leave any returned pages behind in the next attempt.
 	 */
 	if (cc->nr_freepages > 0) {
-		unsigned long free_pfn = release_freepages(&cc->freepages);
+		unsigned long free_pfn = release_free_list(cc->freepages);
 
 		cc->nr_freepages = 0;
 		VM_BUG_ON(free_pfn == 0);
@@ -2675,7 +2706,6 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
 
 	trace_mm_compaction_end(cc, start_pfn, end_pfn, sync, ret);
 
-	VM_BUG_ON(!list_empty(&cc->freepages));
 	VM_BUG_ON(!list_empty(&cc->migratepages));
 
 	return ret;
diff --git a/mm/internal.h b/mm/internal.h
index 1e29c5821a1dde..c6ea449c5353ce 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -447,6 +447,8 @@ extern void prep_compound_page(struct page *page, unsigned int order);
 
 extern void post_alloc_hook(struct page *page, unsigned int order,
 					gfp_t gfp_flags);
+extern bool free_pages_prepare_fpi_none(struct page *page, unsigned int order);
+
 extern int user_min_free_kbytes;
 
 extern void free_unref_page(struct page *page, unsigned int order);
@@ -473,6 +475,11 @@ int split_free_page(struct page *free_page,
 /*
  * in mm/compaction.c
  */
+
+struct page_list {
+	struct list_head	pages;
+	unsigned long		nr_pages;
+};
 /*
  * compact_control is used to track pages being migrated and the free pages
  * they are being migrated to during memory compaction. The free_pfn starts
@@ -481,7 +488,7 @@ int split_free_page(struct page *free_page,
  * completes when free_pfn <= migrate_pfn
  */
 struct compact_control {
-	struct list_head freepages;	/* List of free pages to migrate to */
+	struct page_list freepages[NR_PAGE_ORDERS];	/* List of free pages to migrate to */
 	struct list_head migratepages;	/* List of pages being migrated */
 	unsigned int nr_freepages;	/* Number of isolated free pages */
 	unsigned int nr_migratepages;	/* Number of pages to migrate */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 150d4f23b01048..aeda1153dad92b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1179,6 +1179,12 @@ static __always_inline bool free_pages_prepare(struct page *page,
 	return true;
 }
 
+__always_inline bool free_pages_prepare_fpi_none(struct page *page,
+			unsigned int order)
+{
+	return free_pages_prepare(page, order, FPI_NONE);
+}
+
 /*
  * Frees a number of pages from the PCP lists
  * Assumes all pages on list are in same zone.

From f2ab40c4bc9c63f4feb9af6ec5ce534cfb6055d3 Mon Sep 17 00:00:00 2001
From: Zi Yan <ziy@nvidia.com>
Date: Mon, 22 Jan 2024 22:46:35 -0500
Subject: [PATCH 502/707] mm/compaction: optimize >0 order folio compaction
 with free page split.

During migration in a memory compaction, free pages are placed in an array
of page lists based on their order.  But the desired free page order
(i.e., the order of a source page) might not be always present, thus
leading to migration failures and premature compaction termination.  Split
a high order free pages when source migration page has a lower order to
increase migration successful rate.

Note: merging free pages when a migration fails and a lower order free
page is returned via compaction_free() is possible, but there is too much
work.  Since the free pages are not buddy pages, it is hard to identify
these free pages using existing PFN-based page merging algorithm.

Link: https://lkml.kernel.org/r/20240123034636.1095672-4-zi.yan@sent.com
Signed-off-by: Zi Yan <ziy@nvidia.com>
Cc: Adam Manzanares <a.manzanares@samsung.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/compaction.c | 37 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index 7465a24288c165..335a6f6787e4e3 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1831,9 +1831,43 @@ static struct folio *compaction_alloc(struct folio *src, unsigned long data)
 	struct compact_control *cc = (struct compact_control *)data;
 	struct folio *dst;
 	int order = folio_order(src);
+	bool has_isolated_pages = false;
 
+again:
 	if (!cc->freepages[order].nr_pages) {
-		isolate_freepages(cc);
+		int i;
+
+		for (i = order + 1; i < NR_PAGE_ORDERS; i++) {
+			if (cc->freepages[i].nr_pages) {
+				struct page *freepage =
+					list_first_entry(&cc->freepages[i].pages,
+							 struct page, lru);
+
+				int start_order = i;
+				unsigned long size = 1 << start_order;
+
+				list_del(&freepage->lru);
+				cc->freepages[i].nr_pages--;
+
+				while (start_order > order) {
+					start_order--;
+					size >>= 1;
+
+					list_add(&freepage[size].lru,
+						&cc->freepages[start_order].pages);
+					cc->freepages[start_order].nr_pages++;
+					set_page_private(&freepage[size], start_order);
+				}
+				dst = (struct folio *)freepage;
+				goto done;
+			}
+		}
+		if (!has_isolated_pages) {
+			isolate_freepages(cc);
+			has_isolated_pages = true;
+			goto again;
+		}
+
 		if (!cc->freepages[order].nr_pages)
 			return NULL;
 	}
@@ -1841,6 +1875,7 @@ static struct folio *compaction_alloc(struct folio *src, unsigned long data)
 	dst = list_first_entry(&cc->freepages[order].pages, struct folio, lru);
 	cc->freepages[order].nr_pages--;
 	list_del(&dst->lru);
+done:
 	post_alloc_hook(&dst->page, order, __GFP_MOVABLE);
 	if (order)
 		prep_compound_page(&dst->page, order);

From 0e20ae3975d9489c5e4a156ef19a0faee288c325 Mon Sep 17 00:00:00 2001
From: "T.J. Mercier" <tjmercier@google.com>
Date: Fri, 26 Jan 2024 21:19:25 +0000
Subject: [PATCH 503/707] mm: memcg: don't periodically flush stats when memcg
 is disabled

The root memcg is onlined even when memcg is disabled.  When it's onlined
a 2 second periodic stat flush is started, but no stat flushing is
required when memcg is disabled because there can be no child memcgs.
Most calls to flush memcg stats are avoided when memcg is disabled as a
result of the mem_cgroup_disabled check added in 7d7ef0a4686a ("mm: memcg:
restore subtree stats flushing"), but the periodic flushing started in
mem_cgroup_css_online is not.  Skip it.

Link: https://lkml.kernel.org/r/20240126211927.1171338-1-tjmercier@google.com
Fixes: aa48e47e3906 ("memcg: infrastructure to flush memcg stats")
Signed-off-by: T.J. Mercier <tjmercier@google.com>
Acked-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Chris Li <chrisl@kernel.org>
Reported-by: Minchan Kim <minchan@google.com>
Reviewed-by: Yosry Ahmed <yosryahmed@google.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/memcontrol.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 391ecdc5af68a0..eb8684269906e9 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5622,7 +5622,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
 	if (alloc_shrinker_info(memcg))
 		goto offline_kmem;
 
-	if (unlikely(mem_cgroup_is_root(memcg)))
+	if (unlikely(mem_cgroup_is_root(memcg)) && !mem_cgroup_disabled())
 		queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
 				   FLUSH_TIME);
 	lru_gen_online_memcg(memcg);

From 8f12806c067b2d57bb3ed6ce0037dba110118ce0 Mon Sep 17 00:00:00 2001
From: Levi Yun <ppbuk5246@gmail.com>
Date: Fri, 26 Jan 2024 15:25:54 +0000
Subject: [PATCH 504/707] kswapd: replace try_to_freeze() with
 kthread_freezable_should_stop()

Instead of using try_to_freeze, use kthread_freezable_should_stop in
kswapd.  By this, we can avoid unnecessary freezing when kswapd should
stop.

Link: https://lkml.kernel.org/r/20240126152556.58791-1-ppbuk5246@gmail.com
Signed-off-by: Levi Yun <ppbuk5246@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/vmscan.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4f9c854ce6cc66..1f139830b26f6c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -6796,6 +6796,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
 		bool raise_priority = true;
 		bool balanced;
 		bool ret;
+		bool was_frozen;
 
 		sc.reclaim_idx = highest_zoneidx;
 
@@ -6894,9 +6895,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
 
 		/* Check if kswapd should be suspending */
 		__fs_reclaim_release(_THIS_IP_);
-		ret = try_to_freeze();
+		ret = kthread_freezable_should_stop(&was_frozen);
 		__fs_reclaim_acquire(_THIS_IP_);
-		if (ret || kthread_should_stop())
+		if (was_frozen || ret)
 			break;
 
 		/*
@@ -7102,7 +7103,7 @@ static int kswapd(void *p)
 	WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
 	atomic_set(&pgdat->nr_writeback_throttled, 0);
 	for ( ; ; ) {
-		bool ret;
+		bool was_frozen;
 
 		alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
 		highest_zoneidx = kswapd_highest_zoneidx(pgdat,
@@ -7119,15 +7120,14 @@ static int kswapd(void *p)
 		WRITE_ONCE(pgdat->kswapd_order, 0);
 		WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
 
-		ret = try_to_freeze();
-		if (kthread_should_stop())
+		if (kthread_freezable_should_stop(&was_frozen))
 			break;
 
 		/*
 		 * We can speed up thawing tasks if we don't call balance_pgdat
 		 * after returning from the refrigerator
 		 */
-		if (ret)
+		if (was_frozen)
 			continue;
 
 		/*

From f7279801ff46b58e6bd25aecfbea28e88f42a090 Mon Sep 17 00:00:00 2001
From: Gang Li <gang.li@linux.dev>
Date: Fri, 26 Jan 2024 23:24:05 +0800
Subject: [PATCH 505/707] hugetlb: code clean for hugetlb_hstate_alloc_pages

Patch series "hugetlb: parallelize hugetlb page init on boot", v5.

# Introduction
Hugetlb initialization during boot takes up a considerable amount of time.
For instance, on a 2TB system, initializing 1,800 1GB huge pages takes
1-2 seconds out of 10 seconds.  Initializing 11,776 1GB pages on a 12TB
Intel host takes more than 1 minute[1].  This is a noteworthy figure.

Inspired by [2] and [3], hugetlb initialization can also be accelerated
through parallelization.  Kernel already has infrastructure like
padata_do_multithreaded, this patch uses it to achieve effective results
by minimal modifications.

[1] https://lore.kernel.org/all/783f8bac-55b8-5b95-eb6a-11a583675000@google.com/
[2] https://lore.kernel.org/all/20200527173608.2885243-1-daniel.m.jordan@oracle.com/
[3] https://lore.kernel.org/all/20230906112605.2286994-1-usama.arif@bytedance.com/
[4] https://lore.kernel.org/all/76becfc1-e609-e3e8-2966-4053143170b6@google.com/

# max_threads
This patch use `padata_do_multithreaded` like this:

```
job.max_threads	= num_node_state(N_MEMORY) * multiplier;
padata_do_multithreaded(&job);
```

To fully utilize the CPU, the number of parallel threads needs to be
carefully considered.  `max_threads = num_node_state(N_MEMORY)` does not
fully utilize the CPU, so we need to multiply it by a multiplier.

Tests below indicate that a multiplier of 2 significantly improves
performance, and although larger values also provide improvements, the
gains are marginal.

  multiplier     1       2       3       4       5
 ------------ ------- ------- ------- ------- -------
  256G 2node   358ms   215ms   157ms   134ms   126ms
  2T   4node   979ms   679ms   543ms   489ms   481ms
  50G  2node   71ms    44ms    37ms    30ms    31ms

Therefore, choosing 2 as the multiplier strikes a good balance between
enhancing parallel processing capabilities and maintaining efficient
resource management.

# Test result
      test case       no patch(ms)   patched(ms)   saved
 ------------------- -------------- ------------- --------
  256c2T(4 node) 1G           4745          2024   57.34%
  128c1T(2 node) 1G           3358          1712   49.02%
     12T         1G          77000         18300   76.23%

  256c2T(4 node) 2M           3336          1051   68.52%
  128c1T(2 node) 2M           1943           716   63.15%


This patch (of 7):

The readability of `hugetlb_hstate_alloc_pages` is poor.  By cleaning the
code, its readability can be improved, facilitating future modifications.

This patch extracts two functions to reduce the complexity of
`hugetlb_hstate_alloc_pages` and has no functional changes.

- hugetlb_hstate_alloc_pages_node_specific() to handle iterates through
  each online node and performs allocation if necessary.
- hugetlb_hstate_alloc_pages_report() report error during allocation.
  And the value of h->max_huge_pages is updated accordingly.

Link: https://lkml.kernel.org/r/20240126152411.1238072-1-gang.li@linux.dev
Link: https://lkml.kernel.org/r/20240126152411.1238072-2-gang.li@linux.dev
Signed-off-by: Gang Li <ligang.bdlg@bytedance.com>
Tested-by: David Rientjes <rientjes@google.com>
Reviewed-by: Muchun Song <muchun.song@linux.dev>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/hugetlb.c | 46 +++++++++++++++++++++++++++++-----------------
 1 file changed, 29 insertions(+), 17 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ed1581b670d42e..b8e4a6adefd67c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3482,6 +3482,33 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
 	h->max_huge_pages_node[nid] = i;
 }
 
+static bool __init hugetlb_hstate_alloc_pages_specific_nodes(struct hstate *h)
+{
+	int i;
+	bool node_specific_alloc = false;
+
+	for_each_online_node(i) {
+		if (h->max_huge_pages_node[i] > 0) {
+			hugetlb_hstate_alloc_pages_onenode(h, i);
+			node_specific_alloc = true;
+		}
+	}
+
+	return node_specific_alloc;
+}
+
+static void __init hugetlb_hstate_alloc_pages_errcheck(unsigned long allocated, struct hstate *h)
+{
+	if (allocated < h->max_huge_pages) {
+		char buf[32];
+
+		string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
+		pr_warn("HugeTLB: allocating %lu of page size %s failed.  Only allocated %lu hugepages.\n",
+			h->max_huge_pages, buf, allocated);
+		h->max_huge_pages = allocated;
+	}
+}
+
 /*
  * NOTE: this routine is called in different contexts for gigantic and
  * non-gigantic pages.
@@ -3499,7 +3526,6 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
 	struct folio *folio;
 	LIST_HEAD(folio_list);
 	nodemask_t *node_alloc_noretry;
-	bool node_specific_alloc = false;
 
 	/* skip gigantic hugepages allocation if hugetlb_cma enabled */
 	if (hstate_is_gigantic(h) && hugetlb_cma_size) {
@@ -3508,14 +3534,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
 	}
 
 	/* do node specific alloc */
-	for_each_online_node(i) {
-		if (h->max_huge_pages_node[i] > 0) {
-			hugetlb_hstate_alloc_pages_onenode(h, i);
-			node_specific_alloc = true;
-		}
-	}
-
-	if (node_specific_alloc)
+	if (hugetlb_hstate_alloc_pages_specific_nodes(h))
 		return;
 
 	/* below will do all node balanced alloc */
@@ -3558,14 +3577,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
 	/* list will be empty if hstate_is_gigantic */
 	prep_and_add_allocated_folios(h, &folio_list);
 
-	if (i < h->max_huge_pages) {
-		char buf[32];
-
-		string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
-		pr_warn("HugeTLB: allocating %lu of page size %s failed.  Only allocated %lu hugepages.\n",
-			h->max_huge_pages, buf, i);
-		h->max_huge_pages = i;
-	}
+	hugetlb_hstate_alloc_pages_errcheck(i, h);
 	kfree(node_alloc_noretry);
 }
 

From cfdf0faaf3f0960554a6c8dda2d68c8fab8aad32 Mon Sep 17 00:00:00 2001
From: Gang Li <gang.li@linux.dev>
Date: Fri, 26 Jan 2024 23:24:06 +0800
Subject: [PATCH 506/707] hugetlb: split hugetlb_hstate_alloc_pages

1G and 2M huge pages have different allocation and initialization logic,
which leads to subtle differences in parallelization.  Therefore, it is
appropriate to split hugetlb_hstate_alloc_pages into gigantic and
non-gigantic.

This patch has no functional changes.

Link: https://lkml.kernel.org/r/20240126152411.1238072-3-gang.li@linux.dev
Signed-off-by: Gang Li <ligang.bdlg@bytedance.com>
Tested-by: David Rientjes <rientjes@google.com>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
Reviewed-by: Muchun Song <muchun.song@linux.dev>
Cc: David Hildenbrand <david@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/hugetlb.c | 87 ++++++++++++++++++++++++++--------------------------
 1 file changed, 43 insertions(+), 44 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b8e4a6adefd67c..98ae108e1fac56 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3509,6 +3509,43 @@ static void __init hugetlb_hstate_alloc_pages_errcheck(unsigned long allocated,
 	}
 }
 
+static unsigned long __init hugetlb_gigantic_pages_alloc_boot(struct hstate *h)
+{
+	unsigned long i;
+
+	for (i = 0; i < h->max_huge_pages; ++i) {
+		if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE))
+			break;
+		cond_resched();
+	}
+
+	return i;
+}
+
+static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h)
+{
+	unsigned long i;
+	struct folio *folio;
+	LIST_HEAD(folio_list);
+	nodemask_t node_alloc_noretry;
+
+	/* Bit mask controlling how hard we retry per-node allocations.*/
+	nodes_clear(node_alloc_noretry);
+
+	for (i = 0; i < h->max_huge_pages; ++i) {
+		folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY],
+						&node_alloc_noretry);
+		if (!folio)
+			break;
+		list_add(&folio->lru, &folio_list);
+		cond_resched();
+	}
+
+	prep_and_add_allocated_folios(h, &folio_list);
+
+	return i;
+}
+
 /*
  * NOTE: this routine is called in different contexts for gigantic and
  * non-gigantic pages.
@@ -3522,10 +3559,7 @@ static void __init hugetlb_hstate_alloc_pages_errcheck(unsigned long allocated,
  */
 static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
 {
-	unsigned long i;
-	struct folio *folio;
-	LIST_HEAD(folio_list);
-	nodemask_t *node_alloc_noretry;
+	unsigned long allocated;
 
 	/* skip gigantic hugepages allocation if hugetlb_cma enabled */
 	if (hstate_is_gigantic(h) && hugetlb_cma_size) {
@@ -3538,47 +3572,12 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
 		return;
 
 	/* below will do all node balanced alloc */
-	if (!hstate_is_gigantic(h)) {
-		/*
-		 * Bit mask controlling how hard we retry per-node allocations.
-		 * Ignore errors as lower level routines can deal with
-		 * node_alloc_noretry == NULL.  If this kmalloc fails at boot
-		 * time, we are likely in bigger trouble.
-		 */
-		node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry),
-						GFP_KERNEL);
-	} else {
-		/* allocations done at boot time */
-		node_alloc_noretry = NULL;
-	}
-
-	/* bit mask controlling how hard we retry per-node allocations */
-	if (node_alloc_noretry)
-		nodes_clear(*node_alloc_noretry);
-
-	for (i = 0; i < h->max_huge_pages; ++i) {
-		if (hstate_is_gigantic(h)) {
-			/*
-			 * gigantic pages not added to list as they are not
-			 * added to pools now.
-			 */
-			if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE))
-				break;
-		} else {
-			folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY],
-							node_alloc_noretry);
-			if (!folio)
-				break;
-			list_add(&folio->lru, &folio_list);
-		}
-		cond_resched();
-	}
-
-	/* list will be empty if hstate_is_gigantic */
-	prep_and_add_allocated_folios(h, &folio_list);
+	if (hstate_is_gigantic(h))
+		allocated = hugetlb_gigantic_pages_alloc_boot(h);
+	else
+		allocated = hugetlb_pages_alloc_boot(h);
 
-	hugetlb_hstate_alloc_pages_errcheck(i, h);
-	kfree(node_alloc_noretry);
+	hugetlb_hstate_alloc_pages_errcheck(allocated, h);
 }
 
 static void __init hugetlb_init_hstates(void)

From 507b57b2daef420d2fe13c62f507b14387ef7445 Mon Sep 17 00:00:00 2001
From: Gang Li <gang.li@linux.dev>
Date: Fri, 26 Jan 2024 23:24:07 +0800
Subject: [PATCH 507/707] padata: dispatch works on different nodes

When a group of tasks that access different nodes are scheduled on the
same node, they may encounter bandwidth bottlenecks and access latency.

Thus, numa_aware flag is introduced here, allowing tasks to be distributed
across different nodes to fully utilize the advantage of multi-node
systems.

Link: https://lkml.kernel.org/r/20240126152411.1238072-4-gang.li@linux.dev
Signed-off-by: Gang Li <ligang.bdlg@bytedance.com>
Tested-by: David Rientjes <rientjes@google.com>
Reviewed-by: Muchun Song <muchun.song@linux.dev>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/padata.h |  2 ++
 kernel/padata.c        | 14 ++++++++++++--
 mm/mm_init.c           |  1 +
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/include/linux/padata.h b/include/linux/padata.h
index 495b16b6b4d729..8f418711351bcc 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -137,6 +137,7 @@ struct padata_shell {
  *             appropriate for one worker thread to do at once.
  * @max_threads: Max threads to use for the job, actual number may be less
  *               depending on task size and minimum chunk size.
+ * @numa_aware: Distribute jobs to different nodes with CPU in a round robin fashion.
  */
 struct padata_mt_job {
 	void (*thread_fn)(unsigned long start, unsigned long end, void *arg);
@@ -146,6 +147,7 @@ struct padata_mt_job {
 	unsigned long		align;
 	unsigned long		min_chunk;
 	int			max_threads;
+	bool			numa_aware;
 };
 
 /**
diff --git a/kernel/padata.c b/kernel/padata.c
index 179fb1518070c2..e3f639ff16707a 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -485,7 +485,8 @@ void __init padata_do_multithreaded(struct padata_mt_job *job)
 	struct padata_work my_work, *pw;
 	struct padata_mt_job_state ps;
 	LIST_HEAD(works);
-	int nworks;
+	int nworks, nid;
+	static atomic_t last_used_nid __initdata;
 
 	if (job->size == 0)
 		return;
@@ -517,7 +518,16 @@ void __init padata_do_multithreaded(struct padata_mt_job *job)
 	ps.chunk_size = roundup(ps.chunk_size, job->align);
 
 	list_for_each_entry(pw, &works, pw_list)
-		queue_work(system_unbound_wq, &pw->pw_work);
+		if (job->numa_aware) {
+			int old_node = atomic_read(&last_used_nid);
+
+			do {
+				nid = next_node_in(old_node, node_states[N_CPU]);
+			} while (!atomic_try_cmpxchg(&last_used_nid, &old_node, nid));
+			queue_work_node(nid, system_unbound_wq, &pw->pw_work);
+		} else {
+			queue_work(system_unbound_wq, &pw->pw_work);
+		}
 
 	/* Use the current thread, which saves starting a workqueue worker. */
 	padata_work_init(&my_work, padata_mt_helper, &ps, PADATA_WORK_ONSTACK);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 2c19f5515e36c4..549e76af8f82a8 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -2231,6 +2231,7 @@ static int __init deferred_init_memmap(void *data)
 			.align       = PAGES_PER_SECTION,
 			.min_chunk   = PAGES_PER_SECTION,
 			.max_threads = max_threads,
+			.numa_aware  = false,
 		};
 
 		padata_do_multithreaded(&job);

From 5018d25e9febad6d03f532bf0167b08296c87545 Mon Sep 17 00:00:00 2001
From: Gang Li <gang.li@linux.dev>
Date: Fri, 26 Jan 2024 23:24:08 +0800
Subject: [PATCH 508/707] hugetlb: pass *next_nid_to_alloc directly to
 for_each_node_mask_to_alloc

With parallelization of hugetlb allocation across different threads, each
thread works on a differnet node to allocate pages from, instead of all
allocating from a common node h->next_nid_to_alloc.  To address this, it's
necessary to assign a separate next_nid_to_alloc for each thread.

Consequently, the hstate_next_node_to_alloc and
for_each_node_mask_to_alloc have been modified to directly accept a
*next_nid_to_alloc parameter, ensuring thread-specific allocation and
avoiding concurrent access issues.

Link: https://lkml.kernel.org/r/20240126152411.1238072-5-gang.li@linux.dev
Signed-off-by: Gang Li <ligang.bdlg@bytedance.com>
Tested-by: David Rientjes <rientjes@google.com>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
Reviewed-by: Muchun Song <muchun.song@linux.dev>
Cc: David Hildenbrand <david@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/hugetlb.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 98ae108e1fac56..effe5539e545c7 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1464,15 +1464,15 @@ static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
  * next node from which to allocate, handling wrap at end of node
  * mask.
  */
-static int hstate_next_node_to_alloc(struct hstate *h,
+static int hstate_next_node_to_alloc(int *next_node,
 					nodemask_t *nodes_allowed)
 {
 	int nid;
 
 	VM_BUG_ON(!nodes_allowed);
 
-	nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
-	h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
+	nid = get_valid_node_allowed(*next_node, nodes_allowed);
+	*next_node = next_node_allowed(nid, nodes_allowed);
 
 	return nid;
 }
@@ -1495,10 +1495,10 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
 	return nid;
 }
 
-#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask)		\
+#define for_each_node_mask_to_alloc(next_node, nr_nodes, node, mask)		\
 	for (nr_nodes = nodes_weight(*mask);				\
 		nr_nodes > 0 &&						\
-		((node = hstate_next_node_to_alloc(hs, mask)) || 1);	\
+		((node = hstate_next_node_to_alloc(next_node, mask)) || 1);	\
 		nr_nodes--)
 
 #define for_each_node_mask_to_free(hs, nr_nodes, node, mask)		\
@@ -2350,12 +2350,13 @@ static void prep_and_add_allocated_folios(struct hstate *h,
  */
 static struct folio *alloc_pool_huge_folio(struct hstate *h,
 					nodemask_t *nodes_allowed,
-					nodemask_t *node_alloc_noretry)
+					nodemask_t *node_alloc_noretry,
+					int *next_node)
 {
 	gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
 	int nr_nodes, node;
 
-	for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
+	for_each_node_mask_to_alloc(next_node, nr_nodes, node, nodes_allowed) {
 		struct folio *folio;
 
 		folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, node,
@@ -3310,7 +3311,7 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid)
 		goto found;
 	}
 	/* allocate from next node when distributing huge pages */
-	for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
+	for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, &node_states[N_MEMORY]) {
 		m = memblock_alloc_try_nid_raw(
 				huge_page_size(h), huge_page_size(h),
 				0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
@@ -3679,7 +3680,7 @@ static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
 	VM_BUG_ON(delta != -1 && delta != 1);
 
 	if (delta < 0) {
-		for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
+		for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, nodes_allowed) {
 			if (h->surplus_huge_pages_node[node])
 				goto found;
 		}
@@ -3794,7 +3795,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
 		cond_resched();
 
 		folio = alloc_pool_huge_folio(h, nodes_allowed,
-						node_alloc_noretry);
+						node_alloc_noretry,
+						&h->next_nid_to_alloc);
 		if (!folio) {
 			prep_and_add_allocated_folios(h, &page_list);
 			spin_lock_irq(&hugetlb_lock);

From 4391e4679bd7a229f4bd97a2f27e57de7d05da15 Mon Sep 17 00:00:00 2001
From: Gang Li <gang.li@linux.dev>
Date: Fri, 26 Jan 2024 23:24:09 +0800
Subject: [PATCH 509/707] hugetlb: have CONFIG_HUGETLBFS select CONFIG_PADATA

Allow hugetlb use padata_do_multithreaded for parallel initialization.
Select CONFIG_PADATA in this case.

Link: https://lkml.kernel.org/r/20240126152411.1238072-6-gang.li@linux.dev
Signed-off-by: Gang Li <ligang.bdlg@bytedance.com>
Tested-by: David Rientjes <rientjes@google.com>
Reviewed-by: Muchun Song <muchun.song@linux.dev>
Cc: David Hildenbrand <david@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/Kconfig b/fs/Kconfig
index 89fdbefd1075f8..a57d6e6c41e6f1 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -262,6 +262,7 @@ menuconfig HUGETLBFS
 	depends on X86 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN
 	depends on (SYSFS || SYSCTL)
 	select MEMFD_CREATE
+	select PADATA
 	help
 	  hugetlbfs is a filesystem backing for HugeTLB pages, based on
 	  ramfs. For architectures that support it, say Y here and read

From b1a6f2b5f1827c8df2beb9da7505b46fa48a636f Mon Sep 17 00:00:00 2001
From: Gang Li <gang.li@linux.dev>
Date: Fri, 26 Jan 2024 23:24:10 +0800
Subject: [PATCH 510/707] hugetlb: parallelize 2M hugetlb allocation and
 initialization

By distributing both the allocation and the initialization tasks across
multiple threads, the initialization of 2M hugetlb will be faster, thereby
improving the boot speed.

Here are some test results:
      test case        no patch(ms)   patched(ms)   saved
 ------------------- -------------- ------------- --------
  256c2T(4 node) 2M           3336          1051   68.52%
  128c1T(2 node) 2M           1943           716   63.15%

Link: https://lkml.kernel.org/r/20240126152411.1238072-7-gang.li@linux.dev
Signed-off-by: Gang Li <ligang.bdlg@bytedance.com>
Tested-by: David Rientjes <rientjes@google.com>
Reviewed-by: Muchun Song <muchun.song@linux.dev>
Cc: David Hildenbrand <david@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/hugetlb.c | 73 ++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 56 insertions(+), 17 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index effe5539e545c7..19d4dce2642bb1 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -35,6 +35,7 @@
 #include <linux/delayacct.h>
 #include <linux/memory.h>
 #include <linux/mm_inline.h>
+#include <linux/padata.h>
 
 #include <asm/page.h>
 #include <asm/pgalloc.h>
@@ -3510,6 +3511,30 @@ static void __init hugetlb_hstate_alloc_pages_errcheck(unsigned long allocated,
 	}
 }
 
+static void __init hugetlb_pages_alloc_boot_node(unsigned long start, unsigned long end, void *arg)
+{
+	struct hstate *h = (struct hstate *)arg;
+	int i, num = end - start;
+	nodemask_t node_alloc_noretry;
+	LIST_HEAD(folio_list);
+	int next_node = first_online_node;
+
+	/* Bit mask controlling how hard we retry per-node allocations.*/
+	nodes_clear(node_alloc_noretry);
+
+	for (i = 0; i < num; ++i) {
+		struct folio *folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY],
+						&node_alloc_noretry, &next_node);
+		if (!folio)
+			break;
+
+		list_move(&folio->lru, &folio_list);
+		cond_resched();
+	}
+
+	prep_and_add_allocated_folios(h, &folio_list);
+}
+
 static unsigned long __init hugetlb_gigantic_pages_alloc_boot(struct hstate *h)
 {
 	unsigned long i;
@@ -3525,26 +3550,40 @@ static unsigned long __init hugetlb_gigantic_pages_alloc_boot(struct hstate *h)
 
 static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h)
 {
-	unsigned long i;
-	struct folio *folio;
-	LIST_HEAD(folio_list);
-	nodemask_t node_alloc_noretry;
-
-	/* Bit mask controlling how hard we retry per-node allocations.*/
-	nodes_clear(node_alloc_noretry);
+	struct padata_mt_job job = {
+		.fn_arg		= h,
+		.align		= 1,
+		.numa_aware	= true
+	};
 
-	for (i = 0; i < h->max_huge_pages; ++i) {
-		folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY],
-						&node_alloc_noretry);
-		if (!folio)
-			break;
-		list_add(&folio->lru, &folio_list);
-		cond_resched();
-	}
+	job.thread_fn	= hugetlb_pages_alloc_boot_node;
+	job.start	= 0;
+	job.size	= h->max_huge_pages;
 
-	prep_and_add_allocated_folios(h, &folio_list);
+	/*
+	 * job.max_threads is twice the num_node_state(N_MEMORY),
+	 *
+	 * Tests below indicate that a multiplier of 2 significantly improves
+	 * performance, and although larger values also provide improvements,
+	 * the gains are marginal.
+	 *
+	 * Therefore, choosing 2 as the multiplier strikes a good balance between
+	 * enhancing parallel processing capabilities and maintaining efficient
+	 * resource management.
+	 *
+	 * +------------+-------+-------+-------+-------+-------+
+	 * | multiplier |   1   |   2   |   3   |   4   |   5   |
+	 * +------------+-------+-------+-------+-------+-------+
+	 * | 256G 2node | 358ms | 215ms | 157ms | 134ms | 126ms |
+	 * | 2T   4node | 979ms | 679ms | 543ms | 489ms | 481ms |
+	 * | 50G  2node | 71ms  | 44ms  | 37ms  | 30ms  | 31ms  |
+	 * +------------+-------+-------+-------+-------+-------+
+	 */
+	job.max_threads	= num_node_state(N_MEMORY) * 2;
+	job.min_chunk	= h->max_huge_pages / num_node_state(N_MEMORY) / 2;
+	padata_do_multithreaded(&job);
 
-	return i;
+	return h->nr_huge_pages;
 }
 
 /*

From 8b9c1350115da318a97836423f73f484a200b077 Mon Sep 17 00:00:00 2001
From: Gang Li <gang.li@linux.dev>
Date: Fri, 26 Jan 2024 23:24:11 +0800
Subject: [PATCH 511/707] hugetlb: parallelize 1G hugetlb initialization

Optimize the initialization speed of 1G huge pages through
parallelization.

1G hugetlbs are allocated from bootmem, a process that is already very
fast and does not currently require optimization.  Therefore, we focus on
parallelizing only the initialization phase in `gather_bootmem_prealloc`.

Here are some test results:
      test case       no patch(ms)   patched(ms)   saved
 ------------------- -------------- ------------- --------
  256c2T(4 node) 1G           4745          2024   57.34%
  128c1T(2 node) 1G           3358          1712   49.02%
     12T         1G          77000         18300   76.23%

Link: https://lkml.kernel.org/r/20240126152411.1238072-8-gang.li@linux.dev
Signed-off-by: Gang Li <ligang.bdlg@bytedance.com>
Tested-by: David Rientjes <rientjes@google.com>
Reviewed-by: Muchun Song <muchun.song@linux.dev>
Cc: David Hildenbrand <david@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/powerpc/mm/hugetlbpage.c |  2 +-
 include/linux/hugetlb.h       |  2 +-
 mm/hugetlb.c                  | 44 ++++++++++++++++++++++++++++-------
 3 files changed, 38 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 0a540b37aab62c..a1651d54718626 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -226,7 +226,7 @@ static int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
 		return 0;
 	m = phys_to_virt(gpage_freearray[--nr_gpages]);
 	gpage_freearray[nr_gpages] = 0;
-	list_add(&m->list, &huge_boot_pages);
+	list_add(&m->list, &huge_boot_pages[0]);
 	m->hstate = hstate;
 	return 1;
 }
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index c1ee640d87b11d..77b30a8c6076b6 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -178,7 +178,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage);
 
 extern int sysctl_hugetlb_shm_group;
-extern struct list_head huge_boot_pages;
+extern struct list_head huge_boot_pages[MAX_NUMNODES];
 
 /* arch callbacks */
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 19d4dce2642bb1..9d996fe4ecd9cc 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -69,7 +69,7 @@ static bool hugetlb_cma_folio(struct folio *folio, unsigned int order)
 #endif
 static unsigned long hugetlb_cma_size __initdata;
 
-__initdata LIST_HEAD(huge_boot_pages);
+__initdata struct list_head huge_boot_pages[MAX_NUMNODES];
 
 /* for command line parsing */
 static struct hstate * __initdata parsed_hstate;
@@ -3301,7 +3301,7 @@ int alloc_bootmem_huge_page(struct hstate *h, int nid)
 int __alloc_bootmem_huge_page(struct hstate *h, int nid)
 {
 	struct huge_bootmem_page *m = NULL; /* initialize for clang */
-	int nr_nodes, node;
+	int nr_nodes, node = nid;
 
 	/* do node specific alloc */
 	if (nid != NUMA_NO_NODE) {
@@ -3339,7 +3339,7 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid)
 		huge_page_size(h) - PAGE_SIZE);
 	/* Put them into a private list first because mem_map is not up yet */
 	INIT_LIST_HEAD(&m->list);
-	list_add(&m->list, &huge_boot_pages);
+	list_add(&m->list, &huge_boot_pages[node]);
 	m->hstate = h;
 	return 1;
 }
@@ -3390,8 +3390,6 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h,
 	/* Send list for bulk vmemmap optimization processing */
 	hugetlb_vmemmap_optimize_folios(h, folio_list);
 
-	/* Add all new pool pages to free lists in one lock cycle */
-	spin_lock_irqsave(&hugetlb_lock, flags);
 	list_for_each_entry_safe(folio, tmp_f, folio_list, lru) {
 		if (!folio_test_hugetlb_vmemmap_optimized(folio)) {
 			/*
@@ -3404,23 +3402,27 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h,
 					HUGETLB_VMEMMAP_RESERVE_PAGES,
 					pages_per_huge_page(h));
 		}
+		/* Subdivide locks to achieve better parallel performance */
+		spin_lock_irqsave(&hugetlb_lock, flags);
 		__prep_account_new_huge_page(h, folio_nid(folio));
 		enqueue_hugetlb_folio(h, folio);
+		spin_unlock_irqrestore(&hugetlb_lock, flags);
 	}
-	spin_unlock_irqrestore(&hugetlb_lock, flags);
 }
 
 /*
  * Put bootmem huge pages into the standard lists after mem_map is up.
  * Note: This only applies to gigantic (order > MAX_PAGE_ORDER) pages.
  */
-static void __init gather_bootmem_prealloc(void)
+static void __init gather_bootmem_prealloc_node(unsigned long start, unsigned long end, void *arg)
+
 {
+	int nid = start;
 	LIST_HEAD(folio_list);
 	struct huge_bootmem_page *m;
 	struct hstate *h = NULL, *prev_h = NULL;
 
-	list_for_each_entry(m, &huge_boot_pages, list) {
+	list_for_each_entry(m, &huge_boot_pages[nid], list) {
 		struct page *page = virt_to_page(m);
 		struct folio *folio = (void *)page;
 
@@ -3453,6 +3455,22 @@ static void __init gather_bootmem_prealloc(void)
 	prep_and_add_bootmem_folios(h, &folio_list);
 }
 
+static void __init gather_bootmem_prealloc(void)
+{
+	struct padata_mt_job job = {
+		.thread_fn	= gather_bootmem_prealloc_node,
+		.fn_arg		= NULL,
+		.start		= 0,
+		.size		= num_node_state(N_MEMORY),
+		.align		= 1,
+		.min_chunk	= 1,
+		.max_threads	= num_node_state(N_MEMORY),
+		.numa_aware	= true,
+	};
+
+	padata_do_multithreaded(&job);
+}
+
 static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
 {
 	unsigned long i;
@@ -3600,6 +3618,7 @@ static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h)
 static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
 {
 	unsigned long allocated;
+	static bool initialied __initdata;
 
 	/* skip gigantic hugepages allocation if hugetlb_cma enabled */
 	if (hstate_is_gigantic(h) && hugetlb_cma_size) {
@@ -3607,6 +3626,15 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
 		return;
 	}
 
+	/* hugetlb_hstate_alloc_pages will be called many times, initialize huge_boot_pages once */
+	if (!initialied) {
+		int i = 0;
+
+		for (i = 0; i < MAX_NUMNODES; i++)
+			INIT_LIST_HEAD(&huge_boot_pages[i]);
+		initialied = true;
+	}
+
 	/* do node specific alloc */
 	if (hugetlb_hstate_alloc_pages_specific_nodes(h))
 		return;

From 59711c69e692f6408ffe3e6f826609837709c406 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Fri, 26 Jan 2024 16:19:44 +0800
Subject: [PATCH 512/707] mm and cache_info: remove unnecessary CPU cache info
 update

For each CPU hotplug event, we will update per-CPU data slice size and
corresponding PCP configuration for every online CPU to make the
implementation simple.  But, Kyle reported that this takes tens seconds
during boot on a machine with 34 zones and 3840 CPUs.

So, in this patch, for each CPU hotplug event, we only update per-CPU data
slice size and corresponding PCP configuration for the CPUs that share
caches with the hotplugged CPU.  With the patch, the system boot time
reduces 67 seconds on the machine.

Link: https://lkml.kernel.org/r/20240126081944.414520-1-ying.huang@intel.com
Fixes: 362d37a106dd ("mm, pcp: reduce lock contention for draining high-order pages")
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Originally-by: Kyle Meyer <kyle.meyer@hpe.com>
Reported-and-tested-by: Kyle Meyer <kyle.meyer@hpe.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/base/cacheinfo.c | 50 +++++++++++++++++++++++++++++++++++-----
 include/linux/gfp.h      |  2 +-
 mm/page_alloc.c          | 39 +++++++++++++++----------------
 3 files changed, 63 insertions(+), 28 deletions(-)

diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c
index f1e79263fe61eb..23b8cba4a2a3b8 100644
--- a/drivers/base/cacheinfo.c
+++ b/drivers/base/cacheinfo.c
@@ -898,6 +898,37 @@ static int cache_add_dev(unsigned int cpu)
 	return rc;
 }
 
+static unsigned int cpu_map_shared_cache(bool online, unsigned int cpu,
+					 cpumask_t **map)
+{
+	struct cacheinfo *llc, *sib_llc;
+	unsigned int sibling;
+
+	if (!last_level_cache_is_valid(cpu))
+		return 0;
+
+	llc = per_cpu_cacheinfo_idx(cpu, cache_leaves(cpu) - 1);
+
+	if (llc->type != CACHE_TYPE_DATA && llc->type != CACHE_TYPE_UNIFIED)
+		return 0;
+
+	if (online) {
+		*map = &llc->shared_cpu_map;
+		return cpumask_weight(*map);
+	}
+
+	/* shared_cpu_map of offlined CPU will be cleared, so use sibling map */
+	for_each_cpu(sibling, &llc->shared_cpu_map) {
+		if (sibling == cpu || !last_level_cache_is_valid(sibling))
+			continue;
+		sib_llc = per_cpu_cacheinfo_idx(sibling, cache_leaves(sibling) - 1);
+		*map = &sib_llc->shared_cpu_map;
+		return cpumask_weight(*map);
+	}
+
+	return 0;
+}
+
 /*
  * Calculate the size of the per-CPU data cache slice.  This can be
  * used to estimate the size of the data cache slice that can be used
@@ -929,28 +960,31 @@ static void update_per_cpu_data_slice_size_cpu(unsigned int cpu)
 		ci->per_cpu_data_slice_size = llc->size / nr_shared;
 }
 
-static void update_per_cpu_data_slice_size(bool cpu_online, unsigned int cpu)
+static void update_per_cpu_data_slice_size(bool cpu_online, unsigned int cpu,
+					   cpumask_t *cpu_map)
 {
 	unsigned int icpu;
 
-	for_each_online_cpu(icpu) {
+	for_each_cpu(icpu, cpu_map) {
 		if (!cpu_online && icpu == cpu)
 			continue;
 		update_per_cpu_data_slice_size_cpu(icpu);
+		setup_pcp_cacheinfo(icpu);
 	}
 }
 
 static int cacheinfo_cpu_online(unsigned int cpu)
 {
 	int rc = detect_cache_attributes(cpu);
+	cpumask_t *cpu_map;
 
 	if (rc)
 		return rc;
 	rc = cache_add_dev(cpu);
 	if (rc)
 		goto err;
-	update_per_cpu_data_slice_size(true, cpu);
-	setup_pcp_cacheinfo();
+	if (cpu_map_shared_cache(true, cpu, &cpu_map))
+		update_per_cpu_data_slice_size(true, cpu, cpu_map);
 	return 0;
 err:
 	free_cache_attributes(cpu);
@@ -959,12 +993,16 @@ static int cacheinfo_cpu_online(unsigned int cpu)
 
 static int cacheinfo_cpu_pre_down(unsigned int cpu)
 {
+	cpumask_t *cpu_map;
+	unsigned int nr_shared;
+
+	nr_shared = cpu_map_shared_cache(false, cpu, &cpu_map);
 	if (cpumask_test_and_clear_cpu(cpu, &cache_dev_map))
 		cpu_cache_sysfs_exit(cpu);
 
 	free_cache_attributes(cpu);
-	update_per_cpu_data_slice_size(false, cpu);
-	setup_pcp_cacheinfo();
+	if (nr_shared > 1)
+		update_per_cpu_data_slice_size(false, cpu, cpu_map);
 	return 0;
 }
 
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index de292a0071389e..09e22091f1b03f 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -334,7 +334,7 @@ void drain_all_pages(struct zone *zone);
 void drain_local_pages(struct zone *zone);
 
 void page_alloc_init_late(void);
-void setup_pcp_cacheinfo(void);
+void setup_pcp_cacheinfo(unsigned int cpu);
 
 /*
  * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index aeda1153dad92b..140c4f372db169 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5578,37 +5578,34 @@ static void zone_pcp_update(struct zone *zone, int cpu_online)
 	mutex_unlock(&pcp_batch_high_lock);
 }
 
-static void zone_pcp_update_cacheinfo(struct zone *zone)
+static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu)
 {
-	int cpu;
 	struct per_cpu_pages *pcp;
 	struct cpu_cacheinfo *cci;
 
-	for_each_online_cpu(cpu) {
-		pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
-		cci = get_cpu_cacheinfo(cpu);
-		/*
-		 * If data cache slice of CPU is large enough, "pcp->batch"
-		 * pages can be preserved in PCP before draining PCP for
-		 * consecutive high-order pages freeing without allocation.
-		 * This can reduce zone lock contention without hurting
-		 * cache-hot pages sharing.
-		 */
-		spin_lock(&pcp->lock);
-		if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch)
-			pcp->flags |= PCPF_FREE_HIGH_BATCH;
-		else
-			pcp->flags &= ~PCPF_FREE_HIGH_BATCH;
-		spin_unlock(&pcp->lock);
-	}
+	pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+	cci = get_cpu_cacheinfo(cpu);
+	/*
+	 * If data cache slice of CPU is large enough, "pcp->batch"
+	 * pages can be preserved in PCP before draining PCP for
+	 * consecutive high-order pages freeing without allocation.
+	 * This can reduce zone lock contention without hurting
+	 * cache-hot pages sharing.
+	 */
+	spin_lock(&pcp->lock);
+	if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch)
+		pcp->flags |= PCPF_FREE_HIGH_BATCH;
+	else
+		pcp->flags &= ~PCPF_FREE_HIGH_BATCH;
+	spin_unlock(&pcp->lock);
 }
 
-void setup_pcp_cacheinfo(void)
+void setup_pcp_cacheinfo(unsigned int cpu)
 {
 	struct zone *zone;
 
 	for_each_populated_zone(zone)
-		zone_pcp_update_cacheinfo(zone);
+		zone_pcp_update_cacheinfo(zone, cpu);
 }
 
 /*

From 45ae2add29e2853f4dae8d37324c7f820921b583 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosryahmed@google.com>
Date: Fri, 26 Jan 2024 08:06:43 +0000
Subject: [PATCH 513/707] x86/mm: delete unused cpu argument to leave_mm()

The argument is unused since commit 3d28ebceaffa ("x86/mm: Rework lazy
TLB to track the actual loaded mm"), delete it.

Link: https://lkml.kernel.org/r/20240126080644.1714297-1-yosryahmed@google.com
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/x86/include/asm/mmu.h    | 2 +-
 arch/x86/kernel/alternative.c | 2 +-
 arch/x86/mm/tlb.c             | 2 +-
 arch/x86/xen/mmu_pv.c         | 2 +-
 drivers/cpuidle/cpuidle.c     | 2 +-
 include/linux/mmu_context.h   | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 0da5c227f490c0..ce4677b8b7356c 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -75,7 +75,7 @@ typedef struct {
 		.lock = __MUTEX_INITIALIZER(mm.context.lock),		\
 	}
 
-void leave_mm(int cpu);
+void leave_mm(void);
 #define leave_mm leave_mm
 
 #endif /* _ASM_X86_MMU_H */
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index cc130b57542ac4..66bd265c7a5876 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1805,7 +1805,7 @@ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
 	 * restoring the previous mm.
 	 */
 	if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
-		leave_mm(smp_processor_id());
+		leave_mm();
 
 	temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
 	switch_mm_irqs_off(NULL, mm, current);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 5768d386efab6e..80b0caa82a91b4 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -299,7 +299,7 @@ static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, unsigned long lam,
 	write_cr3(new_mm_cr3);
 }
 
-void leave_mm(int cpu)
+void leave_mm(void)
 {
 	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
 
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 72af496a160c8b..218773cfb009f7 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -913,7 +913,7 @@ static void drop_mm_ref_this_cpu(void *info)
 	struct mm_struct *mm = info;
 
 	if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm)
-		leave_mm(smp_processor_id());
+		leave_mm();
 
 	/*
 	 * If this cpu still has a stale cr3 reference, then make sure
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 737a026ef58a38..02e40fd7d948c9 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -237,7 +237,7 @@ noinstr int cpuidle_enter_state(struct cpuidle_device *dev,
 	}
 
 	if (target_state->flags & CPUIDLE_FLAG_TLB_FLUSHED)
-		leave_mm(dev->cpu);
+		leave_mm();
 
 	/* Take note of the planned idle state. */
 	sched_idle_set_state(target_state);
diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h
index f2b7a3f040999e..bbaec80c78c505 100644
--- a/include/linux/mmu_context.h
+++ b/include/linux/mmu_context.h
@@ -11,7 +11,7 @@
 #endif
 
 #ifndef leave_mm
-static inline void leave_mm(int cpu) { }
+static inline void leave_mm(void) { }
 #endif
 
 /*

From 6e2bbaa08ca958915e1da968b64327ea6ddc38e2 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosryahmed@google.com>
Date: Fri, 26 Jan 2024 08:06:44 +0000
Subject: [PATCH 514/707] x86/mm: clarify "prev" usage in switch_mm_irqs_off()

In the x86 implementation of switch_mm_irqs_off(), we do not use the
"prev" argument passed in by the caller, we use exclusively use
"real_prev", which is cpu_tlbstate.loaded_mm.  This is not obvious at the
first sight.

Furthermore, a comment describes a condition that happens when called with
prev == next, but this should not affect the function in any way since
prev is unused.  Apparently, the comment is intended to clarify why we
don't rely on prev == next to decide whether we need to update CR3, but
again, it is not obvious.  The comment also references the fact that
leave_mm() calls with prev == NULL and tsk == NULL, but this also
shouldn't matter because prev is unused and tsk is only used in one
function which has a NULL check.

Clarify things by renaming (prev -> unused) and (real_prev -> prev), also
move and rewrite the comment as an explanation for why we don't rely on
"prev" supplied by the caller in x86 code and use our own.  Hopefully this
makes reading the code easier.

Link: https://lkml.kernel.org/r/20240126080644.1714297-2-yosryahmed@google.com
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/x86/mm/tlb.c | 35 ++++++++++++++++-------------------
 1 file changed, 16 insertions(+), 19 deletions(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 80b0caa82a91b4..bf9605caf24f74 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -492,10 +492,16 @@ void cr4_update_pce(void *ignored)
 static inline void cr4_update_pce_mm(struct mm_struct *mm) { }
 #endif
 
-void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
+/*
+ * The "prev" argument passed by the caller does not always match CR3. For
+ * example, the scheduler passes in active_mm when switching from lazy TLB mode
+ * to normal mode, but switch_mm_irqs_off() can be called from x86 code without
+ * updating active_mm. Use cpu_tlbstate.loaded_mm instead.
+ */
+void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
 			struct task_struct *tsk)
 {
-	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
+	struct mm_struct *prev = this_cpu_read(cpu_tlbstate.loaded_mm);
 	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
 	unsigned long new_lam = mm_lam_cr3_mask(next);
 	bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy);
@@ -504,15 +510,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	bool need_flush;
 	u16 new_asid;
 
-	/*
-	 * NB: The scheduler will call us with prev == next when switching
-	 * from lazy TLB mode to normal mode if active_mm isn't changing.
-	 * When this happens, we don't assume that CR3 (and hence
-	 * cpu_tlbstate.loaded_mm) matches next.
-	 *
-	 * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
-	 */
-
 	/* We don't want flush_tlb_func() to run concurrently with us. */
 	if (IS_ENABLED(CONFIG_PROVE_LOCKING))
 		WARN_ON_ONCE(!irqs_disabled());
@@ -527,7 +524,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	 * isn't free.
 	 */
 #ifdef CONFIG_DEBUG_VM
-	if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid,
+	if (WARN_ON_ONCE(__read_cr3() != build_cr3(prev->pgd, prev_asid,
 						   tlbstate_lam_cr3_mask()))) {
 		/*
 		 * If we were to BUG here, we'd be very likely to kill
@@ -559,7 +556,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	 * provides that full memory barrier and core serializing
 	 * instruction.
 	 */
-	if (real_prev == next) {
+	if (prev == next) {
 		/* Not actually switching mm's */
 		VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
 			   next->context.ctx_id);
@@ -574,7 +571,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		 * mm_cpumask. The TLB shootdown code can figure out from
 		 * cpu_tlbstate_shared.is_lazy whether or not to send an IPI.
 		 */
-		if (WARN_ON_ONCE(real_prev != &init_mm &&
+		if (WARN_ON_ONCE(prev != &init_mm &&
 				 !cpumask_test_cpu(cpu, mm_cpumask(next))))
 			cpumask_set_cpu(cpu, mm_cpumask(next));
 
@@ -616,10 +613,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		 * Skip kernel threads; we never send init_mm TLB flushing IPIs,
 		 * but the bitmap manipulation can cause cache line contention.
 		 */
-		if (real_prev != &init_mm) {
+		if (prev != &init_mm) {
 			VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu,
-						mm_cpumask(real_prev)));
-			cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+						mm_cpumask(prev)));
+			cpumask_clear_cpu(cpu, mm_cpumask(prev));
 		}
 
 		/*
@@ -656,9 +653,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	this_cpu_write(cpu_tlbstate.loaded_mm, next);
 	this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
 
-	if (next != real_prev) {
+	if (next != prev) {
 		cr4_update_pce_mm(next);
-		switch_ldt(real_prev, next);
+		switch_ldt(prev, next);
 	}
 }
 

From 9cf78c52c93d7b16d2b83df1ed6ca1ccb123ef2e Mon Sep 17 00:00:00 2001
From: Chengming Zhou <zhouchengming@bytedance.com>
Date: Sun, 28 Jan 2024 13:28:50 +0000
Subject: [PATCH 515/707] mm/zswap: fix race between lru writeback and swapoff

LRU writeback has race problem with swapoff, as spotted by Yosry [1]:

CPU1			CPU2
shrink_memcg_cb		swap_off
  list_lru_isolate	  zswap_invalidate
			  zswap_swapoff
			    kfree(tree)
  // UAF
  spin_lock(&tree->lock)

The problem is that the entry in lru list can't protect the tree from
being swapoff and freed, and the entry also can be invalidated and freed
concurrently after we unlock the lru lock.

We can fix it by moving the swap cache allocation ahead before referencing
the tree, then check invalidate race with tree lock, only after that we
can safely deref the entry.  Note we couldn't deref entry or tree anymore
after we unlock the folio, since we depend on this to hold on swapoff.

So this patch moves all tree and entry usage to zswap_writeback_entry(),
we only use the copied swpentry on the stack to allocate swap cache and if
returned with folio locked we can reference the tree safely.  Then we can
check invalidate race with tree lock, the following things is much the
same like zswap_load().

Since we can't deref the entry after zswap_writeback_entry(), we can't use
zswap_lru_putback() anymore, instead we rotate the entry in the beginning.
And it will be unlinked and freed when invalidated if writeback success.

Another change is we don't update the memcg nr_zswap_protected in the
-ENOMEM and -EEXIST cases anymore.  -EEXIST case means we raced with
swapin or concurrent shrinker action, since swapin already have memcg
nr_zswap_protected updated, don't need double counts here.  For concurrent
shrinker, the folio will be writeback and freed anyway.  -ENOMEM case is
extremely rare and doesn't happen spuriously either, so don't bother
distinguishing this case.

[1] https://lore.kernel.org/all/CAJD7tkasHsRnT_75-TXsEe58V9_OW6m3g6CF7Kmsvz8CKRG_EA@mail.gmail.com/

Link: https://lkml.kernel.org/r/20240126-zswap-writeback-race-v2-2-b10479847099@bytedance.com
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Nhat Pham <nphamcs@gmail.com>
Cc: Chris Li <chriscli@google.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/zswap.c | 114 +++++++++++++++++++++++------------------------------
 1 file changed, 49 insertions(+), 65 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index 50a4af3f185607..fcd31fb670cce2 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -277,7 +277,7 @@ static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp)
 		 zpool_get_type((p)->zpools[0]))
 
 static int zswap_writeback_entry(struct zswap_entry *entry,
-				 struct zswap_tree *tree);
+				 swp_entry_t swpentry);
 static int zswap_pool_get(struct zswap_pool *pool);
 static void zswap_pool_put(struct zswap_pool *pool);
 
@@ -445,27 +445,6 @@ static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry)
 	rcu_read_unlock();
 }
 
-static void zswap_lru_putback(struct list_lru *list_lru,
-		struct zswap_entry *entry)
-{
-	int nid = entry_to_nid(entry);
-	spinlock_t *lock = &list_lru->node[nid].lock;
-	struct mem_cgroup *memcg;
-	struct lruvec *lruvec;
-
-	rcu_read_lock();
-	memcg = mem_cgroup_from_entry(entry);
-	spin_lock(lock);
-	/* we cannot use list_lru_add here, because it increments node's lru count */
-	list_lru_putback(list_lru, &entry->lru, nid, memcg);
-	spin_unlock(lock);
-
-	lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(entry_to_nid(entry)));
-	/* increment the protection area to account for the LRU rotation. */
-	atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected);
-	rcu_read_unlock();
-}
-
 /*********************************
 * rbtree functions
 **********************************/
@@ -860,40 +839,47 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
 {
 	struct zswap_entry *entry = container_of(item, struct zswap_entry, lru);
 	bool *encountered_page_in_swapcache = (bool *)arg;
-	struct zswap_tree *tree;
-	pgoff_t swpoffset;
+	swp_entry_t swpentry;
 	enum lru_status ret = LRU_REMOVED_RETRY;
 	int writeback_result;
 
+	/*
+	 * Rotate the entry to the tail before unlocking the LRU,
+	 * so that in case of an invalidation race concurrent
+	 * reclaimers don't waste their time on it.
+	 *
+	 * If writeback succeeds, or failure is due to the entry
+	 * being invalidated by the swap subsystem, the invalidation
+	 * will unlink and free it.
+	 *
+	 * Temporary failures, where the same entry should be tried
+	 * again immediately, almost never happen for this shrinker.
+	 * We don't do any trylocking; -ENOMEM comes closest,
+	 * but that's extremely rare and doesn't happen spuriously
+	 * either. Don't bother distinguishing this case.
+	 *
+	 * But since they do exist in theory, the entry cannot just
+	 * be unlinked, or we could leak it. Hence, rotate.
+	 */
+	list_move_tail(item, &l->list);
+
 	/*
 	 * Once the lru lock is dropped, the entry might get freed. The
-	 * swpoffset is copied to the stack, and entry isn't deref'd again
+	 * swpentry is copied to the stack, and entry isn't deref'd again
 	 * until the entry is verified to still be alive in the tree.
 	 */
-	swpoffset = swp_offset(entry->swpentry);
-	tree = swap_zswap_tree(entry->swpentry);
-	list_lru_isolate(l, item);
+	swpentry = entry->swpentry;
+
 	/*
 	 * It's safe to drop the lock here because we return either
 	 * LRU_REMOVED_RETRY or LRU_RETRY.
 	 */
 	spin_unlock(lock);
 
-	/* Check for invalidate() race */
-	spin_lock(&tree->lock);
-	if (entry != zswap_rb_search(&tree->rbroot, swpoffset))
-		goto unlock;
-
-	/* Hold a reference to prevent a free during writeback */
-	zswap_entry_get(entry);
-	spin_unlock(&tree->lock);
-
-	writeback_result = zswap_writeback_entry(entry, tree);
+	writeback_result = zswap_writeback_entry(entry, swpentry);
 
-	spin_lock(&tree->lock);
 	if (writeback_result) {
 		zswap_reject_reclaim_fail++;
-		zswap_lru_putback(&entry->pool->list_lru, entry);
 		ret = LRU_RETRY;
 
 		/*
@@ -903,27 +889,10 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
 		 */
 		if (writeback_result == -EEXIST && encountered_page_in_swapcache)
 			*encountered_page_in_swapcache = true;
-
-		goto put_unlock;
+	} else {
+		zswap_written_back_pages++;
 	}
-	zswap_written_back_pages++;
-
-	if (entry->objcg)
-		count_objcg_event(entry->objcg, ZSWPWB);
 
-	count_vm_event(ZSWPWB);
-	/*
-	 * Writeback started successfully, the page now belongs to the
-	 * swapcache. Drop the entry from zswap - unless invalidate already
-	 * took it out while we had the tree->lock released for IO.
-	 */
-	zswap_invalidate_entry(tree, entry);
-
-put_unlock:
-	/* Drop local reference */
-	zswap_entry_put(entry);
-unlock:
-	spin_unlock(&tree->lock);
 	spin_lock(lock);
 	return ret;
 }
@@ -1408,9 +1377,9 @@ static void __zswap_load(struct zswap_entry *entry, struct page *page)
  * freed.
  */
 static int zswap_writeback_entry(struct zswap_entry *entry,
-				 struct zswap_tree *tree)
+				 swp_entry_t swpentry)
 {
-	swp_entry_t swpentry = entry->swpentry;
+	struct zswap_tree *tree;
 	struct folio *folio;
 	struct mempolicy *mpol;
 	bool folio_was_allocated;
@@ -1426,9 +1395,11 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 		return -ENOMEM;
 
 	/*
-	 * Found an existing folio, we raced with load/swapin. We generally
-	 * writeback cold folios from zswap, and swapin means the folio just
-	 * became hot. Skip this folio and let the caller find another one.
+	 * Found an existing folio, we raced with swapin or concurrent
+	 * shrinker. We generally writeback cold folios from zswap, and
+	 * swapin means the folio just became hot, so skip this folio.
+	 * For unlikely concurrent shrinker case, it will be unlinked
+	 * and freed when invalidated by the concurrent shrinker anyway.
 	 */
 	if (!folio_was_allocated) {
 		folio_put(folio);
@@ -1442,18 +1413,31 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 	 * backs (our zswap_entry reference doesn't prevent that), to
 	 * avoid overwriting a new swap folio with old compressed data.
 	 */
+	tree = swap_zswap_tree(swpentry);
 	spin_lock(&tree->lock);
-	if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) {
+	if (zswap_rb_search(&tree->rbroot, swp_offset(swpentry)) != entry) {
 		spin_unlock(&tree->lock);
 		delete_from_swap_cache(folio);
 		folio_unlock(folio);
 		folio_put(folio);
 		return -ENOMEM;
 	}
+
+	/* Safe to deref entry after the entry is verified above. */
+	zswap_entry_get(entry);
 	spin_unlock(&tree->lock);
 
 	__zswap_load(entry, &folio->page);
 
+	count_vm_event(ZSWPWB);
+	if (entry->objcg)
+		count_objcg_event(entry->objcg, ZSWPWB);
+
+	spin_lock(&tree->lock);
+	zswap_invalidate_entry(tree, entry);
+	zswap_entry_put(entry);
+	spin_unlock(&tree->lock);
+
 	/* folio is up to date */
 	folio_mark_uptodate(folio);
 

From 26a7f1648f07aa0c5c856b64b5b82b1233f4f9cd Mon Sep 17 00:00:00 2001
From: Chengming Zhou <zhouchengming@bytedance.com>
Date: Sun, 28 Jan 2024 13:28:51 +0000
Subject: [PATCH 516/707] mm/list_lru: remove list_lru_putback()

Since the only user zswap_lru_putback() has gone, remove
list_lru_putback() too.

Link: https://lkml.kernel.org/r/20240126-zswap-writeback-race-v2-3-b10479847099@bytedance.com
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Acked-by: Yosry Ahmed <yosryahmed@google.com>
Cc: Chris Li <chriscli@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Nhat Pham <nphamcs@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/list_lru.h | 16 ----------------
 mm/list_lru.c            | 14 --------------
 mm/zswap.c               |  2 +-
 3 files changed, 1 insertion(+), 31 deletions(-)

diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index c679e6b293c4c4..f2882a82069027 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -168,22 +168,6 @@ static inline unsigned long list_lru_count(struct list_lru *lru)
 void list_lru_isolate(struct list_lru_one *list, struct list_head *item);
 void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
 			   struct list_head *head);
-/**
- * list_lru_putback: undo list_lru_isolate
- * @lru: the lru pointer.
- * @item: the item to put back.
- * @nid: the node id of the sublist to put the item back to.
- * @memcg: the cgroup of the sublist to put the item back to.
- *
- * Put back an isolated item into its original LRU. Note that unlike
- * list_lru_add, this does not increment the node LRU count (as
- * list_lru_isolate does not originally decrement this count).
- *
- * Since we might have dropped the LRU lock in between, recompute list_lru_one
- * from the node's id and memcg.
- */
-void list_lru_putback(struct list_lru *lru, struct list_head *item, int nid,
-		      struct mem_cgroup *memcg);
 
 typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item,
 		struct list_lru_one *list, spinlock_t *lock, void *cb_arg);
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 158781d1d3c215..61f3b6b1134fbe 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -162,20 +162,6 @@ void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
 }
 EXPORT_SYMBOL_GPL(list_lru_isolate_move);
 
-void list_lru_putback(struct list_lru *lru, struct list_head *item, int nid,
-		      struct mem_cgroup *memcg)
-{
-	struct list_lru_one *list =
-		list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
-
-	if (list_empty(item)) {
-		list_add_tail(item, &list->list);
-		if (!list->nr_items++)
-			set_shrinker_bit(memcg, nid, lru_shrinker_id(lru));
-	}
-}
-EXPORT_SYMBOL_GPL(list_lru_putback);
-
 unsigned long list_lru_count_one(struct list_lru *lru,
 				 int nid, struct mem_cgroup *memcg)
 {
diff --git a/mm/zswap.c b/mm/zswap.c
index fcd31fb670cce2..7f88b3a77e4a84 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -411,7 +411,7 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
 	 * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The
 	 *    new entry will be added directly to memcg's parent's list_lru.
 	 *
-	 * Similar reasoning holds for list_lru_del() and list_lru_putback().
+	 * Similar reasoning holds for list_lru_del().
 	 */
 	rcu_read_lock();
 	memcg = mem_cgroup_from_entry(entry);

From d869d3fb362c51a59c173fdee050dc100ff68383 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Mon, 29 Jan 2024 11:07:01 +0100
Subject: [PATCH 517/707] stackdepot: use variable size records for
 non-evictable entries

With the introduction of stack depot evictions, each stack record is now
fixed size, so that future reuse after an eviction can safely store
differently sized stack traces.  In all cases that do not make use of
evictions, this wastes lots of space.

Fix it by re-introducing variable size stack records (up to the max
allowed size) for entries that will never be evicted.  We know if an entry
will never be evicted if the flag STACK_DEPOT_FLAG_GET is not provided,
since a later stack_depot_put() attempt is undefined behavior.

With my current kernel config that enables KASAN and also SLUB owner
tracking, I observe (after a kernel boot) a whopping reduction of 296
stack depot pools, which translates into 4736 KiB saved.  The savings here
are from SLUB owner tracking only, because KASAN generic mode still uses
refcounting.

Before:

  pools: 893
  allocations: 29841
  frees: 6524
  in_use: 23317
  freelist_size: 3454

After:

  pools: 597
  refcounted_allocations: 17547
  refcounted_frees: 6477
  refcounted_in_use: 11070
  freelist_size: 3497
  persistent_count: 12163
  persistent_bytes: 1717008

Link: https://lkml.kernel.org/r/20240129100708.39460-1-elver@google.com
Fixes: 108be8def46e ("lib/stackdepot: allow users to evict stack traces")
Signed-off-by: Marco Elver <elver@google.com>
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/poison.h |   3 +
 lib/stackdepot.c       | 250 +++++++++++++++++++++--------------------
 2 files changed, 130 insertions(+), 123 deletions(-)

diff --git a/include/linux/poison.h b/include/linux/poison.h
index 27a7dad17eefb8..1f0ee2459f2aa2 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -92,4 +92,7 @@
 /********** VFS **********/
 #define VFS_PTR_POISON ((void *)(0xF5 + POISON_POINTER_DELTA))
 
+/********** lib/stackdepot.c **********/
+#define STACK_DEPOT_POISON ((void *)(0xD390 + POISON_POINTER_DELTA))
+
 #endif
diff --git a/lib/stackdepot.c b/lib/stackdepot.c
index 5caa1f56655384..8f3b2c84ec2db3 100644
--- a/lib/stackdepot.c
+++ b/lib/stackdepot.c
@@ -22,6 +22,7 @@
 #include <linux/list.h>
 #include <linux/mm.h>
 #include <linux/mutex.h>
+#include <linux/poison.h>
 #include <linux/printk.h>
 #include <linux/rculist.h>
 #include <linux/rcupdate.h>
@@ -43,17 +44,7 @@
 #define DEPOT_OFFSET_BITS (DEPOT_POOL_ORDER + PAGE_SHIFT - DEPOT_STACK_ALIGN)
 #define DEPOT_POOL_INDEX_BITS (DEPOT_HANDLE_BITS - DEPOT_OFFSET_BITS - \
 			       STACK_DEPOT_EXTRA_BITS)
-#if IS_ENABLED(CONFIG_KMSAN) && CONFIG_STACKDEPOT_MAX_FRAMES >= 32
-/*
- * KMSAN is frequently used in fuzzing scenarios and thus saves a lot of stack
- * traces. As KMSAN does not support evicting stack traces from the stack
- * depot, the stack depot capacity might be reached quickly with large stack
- * records. Adjust the maximum number of stack depot pools for this case.
- */
-#define DEPOT_POOLS_CAP (8192 * (CONFIG_STACKDEPOT_MAX_FRAMES / 16))
-#else
 #define DEPOT_POOLS_CAP 8192
-#endif
 #define DEPOT_MAX_POOLS \
 	(((1LL << (DEPOT_POOL_INDEX_BITS)) < DEPOT_POOLS_CAP) ? \
 	 (1LL << (DEPOT_POOL_INDEX_BITS)) : DEPOT_POOLS_CAP)
@@ -93,9 +84,6 @@ struct stack_record {
 	};
 };
 
-#define DEPOT_STACK_RECORD_SIZE \
-	ALIGN(sizeof(struct stack_record), 1 << DEPOT_STACK_ALIGN)
-
 static bool stack_depot_disabled;
 static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT);
 static bool __stack_depot_early_init_passed __initdata;
@@ -121,32 +109,31 @@ static void *stack_pools[DEPOT_MAX_POOLS];
 static void *new_pool;
 /* Number of pools in stack_pools. */
 static int pools_num;
+/* Offset to the unused space in the currently used pool. */
+static size_t pool_offset = DEPOT_POOL_SIZE;
 /* Freelist of stack records within stack_pools. */
 static LIST_HEAD(free_stacks);
-/*
- * Stack depot tries to keep an extra pool allocated even before it runs out
- * of space in the currently used pool. This flag marks whether this extra pool
- * needs to be allocated. It has the value 0 when either an extra pool is not
- * yet allocated or if the limit on the number of pools is reached.
- */
-static bool new_pool_required = true;
 /* The lock must be held when performing pool or freelist modifications. */
 static DEFINE_RAW_SPINLOCK(pool_lock);
 
 /* Statistics counters for debugfs. */
 enum depot_counter_id {
-	DEPOT_COUNTER_ALLOCS,
-	DEPOT_COUNTER_FREES,
-	DEPOT_COUNTER_INUSE,
+	DEPOT_COUNTER_REFD_ALLOCS,
+	DEPOT_COUNTER_REFD_FREES,
+	DEPOT_COUNTER_REFD_INUSE,
 	DEPOT_COUNTER_FREELIST_SIZE,
+	DEPOT_COUNTER_PERSIST_COUNT,
+	DEPOT_COUNTER_PERSIST_BYTES,
 	DEPOT_COUNTER_COUNT,
 };
 static long counters[DEPOT_COUNTER_COUNT];
 static const char *const counter_names[] = {
-	[DEPOT_COUNTER_ALLOCS]		= "allocations",
-	[DEPOT_COUNTER_FREES]		= "frees",
-	[DEPOT_COUNTER_INUSE]		= "in_use",
+	[DEPOT_COUNTER_REFD_ALLOCS]	= "refcounted_allocations",
+	[DEPOT_COUNTER_REFD_FREES]	= "refcounted_frees",
+	[DEPOT_COUNTER_REFD_INUSE]	= "refcounted_in_use",
 	[DEPOT_COUNTER_FREELIST_SIZE]	= "freelist_size",
+	[DEPOT_COUNTER_PERSIST_COUNT]	= "persistent_count",
+	[DEPOT_COUNTER_PERSIST_BYTES]	= "persistent_bytes",
 };
 static_assert(ARRAY_SIZE(counter_names) == DEPOT_COUNTER_COUNT);
 
@@ -294,48 +281,52 @@ int stack_depot_init(void)
 EXPORT_SYMBOL_GPL(stack_depot_init);
 
 /*
- * Initializes new stack depot @pool, release all its entries to the freelist,
- * and update the list of pools.
+ * Initializes new stack pool, and updates the list of pools.
  */
-static void depot_init_pool(void *pool)
+static bool depot_init_pool(void **prealloc)
 {
-	int offset;
-
 	lockdep_assert_held(&pool_lock);
 
-	/* Initialize handles and link stack records into the freelist. */
-	for (offset = 0; offset <= DEPOT_POOL_SIZE - DEPOT_STACK_RECORD_SIZE;
-	     offset += DEPOT_STACK_RECORD_SIZE) {
-		struct stack_record *stack = pool + offset;
-
-		stack->handle.pool_index = pools_num;
-		stack->handle.offset = offset >> DEPOT_STACK_ALIGN;
-		stack->handle.extra = 0;
-
-		/*
-		 * Stack traces of size 0 are never saved, and we can simply use
-		 * the size field as an indicator if this is a new unused stack
-		 * record in the freelist.
-		 */
-		stack->size = 0;
+	if (unlikely(pools_num >= DEPOT_MAX_POOLS)) {
+		/* Bail out if we reached the pool limit. */
+		WARN_ON_ONCE(pools_num > DEPOT_MAX_POOLS); /* should never happen */
+		WARN_ON_ONCE(!new_pool); /* to avoid unnecessary pre-allocation */
+		WARN_ONCE(1, "Stack depot reached limit capacity");
+		return false;
+	}
 
-		INIT_LIST_HEAD(&stack->hash_list);
-		/*
-		 * Add to the freelist front to prioritize never-used entries:
-		 * required in case there are entries in the freelist, but their
-		 * RCU cookie still belongs to the current RCU grace period
-		 * (there can still be concurrent readers).
-		 */
-		list_add(&stack->free_list, &free_stacks);
-		counters[DEPOT_COUNTER_FREELIST_SIZE]++;
+	if (!new_pool && *prealloc) {
+		/* We have preallocated memory, use it. */
+		WRITE_ONCE(new_pool, *prealloc);
+		*prealloc = NULL;
 	}
 
+	if (!new_pool)
+		return false; /* new_pool and *prealloc are NULL */
+
 	/* Save reference to the pool to be used by depot_fetch_stack(). */
-	stack_pools[pools_num] = pool;
+	stack_pools[pools_num] = new_pool;
+
+	/*
+	 * Stack depot tries to keep an extra pool allocated even before it runs
+	 * out of space in the currently used pool.
+	 *
+	 * To indicate that a new preallocation is needed new_pool is reset to
+	 * NULL; do not reset to NULL if we have reached the maximum number of
+	 * pools.
+	 */
+	if (pools_num < DEPOT_MAX_POOLS)
+		WRITE_ONCE(new_pool, NULL);
+	else
+		WRITE_ONCE(new_pool, STACK_DEPOT_POISON);
 
 	/* Pairs with concurrent READ_ONCE() in depot_fetch_stack(). */
 	WRITE_ONCE(pools_num, pools_num + 1);
 	ASSERT_EXCLUSIVE_WRITER(pools_num);
+
+	pool_offset = 0;
+
+	return true;
 }
 
 /* Keeps the preallocated memory to be used for a new stack depot pool. */
@@ -347,63 +338,51 @@ static void depot_keep_new_pool(void **prealloc)
 	 * If a new pool is already saved or the maximum number of
 	 * pools is reached, do not use the preallocated memory.
 	 */
-	if (!new_pool_required)
+	if (new_pool)
 		return;
 
-	/*
-	 * Use the preallocated memory for the new pool
-	 * as long as we do not exceed the maximum number of pools.
-	 */
-	if (pools_num < DEPOT_MAX_POOLS) {
-		new_pool = *prealloc;
-		*prealloc = NULL;
-	}
-
-	/*
-	 * At this point, either a new pool is kept or the maximum
-	 * number of pools is reached. In either case, take note that
-	 * keeping another pool is not required.
-	 */
-	WRITE_ONCE(new_pool_required, false);
+	WRITE_ONCE(new_pool, *prealloc);
+	*prealloc = NULL;
 }
 
 /*
- * Try to initialize a new stack depot pool from either a previous or the
- * current pre-allocation, and release all its entries to the freelist.
+ * Try to initialize a new stack record from the current pool, a cached pool, or
+ * the current pre-allocation.
  */
-static bool depot_try_init_pool(void **prealloc)
+static struct stack_record *depot_pop_free_pool(void **prealloc, size_t size)
 {
+	struct stack_record *stack;
+	void *current_pool;
+	u32 pool_index;
+
 	lockdep_assert_held(&pool_lock);
 
-	/* Check if we have a new pool saved and use it. */
-	if (new_pool) {
-		depot_init_pool(new_pool);
-		new_pool = NULL;
+	if (pool_offset + size > DEPOT_POOL_SIZE) {
+		if (!depot_init_pool(prealloc))
+			return NULL;
+	}
 
-		/* Take note that we might need a new new_pool. */
-		if (pools_num < DEPOT_MAX_POOLS)
-			WRITE_ONCE(new_pool_required, true);
+	if (WARN_ON_ONCE(pools_num < 1))
+		return NULL;
+	pool_index = pools_num - 1;
+	current_pool = stack_pools[pool_index];
+	if (WARN_ON_ONCE(!current_pool))
+		return NULL;
 
-		return true;
-	}
+	stack = current_pool + pool_offset;
 
-	/* Bail out if we reached the pool limit. */
-	if (unlikely(pools_num >= DEPOT_MAX_POOLS)) {
-		WARN_ONCE(1, "Stack depot reached limit capacity");
-		return false;
-	}
+	/* Pre-initialize handle once. */
+	stack->handle.pool_index = pool_index;
+	stack->handle.offset = pool_offset >> DEPOT_STACK_ALIGN;
+	stack->handle.extra = 0;
+	INIT_LIST_HEAD(&stack->hash_list);
 
-	/* Check if we have preallocated memory and use it. */
-	if (*prealloc) {
-		depot_init_pool(*prealloc);
-		*prealloc = NULL;
-		return true;
-	}
+	pool_offset += size;
 
-	return false;
+	return stack;
 }
 
-/* Try to find next free usable entry. */
+/* Try to find next free usable entry from the freelist. */
 static struct stack_record *depot_pop_free(void)
 {
 	struct stack_record *stack;
@@ -420,7 +399,7 @@ static struct stack_record *depot_pop_free(void)
 	 * check the first entry.
 	 */
 	stack = list_first_entry(&free_stacks, struct stack_record, free_list);
-	if (stack->size && !poll_state_synchronize_rcu(stack->rcu_state))
+	if (!poll_state_synchronize_rcu(stack->rcu_state))
 		return NULL;
 
 	list_del(&stack->free_list);
@@ -429,48 +408,73 @@ static struct stack_record *depot_pop_free(void)
 	return stack;
 }
 
+static inline size_t depot_stack_record_size(struct stack_record *s, unsigned int nr_entries)
+{
+	const size_t used = flex_array_size(s, entries, nr_entries);
+	const size_t unused = sizeof(s->entries) - used;
+
+	WARN_ON_ONCE(sizeof(s->entries) < used);
+
+	return ALIGN(sizeof(struct stack_record) - unused, 1 << DEPOT_STACK_ALIGN);
+}
+
 /* Allocates a new stack in a stack depot pool. */
 static struct stack_record *
-depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc)
+depot_alloc_stack(unsigned long *entries, int nr_entries, u32 hash, depot_flags_t flags, void **prealloc)
 {
-	struct stack_record *stack;
+	struct stack_record *stack = NULL;
+	size_t record_size;
 
 	lockdep_assert_held(&pool_lock);
 
 	/* This should already be checked by public API entry points. */
-	if (WARN_ON_ONCE(!size))
+	if (WARN_ON_ONCE(!nr_entries))
 		return NULL;
 
-	/* Check if we have a stack record to save the stack trace. */
-	stack = depot_pop_free();
-	if (!stack) {
-		/* No usable entries on the freelist - try to refill the freelist. */
-		if (!depot_try_init_pool(prealloc))
-			return NULL;
+	/* Limit number of saved frames to CONFIG_STACKDEPOT_MAX_FRAMES. */
+	if (nr_entries > CONFIG_STACKDEPOT_MAX_FRAMES)
+		nr_entries = CONFIG_STACKDEPOT_MAX_FRAMES;
+
+	if (flags & STACK_DEPOT_FLAG_GET) {
+		/*
+		 * Evictable entries have to allocate the max. size so they may
+		 * safely be re-used by differently sized allocations.
+		 */
+		record_size = depot_stack_record_size(stack, CONFIG_STACKDEPOT_MAX_FRAMES);
 		stack = depot_pop_free();
-		if (WARN_ON(!stack))
-			return NULL;
+	} else {
+		record_size = depot_stack_record_size(stack, nr_entries);
 	}
 
-	/* Limit number of saved frames to CONFIG_STACKDEPOT_MAX_FRAMES. */
-	if (size > CONFIG_STACKDEPOT_MAX_FRAMES)
-		size = CONFIG_STACKDEPOT_MAX_FRAMES;
+	if (!stack) {
+		stack = depot_pop_free_pool(prealloc, record_size);
+		if (!stack)
+			return NULL;
+	}
 
 	/* Save the stack trace. */
 	stack->hash = hash;
-	stack->size = size;
-	/* stack->handle is already filled in by depot_init_pool(). */
-	refcount_set(&stack->count, 1);
-	memcpy(stack->entries, entries, flex_array_size(stack, entries, size));
+	stack->size = nr_entries;
+	/* stack->handle is already filled in by depot_pop_free_pool(). */
+	memcpy(stack->entries, entries, flex_array_size(stack, entries, nr_entries));
+
+	if (flags & STACK_DEPOT_FLAG_GET) {
+		refcount_set(&stack->count, 1);
+		counters[DEPOT_COUNTER_REFD_ALLOCS]++;
+		counters[DEPOT_COUNTER_REFD_INUSE]++;
+	} else {
+		/* Warn on attempts to switch to refcounting this entry. */
+		refcount_set(&stack->count, REFCOUNT_SATURATED);
+		counters[DEPOT_COUNTER_PERSIST_COUNT]++;
+		counters[DEPOT_COUNTER_PERSIST_BYTES] += record_size;
+	}
 
 	/*
 	 * Let KMSAN know the stored stack record is initialized. This shall
 	 * prevent false positive reports if instrumented code accesses it.
 	 */
-	kmsan_unpoison_memory(stack, DEPOT_STACK_RECORD_SIZE);
+	kmsan_unpoison_memory(stack, record_size);
 
-	counters[DEPOT_COUNTER_ALLOCS]++;
-	counters[DEPOT_COUNTER_INUSE]++;
 	return stack;
 }
 
@@ -538,8 +542,8 @@ static void depot_free_stack(struct stack_record *stack)
 	list_add_tail(&stack->free_list, &free_stacks);
 
 	counters[DEPOT_COUNTER_FREELIST_SIZE]++;
-	counters[DEPOT_COUNTER_FREES]++;
-	counters[DEPOT_COUNTER_INUSE]--;
+	counters[DEPOT_COUNTER_REFD_FREES]++;
+	counters[DEPOT_COUNTER_REFD_INUSE]--;
 
 	printk_deferred_exit();
 	raw_spin_unlock_irqrestore(&pool_lock, flags);
@@ -660,7 +664,7 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
 	 * Allocate memory for a new pool if required now:
 	 * we won't be able to do that under the lock.
 	 */
-	if (unlikely(can_alloc && READ_ONCE(new_pool_required))) {
+	if (unlikely(can_alloc && !READ_ONCE(new_pool))) {
 		/*
 		 * Zero out zone modifiers, as we don't have specific zone
 		 * requirements. Keep the flags related to allocation in atomic
@@ -681,7 +685,7 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
 	found = find_stack(bucket, entries, nr_entries, hash, depot_flags);
 	if (!found) {
 		struct stack_record *new =
-			depot_alloc_stack(entries, nr_entries, hash, &prealloc);
+			depot_alloc_stack(entries, nr_entries, hash, depot_flags, &prealloc);
 
 		if (new) {
 			/*

From 2ef8127ce56d76a4f861e7f0c40e241409c18087 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Mon, 29 Jan 2024 11:07:02 +0100
Subject: [PATCH 518/707] kasan: revert eviction of stack traces in generic
 mode

This partially reverts commits cc478e0b6bdf, 63b85ac56a64, 08d7c94d9635,
a414d4286f34, and 773688a6cb24 to make use of variable-sized stack depot
records, since eviction of stack entries from stack depot forces fixed-
sized stack records.  Care was taken to retain the code cleanups by the
above commits.

Eviction was added to generic KASAN as a response to alleviating the
additional memory usage from fixed-sized stack records, but this still
uses more memory than previously.

With the re-introduction of variable-sized records for stack depot, we can
just switch back to non-evictable stack records again, and return back to
the previous performance and memory usage baseline.

Before (observed after a KASAN kernel boot):

  pools: 597
  refcounted_allocations: 17547
  refcounted_frees: 6477
  refcounted_in_use: 11070
  freelist_size: 3497
  persistent_count: 12163
  persistent_bytes: 1717008

After:

  pools: 319
  refcounted_allocations: 0
  refcounted_frees: 0
  refcounted_in_use: 0
  freelist_size: 0
  persistent_count: 29397
  persistent_bytes: 5183536

As can be seen from the counters, with a generic KASAN config, refcounted
allocations and evictions are no longer used.  Due to using variable-sized
records, I observe a reduction of 278 stack depot pools (saving 4448 KiB)
with my test setup.

Link: https://lkml.kernel.org/r/20240129100708.39460-2-elver@google.com
Fixes: cc478e0b6bdf ("kasan: avoid resetting aux_lock")
Fixes: 63b85ac56a64 ("kasan: stop leaking stack trace handles")
Fixes: 08d7c94d9635 ("kasan: memset free track in qlink_free")
Fixes: a414d4286f34 ("kasan: handle concurrent kasan_record_aux_stack calls")
Fixes: 773688a6cb24 ("kasan: use stack_depot_put for Generic mode")
Signed-off-by: Marco Elver <elver@google.com>
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/kasan/common.c     |  8 ++---
 mm/kasan/generic.c    | 68 +++++--------------------------------------
 mm/kasan/kasan.h      | 10 -------
 mm/kasan/quarantine.c |  5 +++-
 4 files changed, 14 insertions(+), 77 deletions(-)

diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 610efae9122094..6ca63e8dda741b 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -65,8 +65,7 @@ void kasan_save_track(struct kasan_track *track, gfp_t flags)
 {
 	depot_stack_handle_t stack;
 
-	stack = kasan_save_stack(flags,
-			STACK_DEPOT_FLAG_CAN_ALLOC | STACK_DEPOT_FLAG_GET);
+	stack = kasan_save_stack(flags, STACK_DEPOT_FLAG_CAN_ALLOC);
 	kasan_set_track(track, stack);
 }
 
@@ -266,10 +265,9 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object,
 		return true;
 
 	/*
-	 * If the object is not put into quarantine, it will likely be quickly
-	 * reallocated. Thus, release its metadata now.
+	 * Note: Keep per-object metadata to allow KASAN print stack traces for
+	 * use-after-free-before-realloc bugs.
 	 */
-	kasan_release_object_meta(cache, object);
 
 	/* Let slab put the object onto the freelist. */
 	return false;
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index df6627f62402c0..fc9cf1860efb34 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -485,16 +485,6 @@ void kasan_init_object_meta(struct kmem_cache *cache, const void *object)
 	if (alloc_meta) {
 		/* Zero out alloc meta to mark it as invalid. */
 		__memset(alloc_meta, 0, sizeof(*alloc_meta));
-
-		/*
-		 * Prepare the lock for saving auxiliary stack traces.
-		 * Temporarily disable KASAN bug reporting to allow instrumented
-		 * raw_spin_lock_init to access aux_lock, which resides inside
-		 * of a redzone.
-		 */
-		kasan_disable_current();
-		raw_spin_lock_init(&alloc_meta->aux_lock);
-		kasan_enable_current();
 	}
 
 	/*
@@ -506,18 +496,8 @@ void kasan_init_object_meta(struct kmem_cache *cache, const void *object)
 
 static void release_alloc_meta(struct kasan_alloc_meta *meta)
 {
-	/* Evict the stack traces from stack depot. */
-	stack_depot_put(meta->alloc_track.stack);
-	stack_depot_put(meta->aux_stack[0]);
-	stack_depot_put(meta->aux_stack[1]);
-
-	/*
-	 * Zero out alloc meta to mark it as invalid but keep aux_lock
-	 * initialized to avoid having to reinitialize it when another object
-	 * is allocated in the same slot.
-	 */
-	__memset(&meta->alloc_track, 0, sizeof(meta->alloc_track));
-	__memset(meta->aux_stack, 0, sizeof(meta->aux_stack));
+	/* Zero out alloc meta to mark it as invalid. */
+	__memset(meta, 0, sizeof(*meta));
 }
 
 static void release_free_meta(const void *object, struct kasan_free_meta *meta)
@@ -526,27 +506,10 @@ static void release_free_meta(const void *object, struct kasan_free_meta *meta)
 	if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_SLAB_FREE_META)
 		return;
 
-	/* Evict the stack trace from the stack depot. */
-	stack_depot_put(meta->free_track.stack);
-
 	/* Mark free meta as invalid. */
 	*(u8 *)kasan_mem_to_shadow(object) = KASAN_SLAB_FREE;
 }
 
-void kasan_release_object_meta(struct kmem_cache *cache, const void *object)
-{
-	struct kasan_alloc_meta *alloc_meta;
-	struct kasan_free_meta *free_meta;
-
-	alloc_meta = kasan_get_alloc_meta(cache, object);
-	if (alloc_meta)
-		release_alloc_meta(alloc_meta);
-
-	free_meta = kasan_get_free_meta(cache, object);
-	if (free_meta)
-		release_free_meta(object, free_meta);
-}
-
 size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object)
 {
 	struct kasan_cache *info = &cache->kasan_info;
@@ -571,8 +534,6 @@ static void __kasan_record_aux_stack(void *addr, depot_flags_t depot_flags)
 	struct kmem_cache *cache;
 	struct kasan_alloc_meta *alloc_meta;
 	void *object;
-	depot_stack_handle_t new_handle, old_handle;
-	unsigned long flags;
 
 	if (is_kfence_address(addr) || !slab)
 		return;
@@ -583,33 +544,18 @@ static void __kasan_record_aux_stack(void *addr, depot_flags_t depot_flags)
 	if (!alloc_meta)
 		return;
 
-	new_handle = kasan_save_stack(0, depot_flags);
-
-	/*
-	 * Temporarily disable KASAN bug reporting to allow instrumented
-	 * spinlock functions to access aux_lock, which resides inside of a
-	 * redzone.
-	 */
-	kasan_disable_current();
-	raw_spin_lock_irqsave(&alloc_meta->aux_lock, flags);
-	old_handle = alloc_meta->aux_stack[1];
 	alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0];
-	alloc_meta->aux_stack[0] = new_handle;
-	raw_spin_unlock_irqrestore(&alloc_meta->aux_lock, flags);
-	kasan_enable_current();
-
-	stack_depot_put(old_handle);
+	alloc_meta->aux_stack[0] = kasan_save_stack(0, depot_flags);
 }
 
 void kasan_record_aux_stack(void *addr)
 {
-	return __kasan_record_aux_stack(addr,
-			STACK_DEPOT_FLAG_CAN_ALLOC | STACK_DEPOT_FLAG_GET);
+	return __kasan_record_aux_stack(addr, STACK_DEPOT_FLAG_CAN_ALLOC);
 }
 
 void kasan_record_aux_stack_noalloc(void *addr)
 {
-	return __kasan_record_aux_stack(addr, STACK_DEPOT_FLAG_GET);
+	return __kasan_record_aux_stack(addr, 0);
 }
 
 void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags)
@@ -620,7 +566,7 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags)
 	if (!alloc_meta)
 		return;
 
-	/* Evict previous stack traces (might exist for krealloc or mempool). */
+	/* Invalidate previous stack traces (might exist for krealloc or mempool). */
 	release_alloc_meta(alloc_meta);
 
 	kasan_save_track(&alloc_meta->alloc_track, flags);
@@ -634,7 +580,7 @@ void kasan_save_free_info(struct kmem_cache *cache, void *object)
 	if (!free_meta)
 		return;
 
-	/* Evict previous stack trace (might exist for mempool). */
+	/* Invalidate previous stack trace (might exist for mempool). */
 	release_free_meta(object, free_meta);
 
 	kasan_save_track(&free_meta->free_track, 0);
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index d0f172f2b9783f..fb2b9ac0659a7a 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -6,7 +6,6 @@
 #include <linux/kasan.h>
 #include <linux/kasan-tags.h>
 #include <linux/kfence.h>
-#include <linux/spinlock.h>
 #include <linux/stackdepot.h>
 
 #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
@@ -265,13 +264,6 @@ struct kasan_global {
 struct kasan_alloc_meta {
 	struct kasan_track alloc_track;
 	/* Free track is stored in kasan_free_meta. */
-	/*
-	 * aux_lock protects aux_stack from accesses from concurrent
-	 * kasan_record_aux_stack calls. It is a raw spinlock to avoid sleeping
-	 * on RT kernels, as kasan_record_aux_stack_noalloc can be called from
-	 * non-sleepable contexts.
-	 */
-	raw_spinlock_t aux_lock;
 	depot_stack_handle_t aux_stack[2];
 };
 
@@ -398,10 +390,8 @@ struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache,
 struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache,
 						const void *object);
 void kasan_init_object_meta(struct kmem_cache *cache, const void *object);
-void kasan_release_object_meta(struct kmem_cache *cache, const void *object);
 #else
 static inline void kasan_init_object_meta(struct kmem_cache *cache, const void *object) { }
-static inline void kasan_release_object_meta(struct kmem_cache *cache, const void *object) { }
 #endif
 
 depot_stack_handle_t kasan_save_stack(gfp_t flags, depot_flags_t depot_flags);
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index 3ba02efb952aac..6958aa713c67ee 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -145,7 +145,10 @@ static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache)
 	void *object = qlink_to_object(qlink, cache);
 	struct kasan_free_meta *free_meta = kasan_get_free_meta(cache, object);
 
-	kasan_release_object_meta(cache, object);
+	/*
+	 * Note: Keep per-object metadata to allow KASAN print stack traces for
+	 * use-after-free-before-realloc bugs.
+	 */
 
 	/*
 	 * If init_on_free is enabled and KASAN's free metadata is stored in

From 5ff5178495b3fd5624543b8bdcdd5ba5e7cafe24 Mon Sep 17 00:00:00 2001
From: Lance Yang <ioworker0@gmail.com>
Date: Mon, 29 Jan 2024 13:45:51 +0800
Subject: [PATCH 519/707] mm/khugepaged: bypassing unnecessary scans with
 MMF_DISABLE_THP check

khugepaged scans the entire address space in the background for each given
mm, looking for opportunities to merge sequences of basic pages into huge
pages.  However, when an mm is inserted to the mm_slots list, and the
MMF_DISABLE_THP flag is set later, this scanning process becomes
unnecessary for that mm and can be skipped to avoid redundant operations,
especially in scenarios with a large address space.

This commit introduces a check before each scanning process to test the
MMF_DISABLE_THP flag for the given mm; if the flag is set, the scanning
process is bypassed, thereby improving the efficiency of khugepaged.

Link: https://lkml.kernel.org/r/20240129054551.57728-1-ioworker0@gmail.com
Signed-off-by: Lance Yang <ioworker0@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Zach O'Keefe <zokeefe@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/khugepaged.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index fe43fbc4452539..2771fc043b3b8b 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -410,6 +410,12 @@ static inline int hpage_collapse_test_exit(struct mm_struct *mm)
 	return atomic_read(&mm->mm_users) == 0;
 }
 
+static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm)
+{
+	return hpage_collapse_test_exit(mm) ||
+	       test_bit(MMF_DISABLE_THP, &mm->flags);
+}
+
 void __khugepaged_enter(struct mm_struct *mm)
 {
 	struct khugepaged_mm_slot *mm_slot;
@@ -1422,7 +1428,7 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
 
 	lockdep_assert_held(&khugepaged_mm_lock);
 
-	if (hpage_collapse_test_exit(mm)) {
+	if (hpage_collapse_test_exit_or_disable(mm)) {
 		/* free mm_slot */
 		hash_del(&slot->hash);
 		list_del(&slot->mm_node);
@@ -2360,7 +2366,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
 		goto breakouterloop_mmap_lock;
 
 	progress++;
-	if (unlikely(hpage_collapse_test_exit(mm)))
+	if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
 		goto breakouterloop;
 
 	vma_iter_init(&vmi, mm, khugepaged_scan.address);
@@ -2368,7 +2374,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
 		unsigned long hstart, hend;
 
 		cond_resched();
-		if (unlikely(hpage_collapse_test_exit(mm))) {
+		if (unlikely(hpage_collapse_test_exit_or_disable(mm))) {
 			progress++;
 			break;
 		}
@@ -2390,7 +2396,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
 			bool mmap_locked = true;
 
 			cond_resched();
-			if (unlikely(hpage_collapse_test_exit(mm)))
+			if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
 				goto breakouterloop;
 
 			VM_BUG_ON(khugepaged_scan.address < hstart ||
@@ -2408,7 +2414,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
 				fput(file);
 				if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
 					mmap_read_lock(mm);
-					if (hpage_collapse_test_exit(mm))
+					if (hpage_collapse_test_exit_or_disable(mm))
 						goto breakouterloop;
 					*result = collapse_pte_mapped_thp(mm,
 						khugepaged_scan.address, false);
@@ -2450,7 +2456,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
 	 * Release the current mm_slot if this mm is about to die, or
 	 * if we scanned all vmas of this mm.
 	 */
-	if (hpage_collapse_test_exit(mm) || !vma) {
+	if (hpage_collapse_test_exit_or_disable(mm) || !vma) {
 		/*
 		 * Make sure that if mm_users is reaching zero while
 		 * khugepaged runs here, khugepaged_exit will find

From 36c6faa33b878745f30c544b2189d43aacec96ef Mon Sep 17 00:00:00 2001
From: Baolin Wang <baolin.wang@linux.alibaba.com>
Date: Mon, 22 Jan 2024 21:01:53 +0800
Subject: [PATCH 520/707] mm: compaction: limit the suitable target page order
 to be less than cc->order

It can not improve the fragmentation if we isolate the target free pages
exceeding cc->order, especially when the cc->order is less than
pageblock_order.  For example, suppose the pageblock_order is MAX_ORDER
(size is 4M) and cc->order is 2M THP size, we should not isolate other 2M
free pages to be the migration target, which can not improve the
fragmentation.

Moreover this is also applicable for large folio compaction.

Link: https://lkml.kernel.org/r/afcd9377351c259df7a25a388a4a0d5862b986f4.1705928395.git.baolin.wang@linux.alibaba.com
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/compaction.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index 335a6f6787e4e3..055687e6bd17b6 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1415,12 +1415,14 @@ static bool suitable_migration_target(struct compact_control *cc,
 {
 	/* If the page is a large free page, then disallow migration */
 	if (PageBuddy(page)) {
+		int order = cc->order > 0 ? cc->order : pageblock_order;
+
 		/*
 		 * We are checking page_order without zone->lock taken. But
 		 * the only small danger is that we skip a potentially suitable
 		 * pageblock, so it's not worth to check order for valid range.
 		 */
-		if (buddy_order_unsafe(page) >= pageblock_order)
+		if (buddy_order_unsafe(page) >= order)
 			return false;
 	}
 

From 3fba0bccc287b087a2262bc7f53dc7883e34e4e4 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 29 Jan 2024 20:36:37 -0500
Subject: [PATCH 521/707] mm: zswap: rename zswap_free_entry to
 zswap_entry_free

There is a zswap_entry_ namespace with multiple functions already.

Link: https://lkml.kernel.org/r/20240130014208.565554-2-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Nhat Pham <nphamcs@gmail.com>
Acked-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/zswap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index 7f88b3a77e4a84..173f2e6657de65 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -520,7 +520,7 @@ static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
  * Carries out the common pattern of freeing and entry's zpool allocation,
  * freeing the entry itself, and decrementing the number of stored pages.
  */
-static void zswap_free_entry(struct zswap_entry *entry)
+static void zswap_entry_free(struct zswap_entry *entry)
 {
 	if (!entry->length)
 		atomic_dec(&zswap_same_filled_pages);
@@ -555,7 +555,7 @@ static void zswap_entry_put(struct zswap_entry *entry)
 	WARN_ON_ONCE(refcount < 0);
 	if (refcount == 0) {
 		WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode));
-		zswap_free_entry(entry);
+		zswap_entry_free(entry);
 	}
 }
 

From 32068a78f7a4d137f14580e96db2d681342ad853 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 29 Jan 2024 20:36:38 -0500
Subject: [PATCH 522/707] mm: zswap: inline and remove zswap_entry_find_get()

There is only one caller and the function is trivial. Inline it.

Link: https://lkml.kernel.org/r/20240130014208.565554-3-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Acked-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/zswap.c | 17 ++---------------
 1 file changed, 2 insertions(+), 15 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index 173f2e6657de65..cf864aaa214d34 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -559,19 +559,6 @@ static void zswap_entry_put(struct zswap_entry *entry)
 	}
 }
 
-/* caller must hold the tree lock */
-static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
-				pgoff_t offset)
-{
-	struct zswap_entry *entry;
-
-	entry = zswap_rb_search(root, offset);
-	if (entry)
-		zswap_entry_get(entry);
-
-	return entry;
-}
-
 /*********************************
 * shrinker functions
 **********************************/
@@ -1708,13 +1695,13 @@ bool zswap_load(struct folio *folio)
 
 	VM_WARN_ON_ONCE(!folio_test_locked(folio));
 
-	/* find */
 	spin_lock(&tree->lock);
-	entry = zswap_entry_find_get(&tree->rbroot, offset);
+	entry = zswap_rb_search(&tree->rbroot, offset);
 	if (!entry) {
 		spin_unlock(&tree->lock);
 		return false;
 	}
+	zswap_entry_get(entry);
 	spin_unlock(&tree->lock);
 
 	if (entry->length)

From 4231a94f31606062d83b77c5704d3ef20e180b36 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 29 Jan 2024 20:36:39 -0500
Subject: [PATCH 523/707] mm: zswap: move zswap_invalidate_entry() to related
 functions

Move it up to the other tree and refcounting functions.

Link: https://lkml.kernel.org/r/20240130014208.565554-4-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Acked-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/zswap.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index cf864aaa214d34..9f05282efe3c22 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -559,6 +559,18 @@ static void zswap_entry_put(struct zswap_entry *entry)
 	}
 }
 
+/*
+ * If the entry is still valid in the tree, drop the initial ref and remove it
+ * from the tree. This function must be called with an additional ref held,
+ * otherwise it may race with another invalidation freeing the entry.
+ */
+static void zswap_invalidate_entry(struct zswap_tree *tree,
+				   struct zswap_entry *entry)
+{
+	if (zswap_rb_erase(&tree->rbroot, entry))
+		zswap_entry_put(entry);
+}
+
 /*********************************
 * shrinker functions
 **********************************/
@@ -809,18 +821,6 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
 	return NULL;
 }
 
-/*
- * If the entry is still valid in the tree, drop the initial ref and remove it
- * from the tree. This function must be called with an additional ref held,
- * otherwise it may race with another invalidation freeing the entry.
- */
-static void zswap_invalidate_entry(struct zswap_tree *tree,
-				   struct zswap_entry *entry)
-{
-	if (zswap_rb_erase(&tree->rbroot, entry))
-		zswap_entry_put(entry);
-}
-
 static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
 				       spinlock_t *lock, void *arg)
 {

From 1a274d5dddf116a0900f4c32c91f9a04bef7103d Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 29 Jan 2024 20:36:40 -0500
Subject: [PATCH 524/707] mm: zswap: warn when referencing a dead entry

Put a standard sanity check on zswap_entry_get() for UAF scenario.

Link: https://lkml.kernel.org/r/20240130014208.565554-5-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Acked-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/zswap.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/zswap.c b/mm/zswap.c
index 9f05282efe3c22..0c6adaf2fdb6a3 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -542,6 +542,7 @@ static void zswap_entry_free(struct zswap_entry *entry)
 /* caller must hold the tree lock */
 static void zswap_entry_get(struct zswap_entry *entry)
 {
+	WARN_ON_ONCE(!entry->refcount);
 	entry->refcount++;
 }
 

From a2c7d38cf4915b8235839c5451ef82939b3af7df Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 29 Jan 2024 20:36:41 -0500
Subject: [PATCH 525/707] mm: zswap: clean up zswap_entry_put()

Remove stale comment and unnecessary local variable.

Link: https://lkml.kernel.org/r/20240130014208.565554-6-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Reviewed-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/zswap.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index 0c6adaf2fdb6a3..7a7e8da2b4f8c3 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -546,15 +546,11 @@ static void zswap_entry_get(struct zswap_entry *entry)
 	entry->refcount++;
 }
 
-/* caller must hold the tree lock
-* remove from the tree and free it, if nobody reference the entry
-*/
+/* caller must hold the tree lock */
 static void zswap_entry_put(struct zswap_entry *entry)
 {
-	int refcount = --entry->refcount;
-
-	WARN_ON_ONCE(refcount < 0);
-	if (refcount == 0) {
+	WARN_ON_ONCE(!entry->refcount);
+	if (--entry->refcount == 0) {
 		WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode));
 		zswap_entry_free(entry);
 	}

From a4a9d1ffefbb68b09050284788de93921ec9ab15 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 29 Jan 2024 20:36:42 -0500
Subject: [PATCH 526/707] mm: zswap: rename __zswap_load() to
 zswap_decompress()

Link: https://lkml.kernel.org/r/20240130014208.565554-7-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Acked-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/zswap.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index 7a7e8da2b4f8c3..bdc9f82fe4b90d 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1316,7 +1316,7 @@ static int zswap_enabled_param_set(const char *val,
 	return ret;
 }
 
-static void __zswap_load(struct zswap_entry *entry, struct page *page)
+static void zswap_decompress(struct zswap_entry *entry, struct page *page)
 {
 	struct zpool *zpool = zswap_find_zpool(entry);
 	struct scatterlist input, output;
@@ -1411,7 +1411,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 	zswap_entry_get(entry);
 	spin_unlock(&tree->lock);
 
-	__zswap_load(entry, &folio->page);
+	zswap_decompress(entry, &folio->page);
 
 	count_vm_event(ZSWPWB);
 	if (entry->objcg)
@@ -1702,7 +1702,7 @@ bool zswap_load(struct folio *folio)
 	spin_unlock(&tree->lock);
 
 	if (entry->length)
-		__zswap_load(entry, page);
+		zswap_decompress(entry, page);
 	else {
 		dst = kmap_local_page(page);
 		zswap_fill_page(dst, entry->value);

From 6db85df064f864e16eefdca06c475a340a257cce Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 29 Jan 2024 20:36:43 -0500
Subject: [PATCH 527/707] mm: zswap: break out zwap_compress()

zswap_store() is long and mixes work at the zswap layer with work at
the backend and compression layer. Move compression & backend work to
zswap_compress(), mirroring zswap_decompress().

Link: https://lkml.kernel.org/r/20240130014208.565554-8-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Acked-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/zswap.c | 145 ++++++++++++++++++++++++++++-------------------------
 1 file changed, 77 insertions(+), 68 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index bdc9f82fe4b90d..f9b9494156ba60 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1316,6 +1316,79 @@ static int zswap_enabled_param_set(const char *val,
 	return ret;
 }
 
+static bool zswap_compress(struct folio *folio, struct zswap_entry *entry)
+{
+	struct crypto_acomp_ctx *acomp_ctx;
+	struct scatterlist input, output;
+	unsigned int dlen = PAGE_SIZE;
+	unsigned long handle;
+	struct zpool *zpool;
+	char *buf;
+	gfp_t gfp;
+	int ret;
+	u8 *dst;
+
+	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
+
+	mutex_lock(&acomp_ctx->mutex);
+
+	dst = acomp_ctx->buffer;
+	sg_init_table(&input, 1);
+	sg_set_page(&input, &folio->page, PAGE_SIZE, 0);
+
+	/*
+	 * We need PAGE_SIZE * 2 here since there maybe over-compression case,
+	 * and hardware-accelerators may won't check the dst buffer size, so
+	 * giving the dst buffer with enough length to avoid buffer overflow.
+	 */
+	sg_init_one(&output, dst, PAGE_SIZE * 2);
+	acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
+
+	/*
+	 * it maybe looks a little bit silly that we send an asynchronous request,
+	 * then wait for its completion synchronously. This makes the process look
+	 * synchronous in fact.
+	 * Theoretically, acomp supports users send multiple acomp requests in one
+	 * acomp instance, then get those requests done simultaneously. but in this
+	 * case, zswap actually does store and load page by page, there is no
+	 * existing method to send the second page before the first page is done
+	 * in one thread doing zwap.
+	 * but in different threads running on different cpu, we have different
+	 * acomp instance, so multiple threads can do (de)compression in parallel.
+	 */
+	ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
+	dlen = acomp_ctx->req->dlen;
+	if (ret) {
+		zswap_reject_compress_fail++;
+		goto unlock;
+	}
+
+	zpool = zswap_find_zpool(entry);
+	gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
+	if (zpool_malloc_support_movable(zpool))
+		gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
+	ret = zpool_malloc(zpool, dlen, gfp, &handle);
+	if (ret == -ENOSPC) {
+		zswap_reject_compress_poor++;
+		goto unlock;
+	}
+	if (ret) {
+		zswap_reject_alloc_fail++;
+		goto unlock;
+	}
+
+	buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
+	memcpy(buf, dst, dlen);
+	zpool_unmap_handle(zpool, handle);
+
+	entry->handle = handle;
+	entry->length = dlen;
+
+unlock:
+	mutex_unlock(&acomp_ctx->mutex);
+	return ret == 0;
+}
+
 static void zswap_decompress(struct zswap_entry *entry, struct page *page)
 {
 	struct zpool *zpool = zswap_find_zpool(entry);
@@ -1472,18 +1545,11 @@ bool zswap_store(struct folio *folio)
 	struct page *page = &folio->page;
 	struct zswap_tree *tree = swap_zswap_tree(swp);
 	struct zswap_entry *entry, *dupentry;
-	struct scatterlist input, output;
-	struct crypto_acomp_ctx *acomp_ctx;
 	struct obj_cgroup *objcg = NULL;
 	struct mem_cgroup *memcg = NULL;
 	struct zswap_pool *pool;
-	struct zpool *zpool;
-	unsigned int dlen = PAGE_SIZE;
-	unsigned long handle, value;
-	char *buf;
-	u8 *src, *dst;
-	gfp_t gfp;
-	int ret;
+	unsigned long value;
+	u8 *src;
 
 	VM_WARN_ON_ONCE(!folio_test_locked(folio));
 	VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
@@ -1568,65 +1634,10 @@ bool zswap_store(struct folio *folio)
 		mem_cgroup_put(memcg);
 	}
 
-	/* compress */
-	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
-
-	mutex_lock(&acomp_ctx->mutex);
-
-	dst = acomp_ctx->buffer;
-	sg_init_table(&input, 1);
-	sg_set_page(&input, &folio->page, PAGE_SIZE, 0);
+	if (!zswap_compress(folio, entry))
+		goto put_pool;
 
-	/*
-	 * We need PAGE_SIZE * 2 here since there maybe over-compression case,
-	 * and hardware-accelerators may won't check the dst buffer size, so
-	 * giving the dst buffer with enough length to avoid buffer overflow.
-	 */
-	sg_init_one(&output, dst, PAGE_SIZE * 2);
-	acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
-	/*
-	 * it maybe looks a little bit silly that we send an asynchronous request,
-	 * then wait for its completion synchronously. This makes the process look
-	 * synchronous in fact.
-	 * Theoretically, acomp supports users send multiple acomp requests in one
-	 * acomp instance, then get those requests done simultaneously. but in this
-	 * case, zswap actually does store and load page by page, there is no
-	 * existing method to send the second page before the first page is done
-	 * in one thread doing zwap.
-	 * but in different threads running on different cpu, we have different
-	 * acomp instance, so multiple threads can do (de)compression in parallel.
-	 */
-	ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
-	dlen = acomp_ctx->req->dlen;
-
-	if (ret) {
-		zswap_reject_compress_fail++;
-		goto put_dstmem;
-	}
-
-	/* store */
-	zpool = zswap_find_zpool(entry);
-	gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
-	if (zpool_malloc_support_movable(zpool))
-		gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
-	ret = zpool_malloc(zpool, dlen, gfp, &handle);
-	if (ret == -ENOSPC) {
-		zswap_reject_compress_poor++;
-		goto put_dstmem;
-	}
-	if (ret) {
-		zswap_reject_alloc_fail++;
-		goto put_dstmem;
-	}
-	buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
-	memcpy(buf, dst, dlen);
-	zpool_unmap_handle(zpool, handle);
-	mutex_unlock(&acomp_ctx->mutex);
-
-	/* populate entry */
 	entry->swpentry = swp;
-	entry->handle = handle;
-	entry->length = dlen;
 
 insert_entry:
 	entry->objcg = objcg;
@@ -1663,8 +1674,6 @@ bool zswap_store(struct folio *folio)
 
 	return true;
 
-put_dstmem:
-	mutex_unlock(&acomp_ctx->mutex);
 put_pool:
 	zswap_pool_put(entry->pool);
 freepage:

From b1a2a0ed5d831cd9bb755e170260626f2fa43223 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 29 Jan 2024 20:36:44 -0500
Subject: [PATCH 528/707] mm: zswap: further cleanup zswap_store()

- Remove dupentry, reusing entry works just fine.
- Rename pool to shrink_pool, as this one actually is confusing.
- Remove page, use folio_nid() and kmap_local_folio() directly.
- Set entry->swpentry in a common path.
- Move value and src to local scope of use.

Link: https://lkml.kernel.org/r/20240130014208.565554-9-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Acked-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/zswap.c | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index f9b9494156ba60..cde309c539b337 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1542,14 +1542,11 @@ bool zswap_store(struct folio *folio)
 {
 	swp_entry_t swp = folio->swap;
 	pgoff_t offset = swp_offset(swp);
-	struct page *page = &folio->page;
 	struct zswap_tree *tree = swap_zswap_tree(swp);
 	struct zswap_entry *entry, *dupentry;
 	struct obj_cgroup *objcg = NULL;
 	struct mem_cgroup *memcg = NULL;
-	struct zswap_pool *pool;
-	unsigned long value;
-	u8 *src;
+	struct zswap_pool *shrink_pool;
 
 	VM_WARN_ON_ONCE(!folio_test_locked(folio));
 	VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
@@ -1567,10 +1564,10 @@ bool zswap_store(struct folio *folio)
 	 * the tree, and it might be written back overriding the new data.
 	 */
 	spin_lock(&tree->lock);
-	dupentry = zswap_rb_search(&tree->rbroot, offset);
-	if (dupentry) {
+	entry = zswap_rb_search(&tree->rbroot, offset);
+	if (entry) {
+		zswap_invalidate_entry(tree, entry);
 		zswap_duplicate_entry++;
-		zswap_invalidate_entry(tree, dupentry);
 	}
 	spin_unlock(&tree->lock);
 	objcg = get_obj_cgroup_from_folio(folio);
@@ -1598,17 +1595,19 @@ bool zswap_store(struct folio *folio)
 	}
 
 	/* allocate entry */
-	entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
+	entry = zswap_entry_cache_alloc(GFP_KERNEL, folio_nid(folio));
 	if (!entry) {
 		zswap_reject_kmemcache_fail++;
 		goto reject;
 	}
 
 	if (zswap_same_filled_pages_enabled) {
-		src = kmap_local_page(page);
+		unsigned long value;
+		u8 *src;
+
+		src = kmap_local_folio(folio, 0);
 		if (zswap_is_page_same_filled(src, &value)) {
 			kunmap_local(src);
-			entry->swpentry = swp;
 			entry->length = 0;
 			entry->value = value;
 			atomic_inc(&zswap_same_filled_pages);
@@ -1637,9 +1636,8 @@ bool zswap_store(struct folio *folio)
 	if (!zswap_compress(folio, entry))
 		goto put_pool;
 
-	entry->swpentry = swp;
-
 insert_entry:
+	entry->swpentry = swp;
 	entry->objcg = objcg;
 	if (objcg) {
 		obj_cgroup_charge_zswap(objcg, entry->length);
@@ -1684,9 +1682,9 @@ bool zswap_store(struct folio *folio)
 	return false;
 
 shrink:
-	pool = zswap_pool_last_get();
-	if (pool && !queue_work(shrink_wq, &pool->shrink_work))
-		zswap_pool_put(pool);
+	shrink_pool = zswap_pool_last_get();
+	if (shrink_pool && !queue_work(shrink_wq, &shrink_pool->shrink_work))
+		zswap_pool_put(shrink_pool);
 	goto reject;
 }
 

From c9070092b658e578e5b6e80ed8b68ec327162237 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 29 Jan 2024 20:36:45 -0500
Subject: [PATCH 529/707] mm: zswap: simplify zswap_invalidate()

The branching is awkward and duplicates code. The comment about
writeback is also misleading: yes, the entry might have been written
back. Or it might have never been stored in zswap to begin with due to
a rejection - zswap_invalidate() is called on all exiting swap entries.

Link: https://lkml.kernel.org/r/20240130014208.565554-10-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Acked-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/zswap.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index cde309c539b337..082d076a758d19 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1739,15 +1739,10 @@ void zswap_invalidate(int type, pgoff_t offset)
 	struct zswap_tree *tree = swap_zswap_tree(swp_entry(type, offset));
 	struct zswap_entry *entry;
 
-	/* find */
 	spin_lock(&tree->lock);
 	entry = zswap_rb_search(&tree->rbroot, offset);
-	if (!entry) {
-		/* entry was written back */
-		spin_unlock(&tree->lock);
-		return;
-	}
-	zswap_invalidate_entry(tree, entry);
+	if (entry)
+		zswap_invalidate_entry(tree, entry);
 	spin_unlock(&tree->lock);
 }
 

From 0858aef97827e36254120e075ab42919cf761e2a Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 29 Jan 2024 20:36:46 -0500
Subject: [PATCH 530/707] mm: zswap: function ordering: pool alloc & free

The function ordering in zswap.c is a little chaotic, which requires
jumping in unexpected directions when following related code. This is
a series of patches that brings the file into the following order:

- pool functions
- lru functions
- rbtree functions
- zswap entry functions
- compression/backend functions
- writeback & shrinking functions
- store, load, invalidate, swapon, swapoff
- debugfs
- init

But it has to be split up such the moving still produces halfway
readable diffs.

In this patch, move pool allocation and freeing functions.

Link: https://lkml.kernel.org/r/20240130014208.565554-11-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Cc: Chengming Zhou <zhouchengming@bytedance.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/zswap.c | 297 +++++++++++++++++++++++++++--------------------------
 1 file changed, 152 insertions(+), 145 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index 082d076a758d19..805d9a35f63387 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -320,6 +320,158 @@ static void zswap_update_total_size(void)
 	zswap_pool_total_size = total;
 }
 
+/*********************************
+* pool functions
+**********************************/
+
+static void zswap_alloc_shrinker(struct zswap_pool *pool);
+static void shrink_worker(struct work_struct *w);
+
+static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
+{
+	int i;
+	struct zswap_pool *pool;
+	char name[38]; /* 'zswap' + 32 char (max) num + \0 */
+	gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
+	int ret;
+
+	if (!zswap_has_pool) {
+		/* if either are unset, pool initialization failed, and we
+		 * need both params to be set correctly before trying to
+		 * create a pool.
+		 */
+		if (!strcmp(type, ZSWAP_PARAM_UNSET))
+			return NULL;
+		if (!strcmp(compressor, ZSWAP_PARAM_UNSET))
+			return NULL;
+	}
+
+	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+	if (!pool)
+		return NULL;
+
+	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) {
+		/* unique name for each pool specifically required by zsmalloc */
+		snprintf(name, 38, "zswap%x",
+			 atomic_inc_return(&zswap_pools_count));
+
+		pool->zpools[i] = zpool_create_pool(type, name, gfp);
+		if (!pool->zpools[i]) {
+			pr_err("%s zpool not available\n", type);
+			goto error;
+		}
+	}
+	pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0]));
+
+	strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
+
+	pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx);
+	if (!pool->acomp_ctx) {
+		pr_err("percpu alloc failed\n");
+		goto error;
+	}
+
+	ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
+				       &pool->node);
+	if (ret)
+		goto error;
+
+	zswap_alloc_shrinker(pool);
+	if (!pool->shrinker)
+		goto error;
+
+	pr_debug("using %s compressor\n", pool->tfm_name);
+
+	/* being the current pool takes 1 ref; this func expects the
+	 * caller to always add the new pool as the current pool
+	 */
+	kref_init(&pool->kref);
+	INIT_LIST_HEAD(&pool->list);
+	if (list_lru_init_memcg(&pool->list_lru, pool->shrinker))
+		goto lru_fail;
+	shrinker_register(pool->shrinker);
+	INIT_WORK(&pool->shrink_work, shrink_worker);
+	atomic_set(&pool->nr_stored, 0);
+
+	zswap_pool_debug("created", pool);
+
+	return pool;
+
+lru_fail:
+	list_lru_destroy(&pool->list_lru);
+	shrinker_free(pool->shrinker);
+error:
+	if (pool->acomp_ctx)
+		free_percpu(pool->acomp_ctx);
+	while (i--)
+		zpool_destroy_pool(pool->zpools[i]);
+	kfree(pool);
+	return NULL;
+}
+
+static struct zswap_pool *__zswap_pool_create_fallback(void)
+{
+	bool has_comp, has_zpool;
+
+	has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
+	if (!has_comp && strcmp(zswap_compressor,
+				CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) {
+		pr_err("compressor %s not available, using default %s\n",
+		       zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT);
+		param_free_charp(&zswap_compressor);
+		zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
+		has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
+	}
+	if (!has_comp) {
+		pr_err("default compressor %s not available\n",
+		       zswap_compressor);
+		param_free_charp(&zswap_compressor);
+		zswap_compressor = ZSWAP_PARAM_UNSET;
+	}
+
+	has_zpool = zpool_has_pool(zswap_zpool_type);
+	if (!has_zpool && strcmp(zswap_zpool_type,
+				 CONFIG_ZSWAP_ZPOOL_DEFAULT)) {
+		pr_err("zpool %s not available, using default %s\n",
+		       zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT);
+		param_free_charp(&zswap_zpool_type);
+		zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
+		has_zpool = zpool_has_pool(zswap_zpool_type);
+	}
+	if (!has_zpool) {
+		pr_err("default zpool %s not available\n",
+		       zswap_zpool_type);
+		param_free_charp(&zswap_zpool_type);
+		zswap_zpool_type = ZSWAP_PARAM_UNSET;
+	}
+
+	if (!has_comp || !has_zpool)
+		return NULL;
+
+	return zswap_pool_create(zswap_zpool_type, zswap_compressor);
+}
+
+static void zswap_pool_destroy(struct zswap_pool *pool)
+{
+	int i;
+
+	zswap_pool_debug("destroying", pool);
+
+	shrinker_free(pool->shrinker);
+	cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
+	free_percpu(pool->acomp_ctx);
+	list_lru_destroy(&pool->list_lru);
+
+	spin_lock(&zswap_pools_lock);
+	mem_cgroup_iter_break(NULL, pool->next_shrink);
+	pool->next_shrink = NULL;
+	spin_unlock(&zswap_pools_lock);
+
+	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
+		zpool_destroy_pool(pool->zpools[i]);
+	kfree(pool);
+}
+
 /* should be called under RCU */
 #ifdef CONFIG_MEMCG
 static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry)
@@ -970,151 +1122,6 @@ static void shrink_worker(struct work_struct *w)
 	zswap_pool_put(pool);
 }
 
-static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
-{
-	int i;
-	struct zswap_pool *pool;
-	char name[38]; /* 'zswap' + 32 char (max) num + \0 */
-	gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
-	int ret;
-
-	if (!zswap_has_pool) {
-		/* if either are unset, pool initialization failed, and we
-		 * need both params to be set correctly before trying to
-		 * create a pool.
-		 */
-		if (!strcmp(type, ZSWAP_PARAM_UNSET))
-			return NULL;
-		if (!strcmp(compressor, ZSWAP_PARAM_UNSET))
-			return NULL;
-	}
-
-	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
-	if (!pool)
-		return NULL;
-
-	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) {
-		/* unique name for each pool specifically required by zsmalloc */
-		snprintf(name, 38, "zswap%x",
-			 atomic_inc_return(&zswap_pools_count));
-
-		pool->zpools[i] = zpool_create_pool(type, name, gfp);
-		if (!pool->zpools[i]) {
-			pr_err("%s zpool not available\n", type);
-			goto error;
-		}
-	}
-	pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0]));
-
-	strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
-
-	pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx);
-	if (!pool->acomp_ctx) {
-		pr_err("percpu alloc failed\n");
-		goto error;
-	}
-
-	ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
-				       &pool->node);
-	if (ret)
-		goto error;
-
-	zswap_alloc_shrinker(pool);
-	if (!pool->shrinker)
-		goto error;
-
-	pr_debug("using %s compressor\n", pool->tfm_name);
-
-	/* being the current pool takes 1 ref; this func expects the
-	 * caller to always add the new pool as the current pool
-	 */
-	kref_init(&pool->kref);
-	INIT_LIST_HEAD(&pool->list);
-	if (list_lru_init_memcg(&pool->list_lru, pool->shrinker))
-		goto lru_fail;
-	shrinker_register(pool->shrinker);
-	INIT_WORK(&pool->shrink_work, shrink_worker);
-	atomic_set(&pool->nr_stored, 0);
-
-	zswap_pool_debug("created", pool);
-
-	return pool;
-
-lru_fail:
-	list_lru_destroy(&pool->list_lru);
-	shrinker_free(pool->shrinker);
-error:
-	if (pool->acomp_ctx)
-		free_percpu(pool->acomp_ctx);
-	while (i--)
-		zpool_destroy_pool(pool->zpools[i]);
-	kfree(pool);
-	return NULL;
-}
-
-static struct zswap_pool *__zswap_pool_create_fallback(void)
-{
-	bool has_comp, has_zpool;
-
-	has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
-	if (!has_comp && strcmp(zswap_compressor,
-				CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) {
-		pr_err("compressor %s not available, using default %s\n",
-		       zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT);
-		param_free_charp(&zswap_compressor);
-		zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
-		has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
-	}
-	if (!has_comp) {
-		pr_err("default compressor %s not available\n",
-		       zswap_compressor);
-		param_free_charp(&zswap_compressor);
-		zswap_compressor = ZSWAP_PARAM_UNSET;
-	}
-
-	has_zpool = zpool_has_pool(zswap_zpool_type);
-	if (!has_zpool && strcmp(zswap_zpool_type,
-				 CONFIG_ZSWAP_ZPOOL_DEFAULT)) {
-		pr_err("zpool %s not available, using default %s\n",
-		       zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT);
-		param_free_charp(&zswap_zpool_type);
-		zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
-		has_zpool = zpool_has_pool(zswap_zpool_type);
-	}
-	if (!has_zpool) {
-		pr_err("default zpool %s not available\n",
-		       zswap_zpool_type);
-		param_free_charp(&zswap_zpool_type);
-		zswap_zpool_type = ZSWAP_PARAM_UNSET;
-	}
-
-	if (!has_comp || !has_zpool)
-		return NULL;
-
-	return zswap_pool_create(zswap_zpool_type, zswap_compressor);
-}
-
-static void zswap_pool_destroy(struct zswap_pool *pool)
-{
-	int i;
-
-	zswap_pool_debug("destroying", pool);
-
-	shrinker_free(pool->shrinker);
-	cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
-	free_percpu(pool->acomp_ctx);
-	list_lru_destroy(&pool->list_lru);
-
-	spin_lock(&zswap_pools_lock);
-	mem_cgroup_iter_break(NULL, pool->next_shrink);
-	pool->next_shrink = NULL;
-	spin_unlock(&zswap_pools_lock);
-
-	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
-		zpool_destroy_pool(pool->zpools[i]);
-	kfree(pool);
-}
-
 static int __must_check zswap_pool_get(struct zswap_pool *pool)
 {
 	if (!pool)

From 8f600872a7ff9350681364463805cafed07020a5 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 29 Jan 2024 20:36:47 -0500
Subject: [PATCH 531/707] mm: zswap: function ordering: pool refcounting

Move pool refcounting functions into the pool section. First the
destroy functions, then the get and put which uses them.

__zswap_pool_empty() has an upward reference to the global
zswap_pools, to sanity check it's not the currently active pool that's
being freed. That gets the forward decl for zswap_pool_current().

This puts the get and put function above all callers, so kill the
forward decls as well.

Link: https://lkml.kernel.org/r/20240130014208.565554-12-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Chengming Zhou <zhouchengming@bytedance.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/zswap.c | 94 +++++++++++++++++++++++++++---------------------------
 1 file changed, 47 insertions(+), 47 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index 805d9a35f63387..33775f2224b711 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -278,8 +278,6 @@ static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp)
 
 static int zswap_writeback_entry(struct zswap_entry *entry,
 				 swp_entry_t swpentry);
-static int zswap_pool_get(struct zswap_pool *pool);
-static void zswap_pool_put(struct zswap_pool *pool);
 
 static bool zswap_is_full(void)
 {
@@ -472,6 +470,53 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
 	kfree(pool);
 }
 
+static void __zswap_pool_release(struct work_struct *work)
+{
+	struct zswap_pool *pool = container_of(work, typeof(*pool),
+						release_work);
+
+	synchronize_rcu();
+
+	/* nobody should have been able to get a kref... */
+	WARN_ON(kref_get_unless_zero(&pool->kref));
+
+	/* pool is now off zswap_pools list and has no references. */
+	zswap_pool_destroy(pool);
+}
+
+static struct zswap_pool *zswap_pool_current(void);
+
+static void __zswap_pool_empty(struct kref *kref)
+{
+	struct zswap_pool *pool;
+
+	pool = container_of(kref, typeof(*pool), kref);
+
+	spin_lock(&zswap_pools_lock);
+
+	WARN_ON(pool == zswap_pool_current());
+
+	list_del_rcu(&pool->list);
+
+	INIT_WORK(&pool->release_work, __zswap_pool_release);
+	schedule_work(&pool->release_work);
+
+	spin_unlock(&zswap_pools_lock);
+}
+
+static int __must_check zswap_pool_get(struct zswap_pool *pool)
+{
+	if (!pool)
+		return 0;
+
+	return kref_get_unless_zero(&pool->kref);
+}
+
+static void zswap_pool_put(struct zswap_pool *pool)
+{
+	kref_put(&pool->kref, __zswap_pool_empty);
+}
+
 /* should be called under RCU */
 #ifdef CONFIG_MEMCG
 static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry)
@@ -1122,51 +1167,6 @@ static void shrink_worker(struct work_struct *w)
 	zswap_pool_put(pool);
 }
 
-static int __must_check zswap_pool_get(struct zswap_pool *pool)
-{
-	if (!pool)
-		return 0;
-
-	return kref_get_unless_zero(&pool->kref);
-}
-
-static void __zswap_pool_release(struct work_struct *work)
-{
-	struct zswap_pool *pool = container_of(work, typeof(*pool),
-						release_work);
-
-	synchronize_rcu();
-
-	/* nobody should have been able to get a kref... */
-	WARN_ON(kref_get_unless_zero(&pool->kref));
-
-	/* pool is now off zswap_pools list and has no references. */
-	zswap_pool_destroy(pool);
-}
-
-static void __zswap_pool_empty(struct kref *kref)
-{
-	struct zswap_pool *pool;
-
-	pool = container_of(kref, typeof(*pool), kref);
-
-	spin_lock(&zswap_pools_lock);
-
-	WARN_ON(pool == zswap_pool_current());
-
-	list_del_rcu(&pool->list);
-
-	INIT_WORK(&pool->release_work, __zswap_pool_release);
-	schedule_work(&pool->release_work);
-
-	spin_unlock(&zswap_pools_lock);
-}
-
-static void zswap_pool_put(struct zswap_pool *pool)
-{
-	kref_put(&pool->kref, __zswap_pool_empty);
-}
-
 /*********************************
 * param callbacks
 **********************************/

From 06b2ee23c95c6655586eba48fb25750e0ebd7878 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 29 Jan 2024 20:36:48 -0500
Subject: [PATCH 532/707] mm: zswap: function ordering: zswap_pools

Move the operations against the global zswap_pools list (current pool,
last, find) to the pool section.

Link: https://lkml.kernel.org/r/20240130014208.565554-13-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Cc: Chengming Zhou <zhouchengming@bytedance.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/zswap.c | 150 ++++++++++++++++++++++++++---------------------------
 1 file changed, 73 insertions(+), 77 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index 33775f2224b711..168afd6767b3be 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -517,6 +517,79 @@ static void zswap_pool_put(struct zswap_pool *pool)
 	kref_put(&pool->kref, __zswap_pool_empty);
 }
 
+static struct zswap_pool *__zswap_pool_current(void)
+{
+	struct zswap_pool *pool;
+
+	pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
+	WARN_ONCE(!pool && zswap_has_pool,
+		  "%s: no page storage pool!\n", __func__);
+
+	return pool;
+}
+
+static struct zswap_pool *zswap_pool_current(void)
+{
+	assert_spin_locked(&zswap_pools_lock);
+
+	return __zswap_pool_current();
+}
+
+static struct zswap_pool *zswap_pool_current_get(void)
+{
+	struct zswap_pool *pool;
+
+	rcu_read_lock();
+
+	pool = __zswap_pool_current();
+	if (!zswap_pool_get(pool))
+		pool = NULL;
+
+	rcu_read_unlock();
+
+	return pool;
+}
+
+static struct zswap_pool *zswap_pool_last_get(void)
+{
+	struct zswap_pool *pool, *last = NULL;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(pool, &zswap_pools, list)
+		last = pool;
+	WARN_ONCE(!last && zswap_has_pool,
+		  "%s: no page storage pool!\n", __func__);
+	if (!zswap_pool_get(last))
+		last = NULL;
+
+	rcu_read_unlock();
+
+	return last;
+}
+
+/* type and compressor must be null-terminated */
+static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
+{
+	struct zswap_pool *pool;
+
+	assert_spin_locked(&zswap_pools_lock);
+
+	list_for_each_entry_rcu(pool, &zswap_pools, list) {
+		if (strcmp(pool->tfm_name, compressor))
+			continue;
+		/* all zpools share the same type */
+		if (strcmp(zpool_get_type(pool->zpools[0]), type))
+			continue;
+		/* if we can't get it, it's about to be destroyed */
+		if (!zswap_pool_get(pool))
+			continue;
+		return pool;
+	}
+
+	return NULL;
+}
+
 /* should be called under RCU */
 #ifdef CONFIG_MEMCG
 static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry)
@@ -938,83 +1011,6 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
 	return 0;
 }
 
-/*********************************
-* pool functions
-**********************************/
-
-static struct zswap_pool *__zswap_pool_current(void)
-{
-	struct zswap_pool *pool;
-
-	pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
-	WARN_ONCE(!pool && zswap_has_pool,
-		  "%s: no page storage pool!\n", __func__);
-
-	return pool;
-}
-
-static struct zswap_pool *zswap_pool_current(void)
-{
-	assert_spin_locked(&zswap_pools_lock);
-
-	return __zswap_pool_current();
-}
-
-static struct zswap_pool *zswap_pool_current_get(void)
-{
-	struct zswap_pool *pool;
-
-	rcu_read_lock();
-
-	pool = __zswap_pool_current();
-	if (!zswap_pool_get(pool))
-		pool = NULL;
-
-	rcu_read_unlock();
-
-	return pool;
-}
-
-static struct zswap_pool *zswap_pool_last_get(void)
-{
-	struct zswap_pool *pool, *last = NULL;
-
-	rcu_read_lock();
-
-	list_for_each_entry_rcu(pool, &zswap_pools, list)
-		last = pool;
-	WARN_ONCE(!last && zswap_has_pool,
-		  "%s: no page storage pool!\n", __func__);
-	if (!zswap_pool_get(last))
-		last = NULL;
-
-	rcu_read_unlock();
-
-	return last;
-}
-
-/* type and compressor must be null-terminated */
-static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
-{
-	struct zswap_pool *pool;
-
-	assert_spin_locked(&zswap_pools_lock);
-
-	list_for_each_entry_rcu(pool, &zswap_pools, list) {
-		if (strcmp(pool->tfm_name, compressor))
-			continue;
-		/* all zpools share the same type */
-		if (strcmp(zpool_get_type(pool->zpools[0]), type))
-			continue;
-		/* if we can't get it, it's about to be destroyed */
-		if (!zswap_pool_get(pool))
-			continue;
-		return pool;
-	}
-
-	return NULL;
-}
-
 static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
 				       spinlock_t *lock, void *arg)
 {

From 034672cfecbd4ea82e8f0ea57f03d9023a100fb4 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 29 Jan 2024 20:36:49 -0500
Subject: [PATCH 533/707] mm: zswap: function ordering: pool params

Patch series "mm: zswap: cleanups".

Cleanups and maintenance items that accumulated while reviewing zswap
patches.


This patch (of 20):

The parameters primarily control pool attributes. Move those
operations up to the pool section.

Link: https://lkml.kernel.org/r/20240130014208.565554-1-hannes@cmpxchg.org
Link: https://lkml.kernel.org/r/20240130014208.565554-14-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Cc: Chengming Zhou <zhouchengming@bytedance.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/zswap.c | 312 ++++++++++++++++++++++++++---------------------------
 1 file changed, 156 insertions(+), 156 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index 168afd6767b3be..e650fc58711662 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -590,6 +590,162 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
 	return NULL;
 }
 
+/*********************************
+* param callbacks
+**********************************/
+
+static bool zswap_pool_changed(const char *s, const struct kernel_param *kp)
+{
+	/* no change required */
+	if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool)
+		return false;
+	return true;
+}
+
+/* val must be a null-terminated string */
+static int __zswap_param_set(const char *val, const struct kernel_param *kp,
+			     char *type, char *compressor)
+{
+	struct zswap_pool *pool, *put_pool = NULL;
+	char *s = strstrip((char *)val);
+	int ret = 0;
+	bool new_pool = false;
+
+	mutex_lock(&zswap_init_lock);
+	switch (zswap_init_state) {
+	case ZSWAP_UNINIT:
+		/* if this is load-time (pre-init) param setting,
+		 * don't create a pool; that's done during init.
+		 */
+		ret = param_set_charp(s, kp);
+		break;
+	case ZSWAP_INIT_SUCCEED:
+		new_pool = zswap_pool_changed(s, kp);
+		break;
+	case ZSWAP_INIT_FAILED:
+		pr_err("can't set param, initialization failed\n");
+		ret = -ENODEV;
+	}
+	mutex_unlock(&zswap_init_lock);
+
+	/* no need to create a new pool, return directly */
+	if (!new_pool)
+		return ret;
+
+	if (!type) {
+		if (!zpool_has_pool(s)) {
+			pr_err("zpool %s not available\n", s);
+			return -ENOENT;
+		}
+		type = s;
+	} else if (!compressor) {
+		if (!crypto_has_acomp(s, 0, 0)) {
+			pr_err("compressor %s not available\n", s);
+			return -ENOENT;
+		}
+		compressor = s;
+	} else {
+		WARN_ON(1);
+		return -EINVAL;
+	}
+
+	spin_lock(&zswap_pools_lock);
+
+	pool = zswap_pool_find_get(type, compressor);
+	if (pool) {
+		zswap_pool_debug("using existing", pool);
+		WARN_ON(pool == zswap_pool_current());
+		list_del_rcu(&pool->list);
+	}
+
+	spin_unlock(&zswap_pools_lock);
+
+	if (!pool)
+		pool = zswap_pool_create(type, compressor);
+
+	if (pool)
+		ret = param_set_charp(s, kp);
+	else
+		ret = -EINVAL;
+
+	spin_lock(&zswap_pools_lock);
+
+	if (!ret) {
+		put_pool = zswap_pool_current();
+		list_add_rcu(&pool->list, &zswap_pools);
+		zswap_has_pool = true;
+	} else if (pool) {
+		/* add the possibly pre-existing pool to the end of the pools
+		 * list; if it's new (and empty) then it'll be removed and
+		 * destroyed by the put after we drop the lock
+		 */
+		list_add_tail_rcu(&pool->list, &zswap_pools);
+		put_pool = pool;
+	}
+
+	spin_unlock(&zswap_pools_lock);
+
+	if (!zswap_has_pool && !pool) {
+		/* if initial pool creation failed, and this pool creation also
+		 * failed, maybe both compressor and zpool params were bad.
+		 * Allow changing this param, so pool creation will succeed
+		 * when the other param is changed. We already verified this
+		 * param is ok in the zpool_has_pool() or crypto_has_acomp()
+		 * checks above.
+		 */
+		ret = param_set_charp(s, kp);
+	}
+
+	/* drop the ref from either the old current pool,
+	 * or the new pool we failed to add
+	 */
+	if (put_pool)
+		zswap_pool_put(put_pool);
+
+	return ret;
+}
+
+static int zswap_compressor_param_set(const char *val,
+				      const struct kernel_param *kp)
+{
+	return __zswap_param_set(val, kp, zswap_zpool_type, NULL);
+}
+
+static int zswap_zpool_param_set(const char *val,
+				 const struct kernel_param *kp)
+{
+	return __zswap_param_set(val, kp, NULL, zswap_compressor);
+}
+
+static int zswap_enabled_param_set(const char *val,
+				   const struct kernel_param *kp)
+{
+	int ret = -ENODEV;
+
+	/* if this is load-time (pre-init) param setting, only set param. */
+	if (system_state != SYSTEM_RUNNING)
+		return param_set_bool(val, kp);
+
+	mutex_lock(&zswap_init_lock);
+	switch (zswap_init_state) {
+	case ZSWAP_UNINIT:
+		if (zswap_setup())
+			break;
+		fallthrough;
+	case ZSWAP_INIT_SUCCEED:
+		if (!zswap_has_pool)
+			pr_err("can't enable, no pool configured\n");
+		else
+			ret = param_set_bool(val, kp);
+		break;
+	case ZSWAP_INIT_FAILED:
+		pr_err("can't enable, initialization failed\n");
+	}
+	mutex_unlock(&zswap_init_lock);
+
+	return ret;
+}
+
 /* should be called under RCU */
 #ifdef CONFIG_MEMCG
 static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry)
@@ -1163,162 +1319,6 @@ static void shrink_worker(struct work_struct *w)
 	zswap_pool_put(pool);
 }
 
-/*********************************
-* param callbacks
-**********************************/
-
-static bool zswap_pool_changed(const char *s, const struct kernel_param *kp)
-{
-	/* no change required */
-	if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool)
-		return false;
-	return true;
-}
-
-/* val must be a null-terminated string */
-static int __zswap_param_set(const char *val, const struct kernel_param *kp,
-			     char *type, char *compressor)
-{
-	struct zswap_pool *pool, *put_pool = NULL;
-	char *s = strstrip((char *)val);
-	int ret = 0;
-	bool new_pool = false;
-
-	mutex_lock(&zswap_init_lock);
-	switch (zswap_init_state) {
-	case ZSWAP_UNINIT:
-		/* if this is load-time (pre-init) param setting,
-		 * don't create a pool; that's done during init.
-		 */
-		ret = param_set_charp(s, kp);
-		break;
-	case ZSWAP_INIT_SUCCEED:
-		new_pool = zswap_pool_changed(s, kp);
-		break;
-	case ZSWAP_INIT_FAILED:
-		pr_err("can't set param, initialization failed\n");
-		ret = -ENODEV;
-	}
-	mutex_unlock(&zswap_init_lock);
-
-	/* no need to create a new pool, return directly */
-	if (!new_pool)
-		return ret;
-
-	if (!type) {
-		if (!zpool_has_pool(s)) {
-			pr_err("zpool %s not available\n", s);
-			return -ENOENT;
-		}
-		type = s;
-	} else if (!compressor) {
-		if (!crypto_has_acomp(s, 0, 0)) {
-			pr_err("compressor %s not available\n", s);
-			return -ENOENT;
-		}
-		compressor = s;
-	} else {
-		WARN_ON(1);
-		return -EINVAL;
-	}
-
-	spin_lock(&zswap_pools_lock);
-
-	pool = zswap_pool_find_get(type, compressor);
-	if (pool) {
-		zswap_pool_debug("using existing", pool);
-		WARN_ON(pool == zswap_pool_current());
-		list_del_rcu(&pool->list);
-	}
-
-	spin_unlock(&zswap_pools_lock);
-
-	if (!pool)
-		pool = zswap_pool_create(type, compressor);
-
-	if (pool)
-		ret = param_set_charp(s, kp);
-	else
-		ret = -EINVAL;
-
-	spin_lock(&zswap_pools_lock);
-
-	if (!ret) {
-		put_pool = zswap_pool_current();
-		list_add_rcu(&pool->list, &zswap_pools);
-		zswap_has_pool = true;
-	} else if (pool) {
-		/* add the possibly pre-existing pool to the end of the pools
-		 * list; if it's new (and empty) then it'll be removed and
-		 * destroyed by the put after we drop the lock
-		 */
-		list_add_tail_rcu(&pool->list, &zswap_pools);
-		put_pool = pool;
-	}
-
-	spin_unlock(&zswap_pools_lock);
-
-	if (!zswap_has_pool && !pool) {
-		/* if initial pool creation failed, and this pool creation also
-		 * failed, maybe both compressor and zpool params were bad.
-		 * Allow changing this param, so pool creation will succeed
-		 * when the other param is changed. We already verified this
-		 * param is ok in the zpool_has_pool() or crypto_has_acomp()
-		 * checks above.
-		 */
-		ret = param_set_charp(s, kp);
-	}
-
-	/* drop the ref from either the old current pool,
-	 * or the new pool we failed to add
-	 */
-	if (put_pool)
-		zswap_pool_put(put_pool);
-
-	return ret;
-}
-
-static int zswap_compressor_param_set(const char *val,
-				      const struct kernel_param *kp)
-{
-	return __zswap_param_set(val, kp, zswap_zpool_type, NULL);
-}
-
-static int zswap_zpool_param_set(const char *val,
-				 const struct kernel_param *kp)
-{
-	return __zswap_param_set(val, kp, NULL, zswap_compressor);
-}
-
-static int zswap_enabled_param_set(const char *val,
-				   const struct kernel_param *kp)
-{
-	int ret = -ENODEV;
-
-	/* if this is load-time (pre-init) param setting, only set param. */
-	if (system_state != SYSTEM_RUNNING)
-		return param_set_bool(val, kp);
-
-	mutex_lock(&zswap_init_lock);
-	switch (zswap_init_state) {
-	case ZSWAP_UNINIT:
-		if (zswap_setup())
-			break;
-		fallthrough;
-	case ZSWAP_INIT_SUCCEED:
-		if (!zswap_has_pool)
-			pr_err("can't enable, no pool configured\n");
-		else
-			ret = param_set_bool(val, kp);
-		break;
-	case ZSWAP_INIT_FAILED:
-		pr_err("can't enable, initialization failed\n");
-	}
-	mutex_unlock(&zswap_init_lock);
-
-	return ret;
-}
-
 static bool zswap_compress(struct folio *folio, struct zswap_entry *entry)
 {
 	struct crypto_acomp_ctx *acomp_ctx;

From 3bcc879d25478db7730e98216c3ab0bcac10b3e6 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 29 Jan 2024 20:36:50 -0500
Subject: [PATCH 534/707] mm: zswap: function ordering: public lru api

The zswap entry section sits awkwardly in the middle of LRU-related
functions. Group the external LRU API functions first.

Link: https://lkml.kernel.org/r/20240130014208.565554-15-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Cc: Chengming Zhou <zhouchengming@bytedance.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/zswap.c | 37 +++++++++++++++++++------------------
 1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index e650fc58711662..511bfafc1456d0 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -746,6 +746,10 @@ static int zswap_enabled_param_set(const char *val,
 	return ret;
 }
 
+/*********************************
+* lru functions
+**********************************/
+
 /* should be called under RCU */
 #ifdef CONFIG_MEMCG
 static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry)
@@ -764,6 +768,21 @@ static inline int entry_to_nid(struct zswap_entry *entry)
 	return page_to_nid(virt_to_page(entry));
 }
 
+void zswap_lruvec_state_init(struct lruvec *lruvec)
+{
+	atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0);
+}
+
+void zswap_folio_swapin(struct folio *folio)
+{
+	struct lruvec *lruvec;
+
+	if (folio) {
+		lruvec = folio_lruvec(folio);
+		atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected);
+	}
+}
+
 void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg)
 {
 	struct zswap_pool *pool;
@@ -798,24 +817,6 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
 	kmem_cache_free(zswap_entry_cache, entry);
 }
 
-/*********************************
-* zswap lruvec functions
-**********************************/
-void zswap_lruvec_state_init(struct lruvec *lruvec)
-{
-	atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0);
-}
-
-void zswap_folio_swapin(struct folio *folio)
-{
-	struct lruvec *lruvec;
-
-	if (folio) {
-		lruvec = folio_lruvec(folio);
-		atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected);
-	}
-}
-
 /*********************************
 * lru functions
 **********************************/

From e9c7d9d6bda18627e3ac3557d587b73e6cca8518 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 29 Jan 2024 20:36:51 -0500
Subject: [PATCH 535/707] mm: zswap: function ordering: move entry sections out
 of LRU section

This completes consolidation of the LRU section.

Link: https://lkml.kernel.org/r/20240130014208.565554-16-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Cc: Chengming Zhou <zhouchengming@bytedance.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/zswap.c | 101 ++++++++++++++++++++++++++---------------------------
 1 file changed, 49 insertions(+), 52 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index 511bfafc1456d0..756d4d575efef2 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -768,58 +768,6 @@ static inline int entry_to_nid(struct zswap_entry *entry)
 	return page_to_nid(virt_to_page(entry));
 }
 
-void zswap_lruvec_state_init(struct lruvec *lruvec)
-{
-	atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0);
-}
-
-void zswap_folio_swapin(struct folio *folio)
-{
-	struct lruvec *lruvec;
-
-	if (folio) {
-		lruvec = folio_lruvec(folio);
-		atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected);
-	}
-}
-
-void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg)
-{
-	struct zswap_pool *pool;
-
-	/* lock out zswap pools list modification */
-	spin_lock(&zswap_pools_lock);
-	list_for_each_entry(pool, &zswap_pools, list) {
-		if (pool->next_shrink == memcg)
-			pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
-	}
-	spin_unlock(&zswap_pools_lock);
-}
-
-/*********************************
-* zswap entry functions
-**********************************/
-static struct kmem_cache *zswap_entry_cache;
-
-static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid)
-{
-	struct zswap_entry *entry;
-	entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid);
-	if (!entry)
-		return NULL;
-	entry->refcount = 1;
-	RB_CLEAR_NODE(&entry->rbnode);
-	return entry;
-}
-
-static void zswap_entry_cache_free(struct zswap_entry *entry)
-{
-	kmem_cache_free(zswap_entry_cache, entry);
-}
-
-/*********************************
-* lru functions
-**********************************/
 static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
 {
 	atomic_long_t *nr_zswap_protected;
@@ -872,6 +820,55 @@ static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry)
 	rcu_read_unlock();
 }
 
+void zswap_lruvec_state_init(struct lruvec *lruvec)
+{
+	atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0);
+}
+
+void zswap_folio_swapin(struct folio *folio)
+{
+	struct lruvec *lruvec;
+
+	if (folio) {
+		lruvec = folio_lruvec(folio);
+		atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected);
+	}
+}
+
+void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg)
+{
+	struct zswap_pool *pool;
+
+	/* lock out zswap pools list modification */
+	spin_lock(&zswap_pools_lock);
+	list_for_each_entry(pool, &zswap_pools, list) {
+		if (pool->next_shrink == memcg)
+			pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
+	}
+	spin_unlock(&zswap_pools_lock);
+}
+
+/*********************************
+* zswap entry functions
+**********************************/
+static struct kmem_cache *zswap_entry_cache;
+
+static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid)
+{
+	struct zswap_entry *entry;
+	entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid);
+	if (!entry)
+		return NULL;
+	entry->refcount = 1;
+	RB_CLEAR_NODE(&entry->rbnode);
+	return entry;
+}
+
+static void zswap_entry_cache_free(struct zswap_entry *entry)
+{
+	kmem_cache_free(zswap_entry_cache, entry);
+}
+
 /*********************************
 * rbtree functions
 **********************************/

From f74d04c807e343d4e82450922067c23a4e383426 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 29 Jan 2024 20:36:52 -0500
Subject: [PATCH 536/707] mm: zswap: function ordering: move entry section out
 of tree section

The higher-level entry operations modify the tree, so move the entry
API after the tree section.

Link: https://lkml.kernel.org/r/20240130014208.565554-17-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Chengming Zhou <zhouchengming@bytedance.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/zswap.c | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index 756d4d575efef2..80adc2f7d1a2b8 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -848,27 +848,6 @@ void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg)
 	spin_unlock(&zswap_pools_lock);
 }
 
-/*********************************
-* zswap entry functions
-**********************************/
-static struct kmem_cache *zswap_entry_cache;
-
-static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid)
-{
-	struct zswap_entry *entry;
-	entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid);
-	if (!entry)
-		return NULL;
-	entry->refcount = 1;
-	RB_CLEAR_NODE(&entry->rbnode);
-	return entry;
-}
-
-static void zswap_entry_cache_free(struct zswap_entry *entry)
-{
-	kmem_cache_free(zswap_entry_cache, entry);
-}
-
 /*********************************
 * rbtree functions
 **********************************/
@@ -930,6 +909,27 @@ static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
 	return false;
 }
 
+/*********************************
+* zswap entry functions
+**********************************/
+static struct kmem_cache *zswap_entry_cache;
+
+static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid)
+{
+	struct zswap_entry *entry;
+	entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid);
+	if (!entry)
+		return NULL;
+	entry->refcount = 1;
+	RB_CLEAR_NODE(&entry->rbnode);
+	return entry;
+}
+
+static void zswap_entry_cache_free(struct zswap_entry *entry)
+{
+	kmem_cache_free(zswap_entry_cache, entry);
+}
+
 static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
 {
 	int i = 0;

From 326b395ae838f4e793336a752053e73069ed4fdc Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 29 Jan 2024 20:36:53 -0500
Subject: [PATCH 537/707] mm: zswap: function ordering: compress & decompress
 functions

Writeback needs to decompress. Move the (de)compression API above what
will be the consolidated shrinking/writeback code.

Link: https://lkml.kernel.org/r/20240130014208.565554-18-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Chengming Zhou <zhouchengming@bytedance.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/zswap.c | 207 +++++++++++++++++++++++++++--------------------------
 1 file changed, 105 insertions(+), 102 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index 80adc2f7d1a2b8..17356b2e35c2cd 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -992,6 +992,111 @@ static void zswap_invalidate_entry(struct zswap_tree *tree,
 		zswap_entry_put(entry);
 }
 
+/*********************************
+* compressed storage functions
+**********************************/
+static bool zswap_compress(struct folio *folio, struct zswap_entry *entry)
+{
+	struct crypto_acomp_ctx *acomp_ctx;
+	struct scatterlist input, output;
+	unsigned int dlen = PAGE_SIZE;
+	unsigned long handle;
+	struct zpool *zpool;
+	char *buf;
+	gfp_t gfp;
+	int ret;
+	u8 *dst;
+
+	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
+
+	mutex_lock(&acomp_ctx->mutex);
+
+	dst = acomp_ctx->buffer;
+	sg_init_table(&input, 1);
+	sg_set_page(&input, &folio->page, PAGE_SIZE, 0);
+
+	/*
+	 * We need PAGE_SIZE * 2 here since there maybe over-compression case,
+	 * and hardware-accelerators may won't check the dst buffer size, so
+	 * giving the dst buffer with enough length to avoid buffer overflow.
+	 */
+	sg_init_one(&output, dst, PAGE_SIZE * 2);
+	acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
+
+	/*
+	 * it maybe looks a little bit silly that we send an asynchronous request,
+	 * then wait for its completion synchronously. This makes the process look
+	 * synchronous in fact.
+	 * Theoretically, acomp supports users send multiple acomp requests in one
+	 * acomp instance, then get those requests done simultaneously. but in this
+	 * case, zswap actually does store and load page by page, there is no
+	 * existing method to send the second page before the first page is done
+	 * in one thread doing zwap.
+	 * but in different threads running on different cpu, we have different
+	 * acomp instance, so multiple threads can do (de)compression in parallel.
+	 */
+	ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
+	dlen = acomp_ctx->req->dlen;
+	if (ret) {
+		zswap_reject_compress_fail++;
+		goto unlock;
+	}
+
+	zpool = zswap_find_zpool(entry);
+	gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
+	if (zpool_malloc_support_movable(zpool))
+		gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
+	ret = zpool_malloc(zpool, dlen, gfp, &handle);
+	if (ret == -ENOSPC) {
+		zswap_reject_compress_poor++;
+		goto unlock;
+	}
+	if (ret) {
+		zswap_reject_alloc_fail++;
+		goto unlock;
+	}
+
+	buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
+	memcpy(buf, dst, dlen);
+	zpool_unmap_handle(zpool, handle);
+
+	entry->handle = handle;
+	entry->length = dlen;
+
+unlock:
+	mutex_unlock(&acomp_ctx->mutex);
+	return ret == 0;
+}
+
+static void zswap_decompress(struct zswap_entry *entry, struct page *page)
+{
+	struct zpool *zpool = zswap_find_zpool(entry);
+	struct scatterlist input, output;
+	struct crypto_acomp_ctx *acomp_ctx;
+	u8 *src;
+
+	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
+	mutex_lock(&acomp_ctx->mutex);
+
+	src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
+	if (!zpool_can_sleep_mapped(zpool)) {
+		memcpy(acomp_ctx->buffer, src, entry->length);
+		src = acomp_ctx->buffer;
+		zpool_unmap_handle(zpool, entry->handle);
+	}
+
+	sg_init_one(&input, src, entry->length);
+	sg_init_table(&output, 1);
+	sg_set_page(&output, page, PAGE_SIZE, 0);
+	acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE);
+	BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait));
+	BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE);
+	mutex_unlock(&acomp_ctx->mutex);
+
+	if (zpool_can_sleep_mapped(zpool))
+		zpool_unmap_handle(zpool, entry->handle);
+}
+
 /*********************************
 * shrinker functions
 **********************************/
@@ -1317,108 +1422,6 @@ static void shrink_worker(struct work_struct *w)
 	zswap_pool_put(pool);
 }
 
-static bool zswap_compress(struct folio *folio, struct zswap_entry *entry)
-{
-	struct crypto_acomp_ctx *acomp_ctx;
-	struct scatterlist input, output;
-	unsigned int dlen = PAGE_SIZE;
-	unsigned long handle;
-	struct zpool *zpool;
-	char *buf;
-	gfp_t gfp;
-	int ret;
-	u8 *dst;
-
-	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
-
-	mutex_lock(&acomp_ctx->mutex);
-
-	dst = acomp_ctx->buffer;
-	sg_init_table(&input, 1);
-	sg_set_page(&input, &folio->page, PAGE_SIZE, 0);
-
-	/*
-	 * We need PAGE_SIZE * 2 here since there maybe over-compression case,
-	 * and hardware-accelerators may won't check the dst buffer size, so
-	 * giving the dst buffer with enough length to avoid buffer overflow.
-	 */
-	sg_init_one(&output, dst, PAGE_SIZE * 2);
-	acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
-
-	/*
-	 * it maybe looks a little bit silly that we send an asynchronous request,
-	 * then wait for its completion synchronously. This makes the process look
-	 * synchronous in fact.
-	 * Theoretically, acomp supports users send multiple acomp requests in one
-	 * acomp instance, then get those requests done simultaneously. but in this
-	 * case, zswap actually does store and load page by page, there is no
-	 * existing method to send the second page before the first page is done
-	 * in one thread doing zwap.
-	 * but in different threads running on different cpu, we have different
-	 * acomp instance, so multiple threads can do (de)compression in parallel.
-	 */
-	ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
-	dlen = acomp_ctx->req->dlen;
-	if (ret) {
-		zswap_reject_compress_fail++;
-		goto unlock;
-	}
-
-	zpool = zswap_find_zpool(entry);
-	gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
-	if (zpool_malloc_support_movable(zpool))
-		gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
-	ret = zpool_malloc(zpool, dlen, gfp, &handle);
-	if (ret == -ENOSPC) {
-		zswap_reject_compress_poor++;
-		goto unlock;
-	}
-	if (ret) {
-		zswap_reject_alloc_fail++;
-		goto unlock;
-	}
-
-	buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
-	memcpy(buf, dst, dlen);
-	zpool_unmap_handle(zpool, handle);
-
-	entry->handle = handle;
-	entry->length = dlen;
-
-unlock:
-	mutex_unlock(&acomp_ctx->mutex);
-	return ret == 0;
-}
-
-static void zswap_decompress(struct zswap_entry *entry, struct page *page)
-{
-	struct zpool *zpool = zswap_find_zpool(entry);
-	struct scatterlist input, output;
-	struct crypto_acomp_ctx *acomp_ctx;
-	u8 *src;
-
-	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
-	mutex_lock(&acomp_ctx->mutex);
-
-	src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
-	if (!zpool_can_sleep_mapped(zpool)) {
-		memcpy(acomp_ctx->buffer, src, entry->length);
-		src = acomp_ctx->buffer;
-		zpool_unmap_handle(zpool, entry->handle);
-	}
-
-	sg_init_one(&input, src, entry->length);
-	sg_init_table(&output, 1);
-	sg_set_page(&output, page, PAGE_SIZE, 0);
-	acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE);
-	BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait));
-	BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE);
-	mutex_unlock(&acomp_ctx->mutex);
-
-	if (zpool_can_sleep_mapped(zpool))
-		zpool_unmap_handle(zpool, entry->handle);
-}
-
 /*********************************
 * writeback code
 **********************************/

From fe8415052d45c7eb5be6844a163b9dbcceaa408b Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 29 Jan 2024 20:36:54 -0500
Subject: [PATCH 538/707] mm: zswap: function ordering: per-cpu compression
 infra

The per-cpu compression init/exit callbacks are awkwardly in the
middle of the shrinker code. Move them up to the compression section.

Link: https://lkml.kernel.org/r/20240130014208.565554-19-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Chengming Zhou <zhouchengming@bytedance.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/zswap.c | 135 ++++++++++++++++++++++++++---------------------------
 1 file changed, 66 insertions(+), 69 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index 17356b2e35c2cd..acd7dcd1e0f2de 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -995,6 +995,72 @@ static void zswap_invalidate_entry(struct zswap_tree *tree,
 /*********************************
 * compressed storage functions
 **********************************/
+static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
+{
+	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
+	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
+	struct crypto_acomp *acomp;
+	struct acomp_req *req;
+	int ret;
+
+	mutex_init(&acomp_ctx->mutex);
+
+	acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
+	if (!acomp_ctx->buffer)
+		return -ENOMEM;
+
+	acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
+	if (IS_ERR(acomp)) {
+		pr_err("could not alloc crypto acomp %s : %ld\n",
+				pool->tfm_name, PTR_ERR(acomp));
+		ret = PTR_ERR(acomp);
+		goto acomp_fail;
+	}
+	acomp_ctx->acomp = acomp;
+
+	req = acomp_request_alloc(acomp_ctx->acomp);
+	if (!req) {
+		pr_err("could not alloc crypto acomp_request %s\n",
+		       pool->tfm_name);
+		ret = -ENOMEM;
+		goto req_fail;
+	}
+	acomp_ctx->req = req;
+
+	crypto_init_wait(&acomp_ctx->wait);
+	/*
+	 * if the backend of acomp is async zip, crypto_req_done() will wakeup
+	 * crypto_wait_req(); if the backend of acomp is scomp, the callback
+	 * won't be called, crypto_wait_req() will return without blocking.
+	 */
+	acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+				   crypto_req_done, &acomp_ctx->wait);
+
+	return 0;
+
+req_fail:
+	crypto_free_acomp(acomp_ctx->acomp);
+acomp_fail:
+	kfree(acomp_ctx->buffer);
+	return ret;
+}
+
+static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
+{
+	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
+	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
+
+	if (!IS_ERR_OR_NULL(acomp_ctx)) {
+		if (!IS_ERR_OR_NULL(acomp_ctx->req))
+			acomp_request_free(acomp_ctx->req);
+		if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
+			crypto_free_acomp(acomp_ctx->acomp);
+		kfree(acomp_ctx->buffer);
+	}
+
+	return 0;
+}
+
 static bool zswap_compress(struct folio *folio, struct zswap_entry *entry)
 {
 	struct crypto_acomp_ctx *acomp_ctx;
@@ -1201,75 +1267,6 @@ static void zswap_alloc_shrinker(struct zswap_pool *pool)
 	pool->shrinker->seeks = DEFAULT_SEEKS;
 }
 
-/*********************************
-* per-cpu code
-**********************************/
-static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
-{
-	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
-	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
-	struct crypto_acomp *acomp;
-	struct acomp_req *req;
-	int ret;
-
-	mutex_init(&acomp_ctx->mutex);
-
-	acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
-	if (!acomp_ctx->buffer)
-		return -ENOMEM;
-
-	acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
-	if (IS_ERR(acomp)) {
-		pr_err("could not alloc crypto acomp %s : %ld\n",
-				pool->tfm_name, PTR_ERR(acomp));
-		ret = PTR_ERR(acomp);
-		goto acomp_fail;
-	}
-	acomp_ctx->acomp = acomp;
-
-	req = acomp_request_alloc(acomp_ctx->acomp);
-	if (!req) {
-		pr_err("could not alloc crypto acomp_request %s\n",
-		       pool->tfm_name);
-		ret = -ENOMEM;
-		goto req_fail;
-	}
-	acomp_ctx->req = req;
-
-	crypto_init_wait(&acomp_ctx->wait);
-	/*
-	 * if the backend of acomp is async zip, crypto_req_done() will wakeup
-	 * crypto_wait_req(); if the backend of acomp is scomp, the callback
-	 * won't be called, crypto_wait_req() will return without blocking.
-	 */
-	acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
-				   crypto_req_done, &acomp_ctx->wait);
-
-	return 0;
-
-req_fail:
-	crypto_free_acomp(acomp_ctx->acomp);
-acomp_fail:
-	kfree(acomp_ctx->buffer);
-	return ret;
-}
-
-static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
-{
-	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
-	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
-
-	if (!IS_ERR_OR_NULL(acomp_ctx)) {
-		if (!IS_ERR_OR_NULL(acomp_ctx->req))
-			acomp_request_free(acomp_ctx->req);
-		if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
-			crypto_free_acomp(acomp_ctx->acomp);
-		kfree(acomp_ctx->buffer);
-	}
-
-	return 0;
-}
-
 static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
 				       spinlock_t *lock, void *arg)
 {

From 45e264a25d7c022125692602e17dbc20d663aaf3 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 29 Jan 2024 20:36:55 -0500
Subject: [PATCH 539/707] mm: zswap: function ordering: writeback

Shrinking needs writeback. Naturally, move the writeback code above
the shrinking code. Delete the forward decl.

Link: https://lkml.kernel.org/r/20240130014208.565554-20-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Chengming Zhou <zhouchengming@bytedance.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/zswap.c | 183 ++++++++++++++++++++++++++---------------------------
 1 file changed, 90 insertions(+), 93 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index acd7dcd1e0f2de..0cb3437d47eba3 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -276,9 +276,6 @@ static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp)
 	pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,		\
 		 zpool_get_type((p)->zpools[0]))
 
-static int zswap_writeback_entry(struct zswap_entry *entry,
-				 swp_entry_t swpentry);
-
 static bool zswap_is_full(void)
 {
 	return totalram_pages() * zswap_max_pool_percent / 100 <
@@ -1163,6 +1160,96 @@ static void zswap_decompress(struct zswap_entry *entry, struct page *page)
 		zpool_unmap_handle(zpool, entry->handle);
 }
 
+/*********************************
+* writeback code
+**********************************/
+/*
+ * Attempts to free an entry by adding a folio to the swap cache,
+ * decompressing the entry data into the folio, and issuing a
+ * bio write to write the folio back to the swap device.
+ *
+ * This can be thought of as a "resumed writeback" of the folio
+ * to the swap device.  We are basically resuming the same swap
+ * writeback path that was intercepted with the zswap_store()
+ * in the first place.  After the folio has been decompressed into
+ * the swap cache, the compressed version stored by zswap can be
+ * freed.
+ */
+static int zswap_writeback_entry(struct zswap_entry *entry,
+				 swp_entry_t swpentry)
+{
+	struct zswap_tree *tree;
+	struct folio *folio;
+	struct mempolicy *mpol;
+	bool folio_was_allocated;
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_NONE,
+	};
+
+	/* try to allocate swap cache folio */
+	mpol = get_task_policy(current);
+	folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol,
+				NO_INTERLEAVE_INDEX, &folio_was_allocated, true);
+	if (!folio)
+		return -ENOMEM;
+
+	/*
+	 * Found an existing folio, we raced with swapin or concurrent
+	 * shrinker. We generally writeback cold folios from zswap, and
+	 * swapin means the folio just became hot, so skip this folio.
+	 * For unlikely concurrent shrinker case, it will be unlinked
+	 * and freed when invalidated by the concurrent shrinker anyway.
+	 */
+	if (!folio_was_allocated) {
+		folio_put(folio);
+		return -EEXIST;
+	}
+
+	/*
+	 * folio is locked, and the swapcache is now secured against
+	 * concurrent swapping to and from the slot. Verify that the
+	 * swap entry hasn't been invalidated and recycled behind our
+	 * backs (our zswap_entry reference doesn't prevent that), to
+	 * avoid overwriting a new swap folio with old compressed data.
+	 */
+	tree = swap_zswap_tree(swpentry);
+	spin_lock(&tree->lock);
+	if (zswap_rb_search(&tree->rbroot, swp_offset(swpentry)) != entry) {
+		spin_unlock(&tree->lock);
+		delete_from_swap_cache(folio);
+		folio_unlock(folio);
+		folio_put(folio);
+		return -ENOMEM;
+	}
+
+	/* Safe to deref entry after the entry is verified above. */
+	zswap_entry_get(entry);
+	spin_unlock(&tree->lock);
+
+	zswap_decompress(entry, &folio->page);
+
+	count_vm_event(ZSWPWB);
+	if (entry->objcg)
+		count_objcg_event(entry->objcg, ZSWPWB);
+
+	spin_lock(&tree->lock);
+	zswap_invalidate_entry(tree, entry);
+	zswap_entry_put(entry);
+	spin_unlock(&tree->lock);
+
+	/* folio is up to date */
+	folio_mark_uptodate(folio);
+
+	/* move it to the tail of the inactive list after end_writeback */
+	folio_set_reclaim(folio);
+
+	/* start writeback */
+	__swap_writepage(folio, &wbc);
+	folio_put(folio);
+
+	return 0;
+}
+
 /*********************************
 * shrinker functions
 **********************************/
@@ -1419,96 +1506,6 @@ static void shrink_worker(struct work_struct *w)
 	zswap_pool_put(pool);
 }
 
-/*********************************
-* writeback code
-**********************************/
-/*
- * Attempts to free an entry by adding a folio to the swap cache,
- * decompressing the entry data into the folio, and issuing a
- * bio write to write the folio back to the swap device.
- *
- * This can be thought of as a "resumed writeback" of the folio
- * to the swap device.  We are basically resuming the same swap
- * writeback path that was intercepted with the zswap_store()
- * in the first place.  After the folio has been decompressed into
- * the swap cache, the compressed version stored by zswap can be
- * freed.
- */
-static int zswap_writeback_entry(struct zswap_entry *entry,
-				 swp_entry_t swpentry)
-{
-	struct zswap_tree *tree;
-	struct folio *folio;
-	struct mempolicy *mpol;
-	bool folio_was_allocated;
-	struct writeback_control wbc = {
-		.sync_mode = WB_SYNC_NONE,
-	};
-
-	/* try to allocate swap cache folio */
-	mpol = get_task_policy(current);
-	folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol,
-				NO_INTERLEAVE_INDEX, &folio_was_allocated, true);
-	if (!folio)
-		return -ENOMEM;
-
-	/*
-	 * Found an existing folio, we raced with swapin or concurrent
-	 * shrinker. We generally writeback cold folios from zswap, and
-	 * swapin means the folio just became hot, so skip this folio.
-	 * For unlikely concurrent shrinker case, it will be unlinked
-	 * and freed when invalidated by the concurrent shrinker anyway.
-	 */
-	if (!folio_was_allocated) {
-		folio_put(folio);
-		return -EEXIST;
-	}
-
-	/*
-	 * folio is locked, and the swapcache is now secured against
-	 * concurrent swapping to and from the slot. Verify that the
-	 * swap entry hasn't been invalidated and recycled behind our
-	 * backs (our zswap_entry reference doesn't prevent that), to
-	 * avoid overwriting a new swap folio with old compressed data.
-	 */
-	tree = swap_zswap_tree(swpentry);
-	spin_lock(&tree->lock);
-	if (zswap_rb_search(&tree->rbroot, swp_offset(swpentry)) != entry) {
-		spin_unlock(&tree->lock);
-		delete_from_swap_cache(folio);
-		folio_unlock(folio);
-		folio_put(folio);
-		return -ENOMEM;
-	}
-
-	/* Safe to deref entry after the entry is verified above. */
-	zswap_entry_get(entry);
-	spin_unlock(&tree->lock);
-
-	zswap_decompress(entry, &folio->page);
-
-	count_vm_event(ZSWPWB);
-	if (entry->objcg)
-		count_objcg_event(entry->objcg, ZSWPWB);
-
-	spin_lock(&tree->lock);
-	zswap_invalidate_entry(tree, entry);
-	zswap_entry_put(entry);
-	spin_unlock(&tree->lock);
-
-	/* folio is up to date */
-	folio_mark_uptodate(folio);
-
-	/* move it to the tail of the inactive list after end_writeback */
-	folio_set_reclaim(folio);
-
-	/* start writeback */
-	__swap_writepage(folio, &wbc);
-	folio_put(folio);
-
-	return 0;
-}
-
 static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
 {
 	unsigned long *page;

From 7f4e0cf40b8d440755a3c038c4c4c123fe778223 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 29 Jan 2024 20:36:56 -0500
Subject: [PATCH 540/707] mm: zswap: function ordering: shrink_memcg_cb

shrink_memcg_cb() is called by the shrinker and is based on
zswap_writeback_entry(). Move it in between. Save one fwd decl.

Link: https://lkml.kernel.org/r/20240130014208.565554-21-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Chengming Zhou <zhouchengming@bytedance.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/zswap.c | 125 ++++++++++++++++++++++++++---------------------------
 1 file changed, 61 insertions(+), 64 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index 0cb3437d47eba3..4aea03285532b7 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1254,7 +1254,67 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 * shrinker functions
 **********************************/
 static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
-				       spinlock_t *lock, void *arg);
+				       spinlock_t *lock, void *arg)
+{
+	struct zswap_entry *entry = container_of(item, struct zswap_entry, lru);
+	bool *encountered_page_in_swapcache = (bool *)arg;
+	swp_entry_t swpentry;
+	enum lru_status ret = LRU_REMOVED_RETRY;
+	int writeback_result;
+
+	/*
+	 * Rotate the entry to the tail before unlocking the LRU,
+	 * so that in case of an invalidation race concurrent
+	 * reclaimers don't waste their time on it.
+	 *
+	 * If writeback succeeds, or failure is due to the entry
+	 * being invalidated by the swap subsystem, the invalidation
+	 * will unlink and free it.
+	 *
+	 * Temporary failures, where the same entry should be tried
+	 * again immediately, almost never happen for this shrinker.
+	 * We don't do any trylocking; -ENOMEM comes closest,
+	 * but that's extremely rare and doesn't happen spuriously
+	 * either. Don't bother distinguishing this case.
+	 *
+	 * But since they do exist in theory, the entry cannot just
+	 * be unlinked, or we could leak it. Hence, rotate.
+	 */
+	list_move_tail(item, &l->list);
+
+	/*
+	 * Once the lru lock is dropped, the entry might get freed. The
+	 * swpentry is copied to the stack, and entry isn't deref'd again
+	 * until the entry is verified to still be alive in the tree.
+	 */
+	swpentry = entry->swpentry;
+
+	/*
+	 * It's safe to drop the lock here because we return either
+	 * LRU_REMOVED_RETRY or LRU_RETRY.
+	 */
+	spin_unlock(lock);
+
+	writeback_result = zswap_writeback_entry(entry, swpentry);
+
+	if (writeback_result) {
+		zswap_reject_reclaim_fail++;
+		ret = LRU_RETRY;
+
+		/*
+		 * Encountering a page already in swap cache is a sign that we are shrinking
+		 * into the warmer region. We should terminate shrinking (if we're in the dynamic
+		 * shrinker context).
+		 */
+		if (writeback_result == -EEXIST && encountered_page_in_swapcache)
+			*encountered_page_in_swapcache = true;
+	} else {
+		zswap_written_back_pages++;
+	}
+
+	spin_lock(lock);
+	return ret;
+}
 
 static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
 		struct shrink_control *sc)
@@ -1354,69 +1414,6 @@ static void zswap_alloc_shrinker(struct zswap_pool *pool)
 	pool->shrinker->seeks = DEFAULT_SEEKS;
 }
 
-static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
-				       spinlock_t *lock, void *arg)
-{
-	struct zswap_entry *entry = container_of(item, struct zswap_entry, lru);
-	bool *encountered_page_in_swapcache = (bool *)arg;
-	swp_entry_t swpentry;
-	enum lru_status ret = LRU_REMOVED_RETRY;
-	int writeback_result;
-
-	/*
-	 * Rotate the entry to the tail before unlocking the LRU,
-	 * so that in case of an invalidation race concurrent
-	 * reclaimers don't waste their time on it.
-	 *
-	 * If writeback succeeds, or failure is due to the entry
-	 * being invalidated by the swap subsystem, the invalidation
-	 * will unlink and free it.
-	 *
-	 * Temporary failures, where the same entry should be tried
-	 * again immediately, almost never happen for this shrinker.
-	 * We don't do any trylocking; -ENOMEM comes closest,
-	 * but that's extremely rare and doesn't happen spuriously
-	 * either. Don't bother distinguishing this case.
-	 *
-	 * But since they do exist in theory, the entry cannot just
-	 * be unlinked, or we could leak it. Hence, rotate.
-	 */
-	list_move_tail(item, &l->list);
-
-	/*
-	 * Once the lru lock is dropped, the entry might get freed. The
-	 * swpentry is copied to the stack, and entry isn't deref'd again
-	 * until the entry is verified to still be alive in the tree.
-	 */
-	swpentry = entry->swpentry;
-
-	/*
-	 * It's safe to drop the lock here because we return either
-	 * LRU_REMOVED_RETRY or LRU_RETRY.
-	 */
-	spin_unlock(lock);
-
-	writeback_result = zswap_writeback_entry(entry, swpentry);
-
-	if (writeback_result) {
-		zswap_reject_reclaim_fail++;
-		ret = LRU_RETRY;
-
-		/*
-		 * Encountering a page already in swap cache is a sign that we are shrinking
-		 * into the warmer region. We should terminate shrinking (if we're in the dynamic
-		 * shrinker context).
-		 */
-		if (writeback_result == -EEXIST && encountered_page_in_swapcache)
-			*encountered_page_in_swapcache = true;
-	} else {
-		zswap_written_back_pages++;
-	}
-
-	spin_lock(lock);
-	return ret;
-}
-
 static int shrink_memcg(struct mem_cgroup *memcg)
 {
 	struct zswap_pool *pool;

From b4489b2e8ef5277bd8925547b3464886c1192ff8 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Mon, 29 Jan 2024 17:35:40 -0800
Subject: [PATCH 541/707] Docs/admin-guide/mm/damon/usage: use sysfs interface
 for tracepoints example

Patch series "mm/damon: make DAMON debugfs interface deprecation
unignorable".

DAMON debugfs interface is deprecated in February 2023, by commit
5445fcbc4cda ("Docs/admin-guide/mm/damon/usage: add DAMON debugfs
interface deprecation notice").  Make the fact unable to be easily ignored
by removing an example usage from the document (patch 1), renaming the
config (patch 2), adding a deprecation notice file to the debugfs
directory (patches 3-5), and renaming the debugfs file that essnetial to
be used for real use of DAMON (patches 6-9).


This patch (of 9):

DAMON tracepoints example on the DAMON usage document is using DAMON
debugfs interface, which is deprecated.  Use its alternative, DAMON sysfs
interface.

Link: https://lkml.kernel.org/r/20240130013549.89538-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20240130013549.89538-2-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Alex Shi <alexs@kernel.org>
Cc: Hu Haowen <2023002089@link.tyut.edu.cn>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Yanteng Si <siyanteng@loongson.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/mm/damon/usage.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst
index 9d23144bf98501..f2feabb4bd35c7 100644
--- a/Documentation/admin-guide/mm/damon/usage.rst
+++ b/Documentation/admin-guide/mm/damon/usage.rst
@@ -579,11 +579,11 @@ monitoring results recording.
 While the monitoring is turned on, you could record the tracepoint events and
 show results using tracepoint supporting tools like ``perf``.  For example::
 
-    # echo on > monitor_on
+    # echo on > kdamonds/0/state
     # perf record -e damon:damon_aggregated &
     # sleep 5
     # kill 9 $(pidof perf)
-    # echo off > monitor_on
+    # echo off > kdamonds/0/state
     # perf script
     kdamond.0 46568 [027] 79357.842179: damon:damon_aggregated: target_id=0 nr_regions=11 122509119488-135708762112: 0 864
     [...]

From f85fa5fa90db6f65d92f6cb99c6bcb655898154b Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Mon, 29 Jan 2024 17:35:41 -0800
Subject: [PATCH 542/707] mm/damon: rename CONFIG_DAMON_DBGFS to
 DAMON_DBGFS_DEPRECATED

DAMON debugfs interface is deprecated.  The fact has documented by commit
5445fcbc4cda ("Docs/admin-guide/mm/damon/usage: add DAMON debugfs
interface deprecation notice").  Commit 620932cd2852 ("mm/damon/dbgfs:
print DAMON debugfs interface deprecation message") further started
printing a warning message when users still use it.  Many people don't
read documentation or kernel log, though.

Make the deprecation harder to be ignored using the approach of commit
eb07c4f39c3e ("mm/slab: rename CONFIG_SLAB to CONFIG_SLAB_DEPRECATED").
'make oldconfig' with 'CONFIG_DAMON_DBGFS=y' will get a new prompt with
the explicit deprecation notice on the name.  'make olddefconfig' with
'CONFIG_DAMON_DBGFS=y' will result in not building DAMON debugfs
interface.  If there is a real user of DAMON debugfs interface, they will
complain the change to the builder.

Link: https://lkml.kernel.org/r/20240130013549.89538-3-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Alex Shi <alexs@kernel.org>
Cc: Hu Haowen <2023002089@link.tyut.edu.cn>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Yanteng Si <siyanteng@loongson.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/damon/Kconfig | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig
index 29f43fbc2eff13..fecb8172410c54 100644
--- a/mm/damon/Kconfig
+++ b/mm/damon/Kconfig
@@ -71,7 +71,7 @@ config DAMON_SYSFS_KUNIT_TEST
 
 	  If unsure, say N.
 
-config DAMON_DBGFS
+config DAMON_DBGFS_DEPRECATED
 	bool "DAMON debugfs interface (DEPRECATED!)"
 	depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS
 	help
@@ -84,6 +84,11 @@ config DAMON_DBGFS
 	  (DAMON_SYSFS).  If you depend on this and cannot move, please report
 	  your usecase to damon@lists.linux.dev and linux-mm@kvack.org.
 
+config DAMON_DBGFS
+	bool
+	default y
+	depends on DAMON_DBGFS_DEPRECATED
+
 config DAMON_DBGFS_KUNIT_TEST
 	bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS
 	depends on DAMON_DBGFS && KUNIT=y

From adf9047adfff48aa1a17257f82f2f3f08eb0ccc9 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Mon, 29 Jan 2024 17:35:42 -0800
Subject: [PATCH 543/707] mm/damon/dbgfs: implement deprecation notice file

Implement a read-only file for DAMON debugfs interface deprecation notice,
to let users who manually read/write the DAMON debugfs files from their
shell command line easily notice the fact.

Link: https://lkml.kernel.org/r/20240130013549.89538-4-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Alex Shi <alexs@kernel.org>
Cc: Hu Haowen <2023002089@link.tyut.edu.cn>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Yanteng Si <siyanteng@loongson.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/damon/dbgfs.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
index 7dac24e69e3b95..fc6ece5a9f37cc 100644
--- a/mm/damon/dbgfs.c
+++ b/mm/damon/dbgfs.c
@@ -805,6 +805,18 @@ static void dbgfs_destroy_ctx(struct damon_ctx *ctx)
 	damon_destroy_ctx(ctx);
 }
 
+static ssize_t damon_dbgfs_deprecated_read(struct file *file,
+		char __user *buf, size_t count, loff_t *ppos)
+{
+	char kbuf[512] = "DAMON debugfs interface is deprecated, "
+		     "so users should move to DAMON_SYSFS. If you cannot, "
+		     "please report your usecase to damon@lists.linux.dev and "
+		     "linux-mm@kvack.org.\n";
+	int len = strnlen(kbuf, 1024);
+
+	return simple_read_from_buffer(buf, count, ppos, kbuf, len);
+}
+
 /*
  * Make a context of @name and create a debugfs directory for it.
  *
@@ -1056,6 +1068,10 @@ static int damon_dbgfs_static_file_open(struct inode *inode, struct file *file)
 	return nonseekable_open(inode, file);
 }
 
+static const struct file_operations deprecated_fops = {
+	.read = damon_dbgfs_deprecated_read,
+};
+
 static const struct file_operations mk_contexts_fops = {
 	.open = damon_dbgfs_static_file_open,
 	.write = dbgfs_mk_context_write,
@@ -1076,9 +1092,9 @@ static int __init __damon_dbgfs_init(void)
 {
 	struct dentry *dbgfs_root;
 	const char * const file_names[] = {"mk_contexts", "rm_contexts",
-		"monitor_on"};
+		"monitor_on", "DEPRECATED"};
 	const struct file_operations *fops[] = {&mk_contexts_fops,
-		&rm_contexts_fops, &monitor_on_fops};
+		&rm_contexts_fops, &monitor_on_fops, &deprecated_fops};
 	int i;
 
 	dbgfs_root = debugfs_create_dir("damon", NULL);

From 56c233831d70a2be9ddd5bf264e8db6def138f79 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Mon, 29 Jan 2024 17:35:43 -0800
Subject: [PATCH 544/707] mm/damon/dbgfs: make debugfs interface deprecation
 message a macro

DAMON debugfs interface deprecation message is written twice, once for the
warning, and again for DEPRECATED file's read output.  De-duplicate those
by defining the message as a macro and reuse.

Link: https://lkml.kernel.org/r/20240130013549.89538-5-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Alex Shi <alexs@kernel.org>
Cc: Hu Haowen <2023002089@link.tyut.edu.cn>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Yanteng Si <siyanteng@loongson.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/damon/dbgfs.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
index fc6ece5a9f37cc..fbc0cd63f34c57 100644
--- a/mm/damon/dbgfs.c
+++ b/mm/damon/dbgfs.c
@@ -15,6 +15,11 @@
 #include <linux/page_idle.h>
 #include <linux/slab.h>
 
+#define DAMON_DBGFS_DEPRECATION_NOTICE					\
+	"DAMON debugfs interface is deprecated, so users should move "	\
+	"to DAMON_SYSFS. If you cannot, please report your usecase to "	\
+	"damon@lists.linux.dev and linux-mm@kvack.org.\n"
+
 static struct damon_ctx **dbgfs_ctxs;
 static int dbgfs_nr_ctxs;
 static struct dentry **dbgfs_dirs;
@@ -22,10 +27,7 @@ static DEFINE_MUTEX(damon_dbgfs_lock);
 
 static void damon_dbgfs_warn_deprecation(void)
 {
-	pr_warn_once("DAMON debugfs interface is deprecated, "
-		     "so users should move to DAMON_SYSFS. If you cannot, "
-		     "please report your usecase to damon@lists.linux.dev and "
-		     "linux-mm@kvack.org.\n");
+	pr_warn_once(DAMON_DBGFS_DEPRECATION_NOTICE);
 }
 
 /*
@@ -808,10 +810,7 @@ static void dbgfs_destroy_ctx(struct damon_ctx *ctx)
 static ssize_t damon_dbgfs_deprecated_read(struct file *file,
 		char __user *buf, size_t count, loff_t *ppos)
 {
-	char kbuf[512] = "DAMON debugfs interface is deprecated, "
-		     "so users should move to DAMON_SYSFS. If you cannot, "
-		     "please report your usecase to damon@lists.linux.dev and "
-		     "linux-mm@kvack.org.\n";
+	char kbuf[512] = DAMON_DBGFS_DEPRECATION_NOTICE;
 	int len = strnlen(kbuf, 1024);
 
 	return simple_read_from_buffer(buf, count, ppos, kbuf, len);

From 1364bbee446ea919e5d3f2433ba7457a986dd061 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Mon, 29 Jan 2024 17:35:44 -0800
Subject: [PATCH 545/707] Docs/admin-guide/mm/damon/usage: document
 'DEPRECATED' file of DAMON debugfs interface

Document the newly added DAMON debugfs interface deprecation notice file
on the usage document.

Link: https://lkml.kernel.org/r/20240130013549.89538-6-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Alex Shi <alexs@kernel.org>
Cc: Hu Haowen <2023002089@link.tyut.edu.cn>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Yanteng Si <siyanteng@loongson.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/mm/damon/usage.rst | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst
index f2feabb4bd35c7..5d3df18dfb9fc0 100644
--- a/Documentation/admin-guide/mm/damon/usage.rst
+++ b/Documentation/admin-guide/mm/damon/usage.rst
@@ -628,9 +628,16 @@ debugfs Interface (DEPRECATED!)
   move, please report your usecase to damon@lists.linux.dev and
   linux-mm@kvack.org.
 
-DAMON exports eight files, ``attrs``, ``target_ids``, ``init_regions``,
-``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` and
-``rm_contexts`` under its debugfs directory, ``<debugfs>/damon/``.
+DAMON exports nine files, ``DEPRECATED``, ``attrs``, ``target_ids``,
+``init_regions``, ``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts``
+and ``rm_contexts`` under its debugfs directory, ``<debugfs>/damon/``.
+
+
+``DEPRECATED`` is a read-only file for the DAMON debugfs interface deprecation
+notice.  Reading it returns the deprecation notice, as below::
+
+    # cat DEPRECATED
+    DAMON debugfs interface is deprecated, so users should move to DAMON_SYSFS. If you cannot, please report your usecase to damon@lists.linux.dev and linux-mm@kvack.org.
 
 
 Attributes

From c79e7095868ed62e5843176d95517c7262d45c2f Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Mon, 29 Jan 2024 17:35:45 -0800
Subject: [PATCH 546/707] selftets/damon: prepare for monitor_on file renaming

Following change will rename 'monitor_on' DAMON debugfs file to
'monitor_on_DEPRECATED', to make the deprecation unignorable in runtime.
Since it could make DAMON selftests fail and disturb future bisects,
update DAMON selftests to support the change.

Link: https://lkml.kernel.org/r/20240130013549.89538-7-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Alex Shi <alexs@kernel.org>
Cc: Hu Haowen <2023002089@link.tyut.edu.cn>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Yanteng Si <siyanteng@loongson.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/damon/_chk_dependency.sh     | 11 +++++++++--
 tools/testing/selftests/damon/_debugfs_common.sh     |  7 +++++++
 .../testing/selftests/damon/debugfs_empty_targets.sh | 12 ++++++++++--
 3 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/damon/_chk_dependency.sh b/tools/testing/selftests/damon/_chk_dependency.sh
index 0328ac0b5a5ed0..350f8c2b071dbc 100644
--- a/tools/testing/selftests/damon/_chk_dependency.sh
+++ b/tools/testing/selftests/damon/_chk_dependency.sh
@@ -18,7 +18,14 @@ then
 	exit $ksft_skip
 fi
 
-for f in attrs target_ids monitor_on
+if [ -f "$DBGFS/monitor_on_DEPRECATED" ]
+then
+	monitor_on_file="monitor_on_DEPRECATED"
+else
+	monitor_on_file="monitor_on"
+fi
+
+for f in attrs target_ids "$monitor_on_file"
 do
 	if [ ! -f "$DBGFS/$f" ]
 	then
@@ -28,7 +35,7 @@ do
 done
 
 permission_error="Operation not permitted"
-for f in attrs target_ids monitor_on
+for f in attrs target_ids "$monitor_on_file"
 do
 	status=$( cat "$DBGFS/$f" 2>&1 )
 	if [ "${status#*$permission_error}" != "$status" ]; then
diff --git a/tools/testing/selftests/damon/_debugfs_common.sh b/tools/testing/selftests/damon/_debugfs_common.sh
index 48989d4813ae8b..aa995516870bc8 100644
--- a/tools/testing/selftests/damon/_debugfs_common.sh
+++ b/tools/testing/selftests/damon/_debugfs_common.sh
@@ -45,6 +45,13 @@ test_content() {
 source ./_chk_dependency.sh
 
 damon_onoff="$DBGFS/monitor_on"
+if [ -f "$DBGFS/monitor_on_DEPRECATED" ]
+then
+	damon_onoff="$DBGFS/monitor_on_DEPRECATED"
+else
+	damon_onoff="$DBGFS/monitor_on"
+fi
+
 if [ $(cat "$damon_onoff") = "on" ]
 then
 	echo "monitoring is on"
diff --git a/tools/testing/selftests/damon/debugfs_empty_targets.sh b/tools/testing/selftests/damon/debugfs_empty_targets.sh
index 87aff8083822f6..effbea33dc1640 100755
--- a/tools/testing/selftests/damon/debugfs_empty_targets.sh
+++ b/tools/testing/selftests/damon/debugfs_empty_targets.sh
@@ -8,6 +8,14 @@ source _debugfs_common.sh
 
 orig_target_ids=$(cat "$DBGFS/target_ids")
 echo "" > "$DBGFS/target_ids"
-orig_monitor_on=$(cat "$DBGFS/monitor_on")
-test_write_fail "$DBGFS/monitor_on" "on" "orig_monitor_on" "empty target ids"
+
+if [ -f "$DBGFS/monitor_on_DEPRECATED" ]
+then
+	monitor_on_file="$DBGFS/monitor_on_DEPRECATED"
+else
+	monitor_on_file="$DBGFS/monitor_on"
+fi
+
+orig_monitor_on=$(cat "$monitor_on_file")
+test_write_fail "$monitor_on_file" "on" "orig_monitor_on" "empty target ids"
 echo "$orig_target_ids" > "$DBGFS/target_ids"

From d0c90dcc06509d5ee9dda96a7393d60343c63885 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Mon, 29 Jan 2024 17:35:46 -0800
Subject: [PATCH 547/707] mm/damon/dbgfs: rename monitor_on file to
 monitor_on_DEPRECATED

Kernel builders could silently enable CONFIG_DAMON_DBGFS_DEPRECATED.
Users who manually check the files under the DAMON debugfs directory could
notice the deprecation owing to the 'DEPRECATED' DAMON debugfs file, but
there could be users who doesn't manually check the files.

Make the deprecation cannot be ignored in the case by renaming
'monitor_on' file, which is essential for real use of DAMON on runtime, to
'monitor_on_DEPRECATED'.  Still users who control DAMON via only
user-space tool could ignore the deprecation, but that's what the tool
developers should take care of.  DAMON user-space tool, damo, has also
made a change[1] for the purpose.

[1] commit 935dae76f2aee ("_damon_args: Rename --damon_interface to
    --damon_interface_DEPRECATED") of https://github.com/awslabs/damo

Link: https://lkml.kernel.org/r/20240130013549.89538-8-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Alex Shi <alexs@kernel.org>
Cc: Hu Haowen <2023002089@link.tyut.edu.cn>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Yanteng Si <siyanteng@loongson.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/damon/dbgfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
index fbc0cd63f34c57..f7abbc0633aaf0 100644
--- a/mm/damon/dbgfs.c
+++ b/mm/damon/dbgfs.c
@@ -1091,7 +1091,7 @@ static int __init __damon_dbgfs_init(void)
 {
 	struct dentry *dbgfs_root;
 	const char * const file_names[] = {"mk_contexts", "rm_contexts",
-		"monitor_on", "DEPRECATED"};
+		"monitor_on_DEPRECATED", "DEPRECATED"};
 	const struct file_operations *fops[] = {&mk_contexts_fops,
 		&rm_contexts_fops, &monitor_on_fops, &deprecated_fops};
 	int i;

From 58796385178b77697b9b8dee488e5e859bf716be Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Mon, 29 Jan 2024 17:35:47 -0800
Subject: [PATCH 548/707] Docs/admin-guide/mm/damon/usage: update for
 monitor_on renaming

Update DAMON debugfs interface sections on the usage document to reflect
the fact that 'monitor_on' file has renamed to 'monitor_on_DEPRECATED'.

Link: https://lkml.kernel.org/r/20240130013549.89538-9-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Alex Shi <alexs@kernel.org>
Cc: Hu Haowen <2023002089@link.tyut.edu.cn>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Yanteng Si <siyanteng@loongson.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/mm/damon/usage.rst | 29 ++++++++++----------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst
index 5d3df18dfb9fc0..58c34e66b31b2b 100644
--- a/Documentation/admin-guide/mm/damon/usage.rst
+++ b/Documentation/admin-guide/mm/damon/usage.rst
@@ -629,8 +629,9 @@ debugfs Interface (DEPRECATED!)
   linux-mm@kvack.org.
 
 DAMON exports nine files, ``DEPRECATED``, ``attrs``, ``target_ids``,
-``init_regions``, ``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts``
-and ``rm_contexts`` under its debugfs directory, ``<debugfs>/damon/``.
+``init_regions``, ``schemes``, ``monitor_on_DEPRECATED``, ``kdamond_pid``,
+``mk_contexts`` and ``rm_contexts`` under its debugfs directory,
+``<debugfs>/damon/``.
 
 
 ``DEPRECATED`` is a read-only file for the DAMON debugfs interface deprecation
@@ -855,16 +856,16 @@ Turning On/Off
 
 Setting the files as described above doesn't incur effect unless you explicitly
 start the monitoring.  You can start, stop, and check the current status of the
-monitoring by writing to and reading from the ``monitor_on`` file.  Writing
-``on`` to the file starts the monitoring of the targets with the attributes.
-Writing ``off`` to the file stops those.  DAMON also stops if every target
-process is terminated.  Below example commands turn on, off, and check the
-status of DAMON::
+monitoring by writing to and reading from the ``monitor_on_DEPRECATED`` file.
+Writing ``on`` to the file starts the monitoring of the targets with the
+attributes.  Writing ``off`` to the file stops those.  DAMON also stops if
+every target process is terminated.  Below example commands turn on, off, and
+check the status of DAMON::
 
     # cd <debugfs>/damon
-    # echo on > monitor_on
-    # echo off > monitor_on
-    # cat monitor_on
+    # echo on > monitor_on_DEPRECATED
+    # echo off > monitor_on_DEPRECATED
+    # cat monitor_on_DEPRECATED
     off
 
 Please note that you cannot write to the above-mentioned debugfs files while
@@ -880,11 +881,11 @@ can get the pid of the thread by reading the ``kdamond_pid`` file.  When the
 monitoring is turned off, reading the file returns ``none``. ::
 
     # cd <debugfs>/damon
-    # cat monitor_on
+    # cat monitor_on_DEPRECATED
     off
     # cat kdamond_pid
     none
-    # echo on > monitor_on
+    # echo on > monitor_on_DEPRECATED
     # cat kdamond_pid
     18594
 
@@ -914,5 +915,5 @@ directory by putting the name of the context to the ``rm_contexts`` file. ::
     # ls foo
     # ls: cannot access 'foo': No such file or directory
 
-Note that ``mk_contexts``, ``rm_contexts``, and ``monitor_on`` files are in the
-root directory only.
+Note that ``mk_contexts``, ``rm_contexts``, and ``monitor_on_DEPRECATED`` files
+are in the root directory only.

From dc35e9925ea1796a80a85a85742393d96f048810 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Mon, 29 Jan 2024 17:35:48 -0800
Subject: [PATCH 549/707] Docs/translations/damon/usage: update for monitor_on
 renaming

Update DAMON debugfs interface sections on the translated usage documents
to reflect the fact that 'monitor_on' file has renamed to
'monitor_on_DEPRECATED'.

Link: https://lkml.kernel.org/r/20240130013549.89538-10-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Reviewed-by: Alex Shi <alexs@kernel.org>
Cc: Hu Haowen <2023002089@link.tyut.edu.cn>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Yanteng Si <siyanteng@loongson.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 .../zh_CN/admin-guide/mm/damon/usage.rst      | 20 +++++++++----------
 .../zh_TW/admin-guide/mm/damon/usage.rst      | 20 +++++++++----------
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst b/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst
index 17b9949d9b4357..da2745464ece45 100644
--- a/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst
+++ b/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst
@@ -344,7 +344,7 @@ debugfs接口
   :ref:`sysfs接口<sysfs_interface>`。
 
 DAMON导出了八个文件, ``attrs``, ``target_ids``, ``init_regions``,
-``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` 和
+``schemes``, ``monitor_on_DEPRECATED``, ``kdamond_pid``, ``mk_contexts`` 和
 ``rm_contexts`` under its debugfs directory, ``<debugfs>/damon/``.
 
 
@@ -521,15 +521,15 @@ DAMON导出了八个文件, ``attrs``, ``target_ids``, ``init_regions``,
 开关
 ----
 
-除非你明确地启动监测，否则如上所述的文件设置不会产生效果。你可以通过写入和读取 ``monitor_on``
+除非你明确地启动监测，否则如上所述的文件设置不会产生效果。你可以通过写入和读取 ``monitor_on_DEPRECATED``
 文件来启动、停止和检查监测的当前状态。写入 ``on`` 该文件可以启动对有属性的目标的监测。写入
 ``off`` 该文件则停止这些目标。如果每个目标进程被终止，DAMON也会停止。下面的示例命令开启、关
 闭和检查DAMON的状态::
 
     # cd <debugfs>/damon
-    # echo on > monitor_on
-    # echo off > monitor_on
-    # cat monitor_on
+    # echo on > monitor_on_DEPRECATED
+    # echo off > monitor_on_DEPRECATED
+    # cat monitor_on_DEPRECATED
     off
 
 请注意，当监测开启时，你不能写到上述的debugfs文件。如果你在DAMON运行时写到这些文件，将会返
@@ -543,11 +543,11 @@ DAMON通过一个叫做kdamond的内核线程来进行请求监测。你可以
 得该线程的 ``pid`` 。当监测被 ``关闭`` 时，读取该文件不会返回任何信息::
 
     # cd <debugfs>/damon
-    # cat monitor_on
+    # cat monitor_on_DEPRECATED
     off
     # cat kdamond_pid
     none
-    # echo on > monitor_on
+    # echo on > monitor_on_DEPRECATED
     # cat kdamond_pid
     18594
 
@@ -574,7 +574,7 @@ DAMON通过一个叫做kdamond的内核线程来进行请求监测。你可以
     # ls foo
     # ls: cannot access 'foo': No such file or directory
 
-注意， ``mk_contexts`` 、 ``rm_contexts`` 和 ``monitor_on`` 文件只在根目录下。
+注意， ``mk_contexts`` 、 ``rm_contexts`` 和 ``monitor_on_DEPRECATED`` 文件只在根目录下。
 
 
 监测结果的监测点
@@ -583,9 +583,9 @@ DAMON通过一个叫做kdamond的内核线程来进行请求监测。你可以
 DAMON通过一个tracepoint ``damon:damon_aggregated`` 提供监测结果.  当监测开启时，你可
 以记录追踪点事件，并使用追踪点支持工具如perf显示结果。比如说::
 
-    # echo on > monitor_on
+    # echo on > monitor_on_DEPRECATED
     # perf record -e damon:damon_aggregated &
     # sleep 5
     # kill 9 $(pidof perf)
-    # echo off > monitor_on
+    # echo off > monitor_on_DEPRECATED
     # perf script
diff --git a/Documentation/translations/zh_TW/admin-guide/mm/damon/usage.rst b/Documentation/translations/zh_TW/admin-guide/mm/damon/usage.rst
index 6dee719a32ea61..7464279f9b7de0 100644
--- a/Documentation/translations/zh_TW/admin-guide/mm/damon/usage.rst
+++ b/Documentation/translations/zh_TW/admin-guide/mm/damon/usage.rst
@@ -344,7 +344,7 @@ debugfs接口
   :ref:`sysfs接口<sysfs_interface>`。
 
 DAMON導出了八個文件, ``attrs``, ``target_ids``, ``init_regions``,
-``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` 和
+``schemes``, ``monitor_on_DEPRECATED``, ``kdamond_pid``, ``mk_contexts`` 和
 ``rm_contexts`` under its debugfs directory, ``<debugfs>/damon/``.
 
 
@@ -521,15 +521,15 @@ DAMON導出了八個文件, ``attrs``, ``target_ids``, ``init_regions``,
 開關
 ----
 
-除非你明確地啓動監測，否則如上所述的文件設置不會產生效果。你可以通過寫入和讀取 ``monitor_on``
+除非你明確地啓動監測，否則如上所述的文件設置不會產生效果。你可以通過寫入和讀取 ``monitor_on_DEPRECATED``
 文件來啓動、停止和檢查監測的當前狀態。寫入 ``on`` 該文件可以啓動對有屬性的目標的監測。寫入
 ``off`` 該文件則停止這些目標。如果每個目標進程被終止，DAMON也會停止。下面的示例命令開啓、關
 閉和檢查DAMON的狀態::
 
     # cd <debugfs>/damon
-    # echo on > monitor_on
-    # echo off > monitor_on
-    # cat monitor_on
+    # echo on > monitor_on_DEPRECATED
+    # echo off > monitor_on_DEPRECATED
+    # cat monitor_on_DEPRECATED
     off
 
 請注意，當監測開啓時，你不能寫到上述的debugfs文件。如果你在DAMON運行時寫到這些文件，將會返
@@ -543,11 +543,11 @@ DAMON通過一個叫做kdamond的內核線程來進行請求監測。你可以
 得該線程的 ``pid`` 。當監測被 ``關閉`` 時，讀取該文件不會返回任何信息::
 
     # cd <debugfs>/damon
-    # cat monitor_on
+    # cat monitor_on_DEPRECATED
     off
     # cat kdamond_pid
     none
-    # echo on > monitor_on
+    # echo on > monitor_on_DEPRECATED
     # cat kdamond_pid
     18594
 
@@ -574,7 +574,7 @@ DAMON通過一個叫做kdamond的內核線程來進行請求監測。你可以
     # ls foo
     # ls: cannot access 'foo': No such file or directory
 
-注意， ``mk_contexts`` 、 ``rm_contexts`` 和 ``monitor_on`` 文件只在根目錄下。
+注意， ``mk_contexts`` 、 ``rm_contexts`` 和 ``monitor_on_DEPRECATED`` 文件只在根目錄下。
 
 
 監測結果的監測點
@@ -583,10 +583,10 @@ DAMON通過一個叫做kdamond的內核線程來進行請求監測。你可以
 DAMON通過一個tracepoint ``damon:damon_aggregated`` 提供監測結果.  當監測開啓時，你可
 以記錄追蹤點事件，並使用追蹤點支持工具如perf顯示結果。比如說::
 
-    # echo on > monitor_on
+    # echo on > monitor_on_DEPRECATED
     # perf record -e damon:damon_aggregated &
     # sleep 5
     # kill 9 $(pidof perf)
-    # echo off > monitor_on
+    # echo off > monitor_on_DEPRECATED
     # perf script
 

From 22c124fdb062f9a4383337e30f4f93bb08792419 Mon Sep 17 00:00:00 2001
From: Yajun Deng <yajun.deng@linux.dev>
Date: Wed, 31 Jan 2024 11:19:13 +0800
Subject: [PATCH 550/707] mm/mmap: use SZ_{8K, 128K} helper macro

Use SZ_{8K, 128K} helper macro instead of the number in init_user_reserve
and reserve_mem_notifier. This is more readable.

Link: https://lkml.kernel.org/r/20240131031913.2058597-1-yajun.deng@linux.dev
Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/mmap.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index 476de5daf598d1..1f9e7024285866 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3845,7 +3845,7 @@ static int init_user_reserve(void)
 
 	free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));
 
-	sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
+	sysctl_user_reserve_kbytes = min(free_kbytes / 32, SZ_128K);
 	return 0;
 }
 subsys_initcall(init_user_reserve);
@@ -3866,7 +3866,7 @@ static int init_admin_reserve(void)
 
 	free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));
 
-	sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
+	sysctl_admin_reserve_kbytes = min(free_kbytes / 32, SZ_8K);
 	return 0;
 }
 subsys_initcall(init_admin_reserve);
@@ -3898,12 +3898,12 @@ static int reserve_mem_notifier(struct notifier_block *nb,
 	case MEM_ONLINE:
 		/* Default max is 128MB. Leave alone if modified by operator. */
 		tmp = sysctl_user_reserve_kbytes;
-		if (0 < tmp && tmp < (1UL << 17))
+		if (tmp > 0 && tmp < SZ_128K)
 			init_user_reserve();
 
 		/* Default max is 8MB.  Leave alone if modified by operator. */
 		tmp = sysctl_admin_reserve_kbytes;
-		if (0 < tmp && tmp < (1UL << 13))
+		if (tmp > 0 && tmp < SZ_8K)
 			init_admin_reserve();
 
 		break;

From 4f60623e13c270bab759ff8483dec2062f2a0964 Mon Sep 17 00:00:00 2001
From: Rakie Kim <rakie.kim@sk.com>
Date: Tue, 30 Jan 2024 13:20:44 -0500
Subject: [PATCH 551/707] mm/mempolicy: implement the sysfs-based
 weighted_interleave interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "mm/mempolicy: weighted interleave mempolicy and sysfs
extension", v4.

Weighted interleave is a new interleave policy intended to make use of
heterogeneous memory environments appearing with CXL.

The existing interleave mechanism does an even round-robin distribution of
memory across all nodes in a nodemask, while weighted interleave
distributes memory across nodes according to a provided weight.  (Weight =
# of page allocations per round)

Weighted interleave is intended to reduce average latency when bandwidth
is pressured - therefore increasing total throughput.

In other words: It allows greater use of the total available bandwidth in
a heterogeneous hardware environment (different hardware provides
different bandwidth capacity).

As bandwidth is pressured, latency increases - first linearly and then
exponentially.  By keeping bandwidth usage distributed according to
available bandwidth, we therefore can reduce the average latency of a
cacheline fetch.

A good explanation of the bandwidth vs latency response curve:
https://mahmoudhatem.wordpress.com/2017/11/07/memory-bandwidth-vs-latency-response-curve/

From the article:
```
Constant region:
    The latency response is fairly constant for the first 40%
    of the sustained bandwidth.
Linear region:
    In between 40% to 80% of the sustained bandwidth, the
    latency response increases almost linearly with the bandwidth
    demand of the system due to contention overhead by numerous
    memory requests.
Exponential region:
    Between 80% to 100% of the sustained bandwidth, the memory
    latency is dominated by the contention latency which can be
    as much as twice the idle latency or more.
Maximum sustained bandwidth :
    Is 65% to 75% of the theoretical maximum bandwidth.
```

As a general rule of thumb:
* If bandwidth usage is low, latency does not increase. It is
  optimal to place data in the nearest (lowest latency) device.
* If bandwidth usage is high, latency increases. It is optimal
  to place data such that bandwidth use is optimized per-device.

This is the top line goal: Provide a user a mechanism to target using the
"maximum sustained bandwidth" of each hardware component in a heterogenous
memory system.


For example, the stream benchmark demonstrates that 1:1 (default)
interleave is actively harmful, while weighted interleave can be
beneficial.  Default interleave distributes data such that too much
pressure is placed on devices with lower available bandwidth.

Stream Benchmark (vs DRAM, 1 Socket + 1 CXL Device)
Default interleave : -78% (slower than DRAM)
Global weighting   : -6% to +4% (workload dependant)
Targeted weights   : +2.5% to +4% (consistently better than DRAM)

Global means the task-policy was set (set_mempolicy), while targeted means
VMA policies were set (mbind2).  We see weighted interleave is not always
beneficial when applied globally, but is always beneficial when applied to
bandwidth-driving memory regions.


There are 3 patches in this set:
1) Implement system-global interleave weights as sysfs extension
   in mm/mempolicy.c.  These weights are RCU protected, and a
   default weight set is provided (all weights are 1 by default).

   In future work, we intend to expose an interface for HMAT/CDAT
   code to set reasonable default values based on the memory
   configuration of the system discovered at boot/hotplug.

2) A mild refactor of some interleave-logic for re-use in the
   new weighted interleave logic.

3) MPOL_WEIGHTED_INTERLEAVE extension for set_mempolicy/mbind


Included below are some performance and LTP test information,
and a sample numactl branch which can be used for testing.

= Performance summary =
(tests may have different configurations, see extended info below)
1) MLC (W2) : +38% over DRAM. +264% over default interleave.
   MLC (W5) : +40% over DRAM. +226% over default interleave.
2) Stream   : -6% to +4% over DRAM, +430% over default interleave.
3) XSBench  : +19% over DRAM. +47% over default interleave.

= LTP Testing Summary =
existing mempolicy & mbind tests: pass
mempolicy & mbind + weighted interleave (global weights): pass

= version history
v4:
- style fixes, code deduplication, simplifications, comments
- moved mempolicy->il_weight to task_struct->il_weight
- changed logic to simply move forward on il_weight=0 rather than
  treat il_weight=0 as a special value
- detect when il_prev is no longer in the nodemask and move forward
- missed weighted interleave check in alloc_pages_mpol()
- uninitialized value of nr_allocated in bulk allocator

=====================================================================
Performance tests - MLC
From - Ravi Jonnalagadda <ravis.opensrc@micron.com>

Hardware: Single-socket, multiple CXL memory expanders.

Workload:                               W2
Data Signature:                         2:1 read:write
DRAM only bandwidth (GBps):             298.8
DRAM + CXL (default interleave) (GBps): 113.04
DRAM + CXL (weighted interleave)(GBps): 412.5
Gain over DRAM only:                    1.38x
Gain over default interleave:           2.64x

Workload:                               W5
Data Signature:                         1:1 read:write
DRAM only bandwidth (GBps):             273.2
DRAM + CXL (default interleave) (GBps): 117.23
DRAM + CXL (weighted interleave)(GBps): 382.7
Gain over DRAM only:                    1.4x
Gain over default interleave:           2.26x

=====================================================================
Performance test - Stream
From - Gregory Price <gregory.price@memverge.com>

Hardware: Single socket, single CXL expander
numactl extension: https://github.com/gmprice/numactl/tree/weighted_interleave_master

Summary: 64 threads, ~18GB workload, 3GB per array, executed 100 times
Default interleave : -78% (slower than DRAM)
Global weighting   : -6% to +4% (workload dependant)
mbind2 weights     : +2.5% to +4% (consistently better than DRAM)

dram only:
numactl --cpunodebind=1 --membind=1 ./stream_c.exe --ntimes 100 --array-size 400M --malloc
Function     Direction    BestRateMBs     AvgTime      MinTime      MaxTime
Copy:        0->0            200923.2     0.032662     0.031853     0.033301
Scale:       0->0            202123.0     0.032526     0.031664     0.032970
Add:         0->0            208873.2     0.047322     0.045961     0.047884
Triad:       0->0            208523.8     0.047262     0.046038     0.048414

CXL-only:
numactl --cpunodebind=1 -w --membind=2 ./stream_c.exe --ntimes 100 --array-size 400M --malloc
Copy:        0->0             22209.7     0.288661     0.288162     0.289342
Scale:       0->0             22288.2     0.287549     0.287147     0.288291
Add:         0->0             24419.1     0.393372     0.393135     0.393735
Triad:       0->0             24484.6     0.392337     0.392083     0.394331

Based on the above, the optimal weights are ~9:1
echo 9 > /sys/kernel/mm/mempolicy/weighted_interleave/node1
echo 1 > /sys/kernel/mm/mempolicy/weighted_interleave/node2

default interleave:
numactl --cpunodebind=1 --interleave=1,2 ./stream_c.exe --ntimes 100 --array-size 400M --malloc
Copy:        0->0             44666.2     0.143671     0.143285     0.144174
Scale:       0->0             44781.6     0.143256     0.142916     0.143713
Add:         0->0             48600.7     0.197719     0.197528     0.197858
Triad:       0->0             48727.5     0.197204     0.197014     0.197439

global weighted interleave:
numactl --cpunodebind=1 -w --interleave=1,2 ./stream_c.exe --ntimes 100 --array-size 400M --malloc
Copy:        0->0            190085.9     0.034289     0.033669     0.034645
Scale:       0->0            207677.4     0.031909     0.030817     0.033061
Add:         0->0            202036.8     0.048737     0.047516     0.053409
Triad:       0->0            217671.5     0.045819     0.044103     0.046755

targted regions w/ global weights (modified stream to mbind2 malloc'd regions))
numactl --cpunodebind=1 --membind=1 ./stream_c.exe -b --ntimes 100 --array-size 400M --malloc
Copy:        0->0            205827.0     0.031445     0.031094     0.031984
Scale:       0->0            208171.8     0.031320     0.030744     0.032505
Add:         0->0            217352.0     0.045087     0.044168     0.046515
Triad:       0->0            216884.8     0.045062     0.044263     0.046982

=====================================================================
Performance tests - XSBench
From - Hyeongtak Ji <hyeongtak.ji@sk.com>

Hardware: Single socket, Single CXL memory Expander

NUMA node 0: 56 logical cores, 128 GB memory
NUMA node 2: 96 GB CXL memory
Threads:     56
Lookups:     170,000,000

Summary: +19% over DRAM. +47% over default interleave.

Performance tests - XSBench
1. dram only
$ numactl -m 0 ./XSBench -s XL –p 5000000
Runtime:     36.235 seconds
Lookups/s:   4,691,618

2. default interleave
$ numactl –i 0,2 ./XSBench –s XL –p 5000000
Runtime:     55.243 seconds
Lookups/s:   3,077,293

3. weighted interleave
numactl –w –i 0,2 ./XSBench –s XL –p 5000000
Runtime:     29.262 seconds
Lookups/s:   5,809,513

=====================================================================
LTP Tests: https://github.com/gmprice/ltp/tree/mempolicy2

= Existing tests
set_mempolicy, get_mempolicy, mbind

MPOL_WEIGHTED_INTERLEAVE added manually to test basic functionality
but did not adjust tests for weighting.  Basically the weights were
set to 1, which is the default, and it should behave the same as
MPOL_INTERLEAVE if logic is correct.

== set_mempolicy01 : passed   18, failed   0
== set_mempolicy02 : passed   10, failed   0
== set_mempolicy03 : passed   64, failed   0
== set_mempolicy04 : passed   32, failed   0
== set_mempolicy05 - n/a on non-x86
== set_mempolicy06 : passed   10, failed   0
   this is set_mempolicy02 + MPOL_WEIGHTED_INTERLEAVE
== set_mempolicy07 : passed   32, failed   0
   set_mempolicy04 + MPOL_WEIGHTED_INTERLEAVE
== get_mempolicy01 : passed   12, failed   0
   change: added MPOL_WEIGHTED_INTERLEAVE
== get_mempolicy02 : passed   2, failed   0
== mbind01 : passed   15, failed   0
   added MPOL_WEIGHTED_INTERLEAVE
== mbind02 : passed   4, failed   0
   added MPOL_WEIGHTED_INTERLEAVE
== mbind03 : passed   16, failed   0
   added MPOL_WEIGHTED_INTERLEAVE
== mbind04 : passed   48, failed   0
   added MPOL_WEIGHTED_INTERLEAVE

=====================================================================
numactl (set_mempolicy) w/ global weighting test
numactl fork: https://github.com/gmprice/numactl/tree/weighted_interleave_master

command: numactl -w --interleave=0,1 ./eatmem

result (weights 1:1):
0176a000 weighted interleave:0-1 heap anon=65793 dirty=65793 active=0 N0=32897 N1=32896 kernelpagesize_kB=4
7fceeb9ff000 weighted interleave:0-1 anon=65537 dirty=65537 active=0 N0=32768 N1=32769 kernelpagesize_kB=4
50% distribution is correct

result (weights 5:1):
01b14000 weighted interleave:0-1 heap anon=65793 dirty=65793 active=0 N0=54828 N1=10965 kernelpagesize_kB=4
7f47a1dff000 weighted interleave:0-1 anon=65537 dirty=65537 active=0 N0=54614 N1=10923 kernelpagesize_kB=4
16.666% distribution is correct

result (weights 1:5):
01f07000 weighted interleave:0-1 heap anon=65793 dirty=65793 active=0 N0=10966 N1=54827 kernelpagesize_kB=4
7f17b1dff000 weighted interleave:0-1 anon=65537 dirty=65537 active=0 N0=10923 N1=54614 kernelpagesize_kB=4
16.666% distribution is correct

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int main (void)
{
        char* mem = malloc(1024*1024*256);
        memset(mem, 1, 1024*1024*256);
        for (int i = 0; i  < ((1024*1024*256)/4096); i++)
        {
                mem = malloc(4096);
                mem[0] = 1;
        }
        printf("done\n");
        getchar();
        return 0;
}

=====================================================================


This patch (of 3):

This patch provides a way to set interleave weight information under sysfs
at /sys/kernel/mm/mempolicy/weighted_interleave/nodeN

The sysfs structure is designed as follows.

  $ tree /sys/kernel/mm/mempolicy/
  /sys/kernel/mm/mempolicy/ [1]
  └── weighted_interleave [2]
      ├── node0 [3]
      └── node1

Each file above can be explained as follows.

[1] mm/mempolicy: configuration interface for mempolicy subsystem

[2] weighted_interleave/: config interface for weighted interleave policy

[3] weighted_interleave/nodeN: weight for nodeN

If a node value is set to `0`, the system-default value will be used.
As of this patch, the system-default for all nodes is always 1.

Link: https://lkml.kernel.org/r/20240130182046.74278-1-gregory.price@memverge.com
Link: https://lkml.kernel.org/r/20240130182046.74278-2-gregory.price@memverge.com
Suggested-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Rakie Kim <rakie.kim@sk.com>
Signed-off-by: Honggyu Kim <honggyu.kim@sk.com>
Co-developed-by: Gregory Price <gregory.price@memverge.com>
Signed-off-by: Gregory Price <gregory.price@memverge.com>
Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com>
Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com>
Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Frank van der Linden <fvdl@google.com>
Cc: Gregory Price <gourry.memverge@gmail.com>
Cc: Hasan Al Maruf <Hasan.Maruf@amd.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 .../ABI/testing/sysfs-kernel-mm-mempolicy     |   4 +
 ...fs-kernel-mm-mempolicy-weighted-interleave |  25 ++
 mm/mempolicy.c                                | 223 ++++++++++++++++++
 3 files changed, 252 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-mempolicy
 create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave

diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy
new file mode 100644
index 00000000000000..8ac327fd7fb6e3
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy
@@ -0,0 +1,4 @@
+What:		/sys/kernel/mm/mempolicy/
+Date:		January 2024
+Contact:	Linux memory management mailing list <linux-mm@kvack.org>
+Description:	Interface for Mempolicy
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave
new file mode 100644
index 00000000000000..0b7972de04e939
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave
@@ -0,0 +1,25 @@
+What:		/sys/kernel/mm/mempolicy/weighted_interleave/
+Date:		January 2024
+Contact:	Linux memory management mailing list <linux-mm@kvack.org>
+Description:	Configuration Interface for the Weighted Interleave policy
+
+What:		/sys/kernel/mm/mempolicy/weighted_interleave/nodeN
+Date:		January 2024
+Contact:	Linux memory management mailing list <linux-mm@kvack.org>
+Description:	Weight configuration interface for nodeN
+
+		The interleave weight for a memory node (N). These weights are
+		utilized by tasks which have set their mempolicy to
+		MPOL_WEIGHTED_INTERLEAVE.
+
+		These weights only affect new allocations, and changes at runtime
+		will not cause migrations on already allocated pages.
+
+		The minimum weight for a node is always 1.
+
+		Minimum weight: 1
+		Maximum weight: 255
+
+		Writing an empty string or `0` will reset the weight to the
+		system default. The system default may be set by the kernel
+		or drivers at boot or during hotplug events.
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 5e519163c4dcb6..e62b205b4ee55e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -131,6 +131,32 @@ static struct mempolicy default_policy = {
 
 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 
+/*
+ * iw_table is the sysfs-set interleave weight table, a value of 0 denotes
+ * system-default value should be used. A NULL iw_table also denotes that
+ * system-default values should be used. Until the system-default table
+ * is implemented, the system-default is always 1.
+ *
+ * iw_table is RCU protected
+ */
+static u8 __rcu *iw_table;
+static DEFINE_MUTEX(iw_table_lock);
+
+static u8 get_il_weight(int node)
+{
+	u8 __rcu *table;
+	u8 weight;
+
+	rcu_read_lock();
+	table = rcu_dereference(iw_table);
+	/* if no iw_table, use system default */
+	weight = table ? table[node] : 1;
+	/* if value in iw_table is 0, use system default */
+	weight = weight ? weight : 1;
+	rcu_read_unlock();
+	return weight;
+}
+
 /**
  * numa_nearest_node - Find nearest node by state
  * @node: Node id to start the search
@@ -3063,3 +3089,200 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
 			       nodemask_pr_args(&nodes));
 }
+
+#ifdef CONFIG_SYSFS
+struct iw_node_attr {
+	struct kobj_attribute kobj_attr;
+	int nid;
+};
+
+static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
+			 char *buf)
+{
+	struct iw_node_attr *node_attr;
+	u8 weight;
+
+	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
+	weight = get_il_weight(node_attr->nid);
+	return sysfs_emit(buf, "%d\n", weight);
+}
+
+static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
+			  const char *buf, size_t count)
+{
+	struct iw_node_attr *node_attr;
+	u8 __rcu *new;
+	u8 __rcu *old;
+	u8 weight = 0;
+
+	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
+	if (count == 0 || sysfs_streq(buf, ""))
+		weight = 0;
+	else if (kstrtou8(buf, 0, &weight))
+		return -EINVAL;
+
+	new = kzalloc(nr_node_ids, GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
+
+	mutex_lock(&iw_table_lock);
+	old = rcu_dereference_protected(iw_table,
+					lockdep_is_held(&iw_table_lock));
+	if (old)
+		memcpy(new, old, nr_node_ids);
+	new[node_attr->nid] = weight;
+	rcu_assign_pointer(iw_table, new);
+	mutex_unlock(&iw_table_lock);
+	synchronize_rcu();
+	kfree(old);
+	return count;
+}
+
+static struct iw_node_attr **node_attrs;
+
+static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
+				  struct kobject *parent)
+{
+	if (!node_attr)
+		return;
+	sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
+	kfree(node_attr->kobj_attr.attr.name);
+	kfree(node_attr);
+}
+
+static void sysfs_wi_release(struct kobject *wi_kobj)
+{
+	int i;
+
+	for (i = 0; i < nr_node_ids; i++)
+		sysfs_wi_node_release(node_attrs[i], wi_kobj);
+	kobject_put(wi_kobj);
+}
+
+static const struct kobj_type wi_ktype = {
+	.sysfs_ops = &kobj_sysfs_ops,
+	.release = sysfs_wi_release,
+};
+
+static int add_weight_node(int nid, struct kobject *wi_kobj)
+{
+	struct iw_node_attr *node_attr;
+	char *name;
+
+	node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL);
+	if (!node_attr)
+		return -ENOMEM;
+
+	name = kasprintf(GFP_KERNEL, "node%d", nid);
+	if (!name) {
+		kfree(node_attr);
+		return -ENOMEM;
+	}
+
+	sysfs_attr_init(&node_attr->kobj_attr.attr);
+	node_attr->kobj_attr.attr.name = name;
+	node_attr->kobj_attr.attr.mode = 0644;
+	node_attr->kobj_attr.show = node_show;
+	node_attr->kobj_attr.store = node_store;
+	node_attr->nid = nid;
+
+	if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) {
+		kfree(node_attr->kobj_attr.attr.name);
+		kfree(node_attr);
+		pr_err("failed to add attribute to weighted_interleave\n");
+		return -ENOMEM;
+	}
+
+	node_attrs[nid] = node_attr;
+	return 0;
+}
+
+static int add_weighted_interleave_group(struct kobject *root_kobj)
+{
+	struct kobject *wi_kobj;
+	int nid, err;
+
+	wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
+	if (!wi_kobj)
+		return -ENOMEM;
+
+	err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj,
+				   "weighted_interleave");
+	if (err) {
+		kfree(wi_kobj);
+		return err;
+	}
+
+	for_each_node_state(nid, N_POSSIBLE) {
+		err = add_weight_node(nid, wi_kobj);
+		if (err) {
+			pr_err("failed to add sysfs [node%d]\n", nid);
+			break;
+		}
+	}
+	if (err)
+		kobject_put(wi_kobj);
+	return 0;
+}
+
+static void mempolicy_kobj_release(struct kobject *kobj)
+{
+	u8 __rcu *old;
+
+	mutex_lock(&iw_table_lock);
+	old = rcu_dereference_protected(iw_table,
+					lockdep_is_held(&iw_table_lock));
+	rcu_assign_pointer(iw_table, NULL);
+	mutex_unlock(&iw_table_lock);
+	synchronize_rcu();
+	kfree(old);
+	kfree(node_attrs);
+	kfree(kobj);
+}
+
+static const struct kobj_type mempolicy_ktype = {
+	.release = mempolicy_kobj_release
+};
+
+static int __init mempolicy_sysfs_init(void)
+{
+	int err;
+	static struct kobject *mempolicy_kobj;
+
+	mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL);
+	if (!mempolicy_kobj) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+
+	node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *),
+			     GFP_KERNEL);
+	if (!node_attrs) {
+		err = -ENOMEM;
+		goto mempol_out;
+	}
+
+	err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj,
+				   "mempolicy");
+	if (err)
+		goto node_out;
+
+	err = add_weighted_interleave_group(mempolicy_kobj);
+	if (err) {
+		pr_err("mempolicy sysfs structure failed to initialize\n");
+		kobject_put(mempolicy_kobj);
+		return err;
+	}
+
+	return err;
+node_out:
+	kfree(node_attrs);
+mempol_out:
+	kfree(mempolicy_kobj);
+err_out:
+	pr_err("failed to add mempolicy kobject to the system\n");
+	return err;
+}
+
+late_initcall(mempolicy_sysfs_init);
+#endif /* CONFIG_SYSFS */

From 01e5bdb29292ea9200d2d6ae062fdf7c374ff0e0 Mon Sep 17 00:00:00 2001
From: Gregory Price <gourry.memverge@gmail.com>
Date: Tue, 30 Jan 2024 13:20:45 -0500
Subject: [PATCH 552/707] mm/mempolicy: refactor a read-once mechanism into a
 function for re-use

Move the use of barrier() to force policy->nodemask onto the stack into a
function `read_once_policy_nodemask` so that it may be re-used.

Link: https://lkml.kernel.org/r/20240130182046.74278-3-gregory.price@memverge.com
Signed-off-by: Gregory Price <gregory.price@memverge.com>
Suggested-by: Huang Ying <ying.huang@intel.com>
Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Frank van der Linden <fvdl@google.com>
Cc: Hasan Al Maruf <Hasan.Maruf@amd.com>
Cc: Honggyu Kim <honggyu.kim@sk.com>
Cc: Hyeongtak Ji <hyeongtak.ji@sk.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Ravi Jonnalagadda <ravis.opensrc@micron.com>
Cc: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/mempolicy.c | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e62b205b4ee55e..95682af4c478a5 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1905,6 +1905,20 @@ unsigned int mempolicy_slab_node(void)
 	}
 }
 
+static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
+					      nodemask_t *mask)
+{
+	/*
+	 * barrier stabilizes the nodemask locally so that it can be iterated
+	 * over safely without concern for changes. Allocators validate node
+	 * selection does not violate mems_allowed, so this is safe.
+	 */
+	barrier();
+	memcpy(mask, &pol->nodes, sizeof(nodemask_t));
+	barrier();
+	return nodes_weight(*mask);
+}
+
 /*
  * Do static interleaving for interleave index @ilx.  Returns the ilx'th
  * node in pol->nodes (starting from ilx=0), wrapping around if ilx
@@ -1912,20 +1926,12 @@ unsigned int mempolicy_slab_node(void)
  */
 static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
 {
-	nodemask_t nodemask = pol->nodes;
+	nodemask_t nodemask;
 	unsigned int target, nnodes;
 	int i;
 	int nid;
-	/*
-	 * The barrier will stabilize the nodemask in a register or on
-	 * the stack so that it will stop changing under the code.
-	 *
-	 * Between first_node() and next_node(), pol->nodes could be changed
-	 * by other threads. So we put pol->nodes in a local stack.
-	 */
-	barrier();
 
-	nnodes = nodes_weight(nodemask);
+	nnodes = read_once_policy_nodemask(pol, &nodemask);
 	if (!nnodes)
 		return numa_node_id();
 	target = ilx % nnodes;

From 147371fd14993ae64ef2f612eb15f37fcef12d83 Mon Sep 17 00:00:00 2001
From: Gregory Price <gourry.memverge@gmail.com>
Date: Tue, 30 Jan 2024 13:20:46 -0500
Subject: [PATCH 553/707] mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for
 weighted interleaving

When a system has multiple NUMA nodes and it becomes bandwidth hungry,
using the current MPOL_INTERLEAVE could be an wise option.

However, if those NUMA nodes consist of different types of memory such
as socket-attached DRAM and CXL/PCIe attached DRAM, the round-robin
based interleave policy does not optimally distribute data to make use
of their different bandwidth characteristics.

Instead, interleave is more effective when the allocation policy follows
each NUMA nodes' bandwidth weight rather than a simple 1:1 distribution.

This patch introduces a new memory policy, MPOL_WEIGHTED_INTERLEAVE,
enabling weighted interleave between NUMA nodes.  Weighted interleave
allows for proportional distribution of memory across multiple numa
nodes, preferably apportioned to match the bandwidth of each node.

For example, if a system has 1 CPU node (0), and 2 memory nodes (0,1),
with bandwidth of (100GB/s, 50GB/s) respectively, the appropriate
weight distribution is (2:1).

Weights for each node can be assigned via the new sysfs extension:
/sys/kernel/mm/mempolicy/weighted_interleave/

For now, the default value of all nodes will be `1`, which matches
the behavior of standard 1:1 round-robin interleave. An extension
will be added in the future to allow default values to be registered
at kernel and device bringup time.

The policy allocates a number of pages equal to the set weights. For
example, if the weights are (2,1), then 2 pages will be allocated on
node0 for every 1 page allocated on node1.

The new flag MPOL_WEIGHTED_INTERLEAVE can be used in set_mempolicy(2)
and mbind(2).

Some high level notes about the pieces of weighted interleave:

current->il_prev:
    Default interleave uses this to track the last used node.
    Weighted interleave uses this to track the *current* node, and
    when weight reaches 0 it will be used to acquire the next node.

current->il_weight:
    The active weight of the current node (current->il_prev)
    When this reaches 0, current->il_prev is set to the next node
    and current->il_weight is set to the next weight.

weighted_interleave_nodes:
    Counts the number of allocations as they occur, and applies the
    weight for the current node.  When the weight reaches 0, switch
    to the next node.  Operates only on task->mempolicy.

weighted_interleave_nid:
    Gets the total weight of the nodemask as well as each individual
    node weight, then calculates the node based on the given index.
    Operates on VMA policies.

bulk_array_weighted_interleave:
    Gets the total weight of the nodemask as well as each individual
    node weight, then calculates the number of "interleave rounds" as
    well as any delta ("partial round").  Calculates the number of
    pages for each node and allocates them.

    If a node was scheduled for interleave via interleave_nodes, the
    current weight will be allocated first.

    Operates only on the task->mempolicy.

One piece of complexity is the interaction between a recent refactor
which split the logic to acquire the "ilx" (interleave index) of an
allocation and the actual application of the interleave. If a call
to alloc_pages_mpol() were made with a weighted-interleave policy and
ilx set to NO_INTERLEAVE_INDEX, weighted_interleave_nodes() would
operate on a VMA policy - violating the description above.

An inspection of all callers of alloc_pages_mpol() shows that all
external callers set ilx to `0`, an index value, or will call
get_vma_policy() to acquire the ilx.

For example, mm/shmem.c may call into alloc_pages_mpol. The call stacks
all set (pgoff_t ilx) or end up in `get_vma_policy()`.  This enforces
the `weighted_interleave_nodes()` and `weighted_interleave_nid()`
policy requirements (task/vma respectively).

Link: https://lkml.kernel.org/r/20240130182046.74278-4-gregory.price@memverge.com
Suggested-by: Hasan Al Maruf <Hasan.Maruf@amd.com>
Signed-off-by: Gregory Price <gregory.price@memverge.com>
Co-developed-by: Rakie Kim <rakie.kim@sk.com>
Signed-off-by: Rakie Kim <rakie.kim@sk.com>
Co-developed-by: Honggyu Kim <honggyu.kim@sk.com>
Signed-off-by: Honggyu Kim <honggyu.kim@sk.com>
Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com>
Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com>
Co-developed-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com>
Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com>
Co-developed-by: Ravi Jonnalagadda <ravis.opensrc@micron.com>
Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@micron.com>
Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Frank van der Linden <fvdl@google.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 .../admin-guide/mm/numa_memory_policy.rst     |   9 +
 include/linux/sched.h                         |   1 +
 include/uapi/linux/mempolicy.h                |   1 +
 mm/mempolicy.c                                | 231 +++++++++++++++++-
 4 files changed, 238 insertions(+), 4 deletions(-)

diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst
index eca38fa81e0f98..a70f20ce1ffb4f 100644
--- a/Documentation/admin-guide/mm/numa_memory_policy.rst
+++ b/Documentation/admin-guide/mm/numa_memory_policy.rst
@@ -250,6 +250,15 @@ MPOL_PREFERRED_MANY
 	can fall back to all existing numa nodes. This is effectively
 	MPOL_PREFERRED allowed for a mask rather than a single node.
 
+MPOL_WEIGHTED_INTERLEAVE
+	This mode operates the same as MPOL_INTERLEAVE, except that
+	interleaving behavior is executed based on weights set in
+	/sys/kernel/mm/mempolicy/weighted_interleave/
+
+	Weighted interleave allocates pages on nodes according to a
+	weight.  For example if nodes [0,1] are weighted [5,2], 5 pages
+	will be allocated on node0 for every 2 pages allocated on node1.
+
 NUMA memory policy supports the following optional mode flags:
 
 MPOL_F_STATIC_NODES
diff --git a/include/linux/sched.h b/include/linux/sched.h
index cdb8ea53c365ba..e6095570a464e7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1259,6 +1259,7 @@ struct task_struct {
 	/* Protected by alloc_lock: */
 	struct mempolicy		*mempolicy;
 	short				il_prev;
+	u8				il_weight;
 	short				pref_node_fork;
 #endif
 #ifdef CONFIG_NUMA_BALANCING
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index a8963f7ef4c279..1f9bb10d1a473f 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -23,6 +23,7 @@ enum {
 	MPOL_INTERLEAVE,
 	MPOL_LOCAL,
 	MPOL_PREFERRED_MANY,
+	MPOL_WEIGHTED_INTERLEAVE,
 	MPOL_MAX,	/* always last member of enum */
 };
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 95682af4c478a5..eca3e41e7787b9 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -19,6 +19,13 @@
  *                for anonymous memory. For process policy an process counter
  *                is used.
  *
+ * weighted interleave
+ *                Allocate memory interleaved over a set of nodes based on
+ *                a set of weights (per-node), with normal fallback if it
+ *                fails.  Otherwise operates the same as interleave.
+ *                Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
+ *                on node 0 for every 1 page allocated on node 1.
+ *
  * bind           Only allocate memory on a specific set of nodes,
  *                no fallback.
  *                FIXME: memory is allocated starting with the first node
@@ -441,6 +448,10 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 		.create = mpol_new_nodemask,
 		.rebind = mpol_rebind_preferred,
 	},
+	[MPOL_WEIGHTED_INTERLEAVE] = {
+		.create = mpol_new_nodemask,
+		.rebind = mpol_rebind_nodemask,
+	},
 };
 
 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
@@ -858,8 +869,11 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 
 	old = current->mempolicy;
 	current->mempolicy = new;
-	if (new && new->mode == MPOL_INTERLEAVE)
+	if (new && (new->mode == MPOL_INTERLEAVE ||
+		    new->mode == MPOL_WEIGHTED_INTERLEAVE)) {
 		current->il_prev = MAX_NUMNODES-1;
+		current->il_weight = 0;
+	}
 	task_unlock(current);
 	mpol_put(old);
 	ret = 0;
@@ -884,6 +898,7 @@ static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
 	case MPOL_INTERLEAVE:
 	case MPOL_PREFERRED:
 	case MPOL_PREFERRED_MANY:
+	case MPOL_WEIGHTED_INTERLEAVE:
 		*nodes = pol->nodes;
 		break;
 	case MPOL_LOCAL:
@@ -968,6 +983,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 		} else if (pol == current->mempolicy &&
 				pol->mode == MPOL_INTERLEAVE) {
 			*policy = next_node_in(current->il_prev, pol->nodes);
+		} else if (pol == current->mempolicy &&
+				pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
+			if (current->il_weight)
+				*policy = current->il_prev;
+			else
+				*policy = next_node_in(current->il_prev,
+						       pol->nodes);
 		} else {
 			err = -EINVAL;
 			goto out;
@@ -1332,7 +1354,8 @@ static long do_mbind(unsigned long start, unsigned long len,
 		 * VMAs, the nodes will still be interleaved from the targeted
 		 * nodemask, but one by one may be selected differently.
 		 */
-		if (new->mode == MPOL_INTERLEAVE) {
+		if (new->mode == MPOL_INTERLEAVE ||
+		    new->mode == MPOL_WEIGHTED_INTERLEAVE) {
 			struct page *page;
 			unsigned int order;
 			unsigned long addr = -EFAULT;
@@ -1780,7 +1803,8 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
  * @vma: virtual memory area whose policy is sought
  * @addr: address in @vma for shared policy lookup
  * @order: 0, or appropriate huge_page_order for interleaving
- * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE
+ * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or
+ *       MPOL_WEIGHTED_INTERLEAVE
  *
  * Returns effective policy for a VMA at specified address.
  * Falls back to current->mempolicy or system default policy, as necessary.
@@ -1797,7 +1821,8 @@ struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
 	pol = __get_vma_policy(vma, addr, ilx);
 	if (!pol)
 		pol = get_task_policy(current);
-	if (pol->mode == MPOL_INTERLEAVE) {
+	if (pol->mode == MPOL_INTERLEAVE ||
+	    pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
 		*ilx += vma->vm_pgoff >> order;
 		*ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
 	}
@@ -1847,6 +1872,22 @@ bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
 	return zone >= dynamic_policy_zone;
 }
 
+static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
+{
+	unsigned int node = current->il_prev;
+
+	if (!current->il_weight || !node_isset(node, policy->nodes)) {
+		node = next_node_in(node, policy->nodes);
+		/* can only happen if nodemask is being rebound */
+		if (node == MAX_NUMNODES)
+			return node;
+		current->il_prev = node;
+		current->il_weight = get_il_weight(node);
+	}
+	current->il_weight--;
+	return node;
+}
+
 /* Do dynamic interleaving for a process */
 static unsigned int interleave_nodes(struct mempolicy *policy)
 {
@@ -1881,6 +1922,9 @@ unsigned int mempolicy_slab_node(void)
 	case MPOL_INTERLEAVE:
 		return interleave_nodes(policy);
 
+	case MPOL_WEIGHTED_INTERLEAVE:
+		return weighted_interleave_nodes(policy);
+
 	case MPOL_BIND:
 	case MPOL_PREFERRED_MANY:
 	{
@@ -1919,6 +1963,45 @@ static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
 	return nodes_weight(*mask);
 }
 
+static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
+{
+	nodemask_t nodemask;
+	unsigned int target, nr_nodes;
+	u8 __rcu *table;
+	unsigned int weight_total = 0;
+	u8 weight;
+	int nid;
+
+	nr_nodes = read_once_policy_nodemask(pol, &nodemask);
+	if (!nr_nodes)
+		return numa_node_id();
+
+	rcu_read_lock();
+	table = rcu_dereference(iw_table);
+	/* calculate the total weight */
+	for_each_node_mask(nid, nodemask) {
+		/* detect system default usage */
+		weight = table ? table[nid] : 1;
+		weight = weight ? weight : 1;
+		weight_total += weight;
+	}
+
+	/* Calculate the node offset based on totals */
+	target = ilx % weight_total;
+	nid = first_node(nodemask);
+	while (target) {
+		/* detect system default usage */
+		weight = table ? table[nid] : 1;
+		weight = weight ? weight : 1;
+		if (target < weight)
+			break;
+		target -= weight;
+		nid = next_node_in(nid, nodemask);
+	}
+	rcu_read_unlock();
+	return nid;
+}
+
 /*
  * Do static interleaving for interleave index @ilx.  Returns the ilx'th
  * node in pol->nodes (starting from ilx=0), wrapping around if ilx
@@ -1979,6 +2062,11 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
 		*nid = (ilx == NO_INTERLEAVE_INDEX) ?
 			interleave_nodes(pol) : interleave_nid(pol, ilx);
 		break;
+	case MPOL_WEIGHTED_INTERLEAVE:
+		*nid = (ilx == NO_INTERLEAVE_INDEX) ?
+			weighted_interleave_nodes(pol) :
+			weighted_interleave_nid(pol, ilx);
+		break;
 	}
 
 	return nodemask;
@@ -2040,6 +2128,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
 	case MPOL_PREFERRED_MANY:
 	case MPOL_BIND:
 	case MPOL_INTERLEAVE:
+	case MPOL_WEIGHTED_INTERLEAVE:
 		*mask = mempolicy->nodes;
 		break;
 
@@ -2140,6 +2229,7 @@ struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
 		 * node in its nodemask, we allocate the standard way.
 		 */
 		if (pol->mode != MPOL_INTERLEAVE &&
+		    pol->mode != MPOL_WEIGHTED_INTERLEAVE &&
 		    (!nodemask || node_isset(nid, *nodemask))) {
 			/*
 			 * First, try to allocate THP only on local node, but
@@ -2275,6 +2365,127 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
 	return total_allocated;
 }
 
+static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
+		struct mempolicy *pol, unsigned long nr_pages,
+		struct page **page_array)
+{
+	struct task_struct *me = current;
+	unsigned long total_allocated = 0;
+	unsigned long nr_allocated = 0;
+	unsigned long rounds;
+	unsigned long node_pages, delta;
+	u8 __rcu *table, *weights, weight;
+	unsigned int weight_total = 0;
+	unsigned long rem_pages = nr_pages;
+	nodemask_t nodes;
+	int nnodes, node, next_node;
+	int resume_node = MAX_NUMNODES - 1;
+	u8 resume_weight = 0;
+	int prev_node;
+	int i;
+
+	if (!nr_pages)
+		return 0;
+
+	nnodes = read_once_policy_nodemask(pol, &nodes);
+	if (!nnodes)
+		return 0;
+
+	/* Continue allocating from most recent node and adjust the nr_pages */
+	node = me->il_prev;
+	weight = me->il_weight;
+	if (weight && node_isset(node, nodes)) {
+		node_pages = min(rem_pages, weight);
+		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
+						  NULL, page_array);
+		page_array += nr_allocated;
+		total_allocated += nr_allocated;
+		/* if that's all the pages, no need to interleave */
+		if (rem_pages < weight) {
+			/* stay on current node, adjust il_weight */
+			me->il_weight -= rem_pages;
+			return total_allocated;
+		} else if (rem_pages == weight) {
+			/* move to next node / weight */
+			me->il_prev = next_node_in(node, nodes);
+			me->il_weight = get_il_weight(next_node);
+			return total_allocated;
+		}
+		/* Otherwise we adjust remaining pages, continue from there */
+		rem_pages -= weight;
+	}
+	/* clear active weight in case of an allocation failure */
+	me->il_weight = 0;
+	prev_node = node;
+
+	/* create a local copy of node weights to operate on outside rcu */
+	weights = kzalloc(nr_node_ids, GFP_KERNEL);
+	if (!weights)
+		return total_allocated;
+
+	rcu_read_lock();
+	table = rcu_dereference(iw_table);
+	if (table)
+		memcpy(weights, table, nr_node_ids);
+	rcu_read_unlock();
+
+	/* calculate total, detect system default usage */
+	for_each_node_mask(node, nodes) {
+		if (!weights[node])
+			weights[node] = 1;
+		weight_total += weights[node];
+	}
+
+	/*
+	 * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
+	 * Track which node weighted interleave should resume from.
+	 *
+	 * if (rounds > 0) and (delta == 0), resume_node will always be
+	 * the node following prev_node and its weight.
+	 */
+	rounds = rem_pages / weight_total;
+	delta = rem_pages % weight_total;
+	resume_node = next_node_in(prev_node, nodes);
+	resume_weight = weights[resume_node];
+	for (i = 0; i < nnodes; i++) {
+		node = next_node_in(prev_node, nodes);
+		weight = weights[node];
+		node_pages = weight * rounds;
+		/* If a delta exists, add this node's portion of the delta */
+		if (delta > weight) {
+			node_pages += weight;
+			delta -= weight;
+		} else if (delta) {
+			node_pages += delta;
+			/* delta may deplete on a boundary or w/ a remainder */
+			if (delta == weight) {
+				/* boundary: resume from next node/weight */
+				resume_node = next_node_in(node, nodes);
+				resume_weight = weights[resume_node];
+			} else {
+				/* remainder: resume this node w/ remainder */
+				resume_node = node;
+				resume_weight = weight - delta;
+			}
+			delta = 0;
+		}
+		/* node_pages can be 0 if an allocation fails and rounds == 0 */
+		if (!node_pages)
+			break;
+		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
+						  NULL, page_array);
+		page_array += nr_allocated;
+		total_allocated += nr_allocated;
+		if (total_allocated == nr_pages)
+			break;
+		prev_node = node;
+	}
+	me->il_prev = resume_node;
+	me->il_weight = resume_weight;
+	kfree(weights);
+	return total_allocated;
+}
+
 static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
 		struct mempolicy *pol, unsigned long nr_pages,
 		struct page **page_array)
@@ -2315,6 +2526,10 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
 		return alloc_pages_bulk_array_interleave(gfp, pol,
 							 nr_pages, page_array);
 
+	if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
+		return alloc_pages_bulk_array_weighted_interleave(
+				  gfp, pol, nr_pages, page_array);
+
 	if (pol->mode == MPOL_PREFERRED_MANY)
 		return alloc_pages_bulk_array_preferred_many(gfp,
 				numa_node_id(), pol, nr_pages, page_array);
@@ -2390,6 +2605,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 	case MPOL_INTERLEAVE:
 	case MPOL_PREFERRED:
 	case MPOL_PREFERRED_MANY:
+	case MPOL_WEIGHTED_INTERLEAVE:
 		return !!nodes_equal(a->nodes, b->nodes);
 	case MPOL_LOCAL:
 		return true;
@@ -2526,6 +2742,10 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
 		polnid = interleave_nid(pol, ilx);
 		break;
 
+	case MPOL_WEIGHTED_INTERLEAVE:
+		polnid = weighted_interleave_nid(pol, ilx);
+		break;
+
 	case MPOL_PREFERRED:
 		if (node_isset(curnid, pol->nodes))
 			goto out;
@@ -2900,6 +3120,7 @@ static const char * const policy_modes[] =
 	[MPOL_PREFERRED]  = "prefer",
 	[MPOL_BIND]       = "bind",
 	[MPOL_INTERLEAVE] = "interleave",
+	[MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
 	[MPOL_LOCAL]      = "local",
 	[MPOL_PREFERRED_MANY]  = "prefer (many)",
 };
@@ -2959,6 +3180,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
 		}
 		break;
 	case MPOL_INTERLEAVE:
+	case MPOL_WEIGHTED_INTERLEAVE:
 		/*
 		 * Default to online nodes with memory if no nodelist
 		 */
@@ -3069,6 +3291,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 	case MPOL_PREFERRED_MANY:
 	case MPOL_BIND:
 	case MPOL_INTERLEAVE:
+	case MPOL_WEIGHTED_INTERLEAVE:
 		nodes = pol->nodes;
 		break;
 	default:

From 041aafc133cc4c8826c58d8c0da7635cce9444cd Mon Sep 17 00:00:00 2001
From: Gregory Price <gregory.price@memverge.com>
Date: Wed, 31 Jan 2024 00:12:24 -0500
Subject: [PATCH 554/707] 
 mm-mempolicy-introduce-mpol_weighted_interleave-for-weighted-interleaving-fix.

kill next_node in favor of operating directly on il_prev

Link: https://lkml.kernel.org/r/ZbnWuB4dRCEFRz2m@memverge.com
Signed-off-by: Gregory Price <gregory.price@memverge.com>
Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Frank van der Linden <fvdl@google.com>
Cc: Hasan Al Maruf <Hasan.Maruf@amd.com>
Cc: Honggyu Kim <honggyu.kim@sk.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Hyeongtak Ji <hyeongtak.ji@sk.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Ravi Jonnalagadda <ravis.opensrc@micron.com>
Cc: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/mempolicy.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index eca3e41e7787b9..bbf7cd328ea6eb 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2378,7 +2378,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
 	unsigned int weight_total = 0;
 	unsigned long rem_pages = nr_pages;
 	nodemask_t nodes;
-	int nnodes, node, next_node;
+	int nnodes, node;
 	int resume_node = MAX_NUMNODES - 1;
 	u8 resume_weight = 0;
 	int prev_node;
@@ -2408,7 +2408,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
 		} else if (rem_pages == weight) {
 			/* move to next node / weight */
 			me->il_prev = next_node_in(node, nodes);
-			me->il_weight = get_il_weight(next_node);
+			me->il_weight = get_il_weight(me->il_prev);
 			return total_allocated;
 		}
 		/* Otherwise we adjust remaining pages, continue from there */

From f5e62bcf472e53a5e37cf9ebe46d5067442541f1 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Tue, 30 Jan 2024 11:34:32 +0100
Subject: [PATCH 555/707] arm: ptdump: rename CONFIG_DEBUG_WX to
 CONFIG_ARM_DEBUG_WX

Patch series "mm: ptdump: Refactor CONFIG_DEBUG_WX and check_wx_pages
debugfs attribute", v2.

This series refactors CONFIG_DEBUG_WX for the 5 architectures implementing
CONFIG_GENERIC_PTDUMP

First rename stuff in ARM which uses similar names while not implementing
CONFIG_GENERIC_PTDUMP.

Then define a generic version of debug_checkwx() that calls
ptdump_check_wx() when CONFIG_DEBUG_WX is set.  Call it immediately after
calling mark_rodata_ro() instead of calling it at the end of every
mark_rodata_ro().

Then implement a debugfs attribute that can be used to trigger a W^X test
at anytime and regardless of CONFIG_DEBUG_WX


CONFIG_DEBUG_WX is a core option defined in mm/Kconfig.debug

To avoid any future conflict, rename ARM version into CONFIG_ARM_DEBUG_WX.


Link: https://lore.kernel.org/lkml/20200422152656.GF676@willie-the-truck/T/#m802eaf33efd6f8d575939d157301b35ac0d4a64f
Link: https://github.com/KSPP/linux/issues/35
Link: https://lkml.kernel.org/r/cover.1706610398.git.christophe.leroy@csgroup.eu
Link: https://lkml.kernel.org/r/fa297aa90caeb61eee2b70c6c5897a2ab58a9562.1706610398.git.christophe.leroy@csgroup.eu
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: "Aneesh Kumar K.V (IBM)" <aneesh.kumar@kernel.org>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Greg KH <greg@kroah.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Phong Tran <tranmanphong@gmail.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Steven Price <steven.price@arm.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm/Kconfig.debug               | 2 +-
 arch/arm/configs/aspeed_g4_defconfig | 2 +-
 arch/arm/configs/aspeed_g5_defconfig | 2 +-
 arch/arm/include/asm/ptdump.h        | 6 +++---
 arch/arm/mm/init.c                   | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/arm/Kconfig.debug b/arch/arm/Kconfig.debug
index 5fbbac1b708b0a..f1fc278081d035 100644
--- a/arch/arm/Kconfig.debug
+++ b/arch/arm/Kconfig.debug
@@ -17,7 +17,7 @@ config ARM_PTDUMP_DEBUGFS
 	  kernel.
 	  If in doubt, say "N"
 
-config DEBUG_WX
+config ARM_DEBUG_WX
 	bool "Warn on W+X mappings at boot"
 	depends on MMU
 	select ARM_PTDUMP_CORE
diff --git a/arch/arm/configs/aspeed_g4_defconfig b/arch/arm/configs/aspeed_g4_defconfig
index b3dc0465796f9a..28b724d59e7e23 100644
--- a/arch/arm/configs/aspeed_g4_defconfig
+++ b/arch/arm/configs/aspeed_g4_defconfig
@@ -252,7 +252,7 @@ CONFIG_DEBUG_INFO_REDUCED=y
 CONFIG_GDB_SCRIPTS=y
 CONFIG_STRIP_ASM_SYMS=y
 CONFIG_DEBUG_FS=y
-CONFIG_DEBUG_WX=y
+CONFIG_ARM_DEBUG_WX=y
 CONFIG_SCHED_STACK_END_CHECK=y
 CONFIG_PANIC_ON_OOPS=y
 CONFIG_PANIC_TIMEOUT=-1
diff --git a/arch/arm/configs/aspeed_g5_defconfig b/arch/arm/configs/aspeed_g5_defconfig
index 3fdf4dbfdea5db..61cee1e7ebea61 100644
--- a/arch/arm/configs/aspeed_g5_defconfig
+++ b/arch/arm/configs/aspeed_g5_defconfig
@@ -302,7 +302,7 @@ CONFIG_DEBUG_INFO_REDUCED=y
 CONFIG_GDB_SCRIPTS=y
 CONFIG_STRIP_ASM_SYMS=y
 CONFIG_DEBUG_FS=y
-CONFIG_DEBUG_WX=y
+CONFIG_ARM_DEBUG_WX=y
 CONFIG_SCHED_STACK_END_CHECK=y
 CONFIG_PANIC_ON_OOPS=y
 CONFIG_PANIC_TIMEOUT=-1
diff --git a/arch/arm/include/asm/ptdump.h b/arch/arm/include/asm/ptdump.h
index aad1d034136cea..46a4575146ee85 100644
--- a/arch/arm/include/asm/ptdump.h
+++ b/arch/arm/include/asm/ptdump.h
@@ -32,10 +32,10 @@ void ptdump_check_wx(void);
 
 #endif /* CONFIG_ARM_PTDUMP_CORE */
 
-#ifdef CONFIG_DEBUG_WX
-#define debug_checkwx() ptdump_check_wx()
+#ifdef CONFIG_ARM_DEBUG_WX
+#define arm_debug_checkwx() ptdump_check_wx()
 #else
-#define debug_checkwx() do { } while (0)
+#define arm_debug_checkwx() do { } while (0)
 #endif
 
 #endif /* __ASM_PTDUMP_H */
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index a42e4cd11db294..4c3d78691279d3 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -458,7 +458,7 @@ static int __mark_rodata_ro(void *unused)
 void mark_rodata_ro(void)
 {
 	stop_machine(__mark_rodata_ro, NULL, NULL);
-	debug_checkwx();
+	arm_debug_checkwx();
 }
 
 #else

From b7062272ee15bad310018ee4538a9b229e70062f Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Tue, 30 Jan 2024 11:34:33 +0100
Subject: [PATCH 556/707] arm64, powerpc, riscv, s390, x86: ptdump: refactor
 CONFIG_DEBUG_WX

All architectures using the core ptdump functionality also implement
CONFIG_DEBUG_WX, and they all do it more or less the same way, with a
function called debug_checkwx() that is called by mark_rodata_ro(), which
is a substitute to ptdump_check_wx() when CONFIG_DEBUG_WX is set and a
no-op otherwise.

Refactor by centrally defining debug_checkwx() in linux/ptdump.h and call
debug_checkwx() immediately after calling mark_rodata_ro() instead of
calling it at the end of every mark_rodata_ro().

On x86_32, mark_rodata_ro() first checks __supported_pte_mask has _PAGE_NX
before calling debug_checkwx().  Now the check is inside the callee
ptdump_walk_pgd_level_checkwx().

On powerpc_64, mark_rodata_ro() bails out early before calling
ptdump_check_wx() when the MMU doesn't have KERNEL_RO feature.  The check
is now also done in ptdump_check_wx() as it is called outside
mark_rodata_ro().

Link: https://lkml.kernel.org/r/a59b102d7964261d31ead0316a9f18628e4e7a8e.1706610398.git.christophe.leroy@csgroup.eu
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: "Aneesh Kumar K.V (IBM)" <aneesh.kumar@kernel.org>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Greg KH <greg@kroah.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Phong Tran <tranmanphong@gmail.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Steven Price <steven.price@arm.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm64/include/asm/ptdump.h |  7 -------
 arch/arm64/mm/mmu.c             |  2 --
 arch/powerpc/mm/mmu_decl.h      |  6 ------
 arch/powerpc/mm/pgtable_32.c    |  4 ----
 arch/powerpc/mm/pgtable_64.c    |  3 ---
 arch/powerpc/mm/ptdump/ptdump.c |  3 +++
 arch/riscv/include/asm/ptdump.h | 22 ----------------------
 arch/riscv/mm/init.c            |  3 ---
 arch/riscv/mm/ptdump.c          |  1 -
 arch/s390/include/asm/ptdump.h  | 14 --------------
 arch/s390/mm/dump_pagetables.c  |  1 -
 arch/s390/mm/init.c             |  2 --
 arch/x86/include/asm/pgtable.h  |  3 +--
 arch/x86/mm/dump_pagetables.c   |  3 +++
 arch/x86/mm/init_32.c           |  2 --
 arch/x86/mm/init_64.c           |  2 --
 include/linux/ptdump.h          |  7 +++++++
 init/main.c                     |  2 ++
 18 files changed, 16 insertions(+), 71 deletions(-)
 delete mode 100644 arch/riscv/include/asm/ptdump.h
 delete mode 100644 arch/s390/include/asm/ptdump.h

diff --git a/arch/arm64/include/asm/ptdump.h b/arch/arm64/include/asm/ptdump.h
index 581caac525b03a..5b1701c76d1cec 100644
--- a/arch/arm64/include/asm/ptdump.h
+++ b/arch/arm64/include/asm/ptdump.h
@@ -29,13 +29,6 @@ void __init ptdump_debugfs_register(struct ptdump_info *info, const char *name);
 static inline void ptdump_debugfs_register(struct ptdump_info *info,
 					   const char *name) { }
 #endif
-void ptdump_check_wx(void);
 #endif /* CONFIG_PTDUMP_CORE */
 
-#ifdef CONFIG_DEBUG_WX
-#define debug_checkwx()	ptdump_check_wx()
-#else
-#define debug_checkwx()	do { } while (0)
-#endif
-
 #endif /* __ASM_PTDUMP_H */
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index d794b2f4b5a3cd..34f3f777c4f192 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -632,8 +632,6 @@ void mark_rodata_ro(void)
 	section_size = (unsigned long)__init_begin - (unsigned long)__start_rodata;
 	update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata,
 			    section_size, PAGE_KERNEL_RO);
-
-	debug_checkwx();
 }
 
 static void __init map_kernel_segment(pgd_t *pgdp, void *va_start, void *va_end,
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index 72341b9fb5521f..90dcc284405629 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -171,12 +171,6 @@ static inline void mmu_mark_rodata_ro(void) { }
 void __init mmu_mapin_immr(void);
 #endif
 
-#ifdef CONFIG_DEBUG_WX
-void ptdump_check_wx(void);
-#else
-static inline void ptdump_check_wx(void) { }
-#endif
-
 static inline bool debug_pagealloc_enabled_or_kfence(void)
 {
 	return IS_ENABLED(CONFIG_KFENCE) || debug_pagealloc_enabled();
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 5c02fd08d61eff..12498017da8e43 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -153,7 +153,6 @@ void mark_rodata_ro(void)
 
 	if (v_block_mapped((unsigned long)_stext + 1)) {
 		mmu_mark_rodata_ro();
-		ptdump_check_wx();
 		return;
 	}
 
@@ -166,9 +165,6 @@ void mark_rodata_ro(void)
 		   PFN_DOWN((unsigned long)_stext);
 
 	set_memory_ro((unsigned long)_stext, numpages);
-
-	// mark_initmem_nx() should have already run by now
-	ptdump_check_wx();
 }
 #endif
 
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 5ac1fd30341bb2..1b366526f4f21e 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -150,9 +150,6 @@ void mark_rodata_ro(void)
 		radix__mark_rodata_ro();
 	else
 		hash__mark_rodata_ro();
-
-	// mark_initmem_nx() should have already run by now
-	ptdump_check_wx();
 }
 
 void mark_initmem_nx(void)
diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index 2313053fe679ed..620d4917ebe8a6 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -343,6 +343,9 @@ void ptdump_check_wx(void)
 		}
 	};
 
+	if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !mmu_has_feature(MMU_FTR_KERNEL_RO))
+		return;
+
 	ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
 
 	if (st.wx_pages)
diff --git a/arch/riscv/include/asm/ptdump.h b/arch/riscv/include/asm/ptdump.h
deleted file mode 100644
index 3c9ea6dd5af7eb..00000000000000
--- a/arch/riscv/include/asm/ptdump.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2019 SiFive
- */
-
-#ifndef _ASM_RISCV_PTDUMP_H
-#define _ASM_RISCV_PTDUMP_H
-
-void ptdump_check_wx(void);
-
-#ifdef CONFIG_DEBUG_WX
-static inline void debug_checkwx(void)
-{
-	ptdump_check_wx();
-}
-#else
-static inline void debug_checkwx(void)
-{
-}
-#endif
-
-#endif /* _ASM_RISCV_PTDUMP_H */
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 245919dda91043..a69d8ce289ae1a 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -29,7 +29,6 @@
 #include <asm/io.h>
 #include <asm/numa.h>
 #include <asm/pgtable.h>
-#include <asm/ptdump.h>
 #include <asm/sections.h>
 #include <asm/soc.h>
 #include <asm/tlbflush.h>
@@ -723,8 +722,6 @@ void mark_rodata_ro(void)
 	if (IS_ENABLED(CONFIG_64BIT))
 		set_kernel_memory(lm_alias(__start_rodata), lm_alias(_data),
 				  set_memory_ro);
-
-	debug_checkwx();
 }
 #else
 static __init pgprot_t pgprot_from_va(uintptr_t va)
diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c
index 657c27bc07a769..07526560331366 100644
--- a/arch/riscv/mm/ptdump.c
+++ b/arch/riscv/mm/ptdump.c
@@ -9,7 +9,6 @@
 #include <linux/seq_file.h>
 #include <linux/ptdump.h>
 
-#include <asm/ptdump.h>
 #include <linux/pgtable.h>
 #include <asm/kasan.h>
 
diff --git a/arch/s390/include/asm/ptdump.h b/arch/s390/include/asm/ptdump.h
deleted file mode 100644
index f960b2896606a1..00000000000000
--- a/arch/s390/include/asm/ptdump.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef _ASM_S390_PTDUMP_H
-#define _ASM_S390_PTDUMP_H
-
-void ptdump_check_wx(void);
-
-static inline void debug_checkwx(void)
-{
-	if (IS_ENABLED(CONFIG_DEBUG_WX))
-		ptdump_check_wx();
-}
-
-#endif /* _ASM_S390_PTDUMP_H */
diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c
index d37a8f607b7188..8dcb4e0c71bde6 100644
--- a/arch/s390/mm/dump_pagetables.c
+++ b/arch/s390/mm/dump_pagetables.c
@@ -6,7 +6,6 @@
 #include <linux/mm.h>
 #include <linux/kfence.h>
 #include <linux/kasan.h>
-#include <asm/ptdump.h>
 #include <asm/kasan.h>
 #include <asm/abs_lowcore.h>
 #include <asm/nospec-branch.h>
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 8d9a60ccb7771a..f6391442c0c2ad 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -37,7 +37,6 @@
 #include <asm/pgalloc.h>
 #include <asm/ctlreg.h>
 #include <asm/kfence.h>
-#include <asm/ptdump.h>
 #include <asm/dma.h>
 #include <asm/abs_lowcore.h>
 #include <asm/tlb.h>
@@ -109,7 +108,6 @@ void mark_rodata_ro(void)
 
 	__set_memory_ro(__start_ro_after_init, __end_ro_after_init);
 	pr_info("Write protected read-only-after-init data: %luk\n", size >> 10);
-	debug_checkwx();
 }
 
 int set_memory_encrypted(unsigned long vaddr, int numpages)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 9d077bca6a103e..6c979028e5212f 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -32,6 +32,7 @@ void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm);
 void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm,
 				   bool user);
 void ptdump_walk_pgd_level_checkwx(void);
+#define ptdump_check_wx ptdump_walk_pgd_level_checkwx
 void ptdump_walk_user_pgd_level_checkwx(void);
 
 /*
@@ -41,10 +42,8 @@ void ptdump_walk_user_pgd_level_checkwx(void);
 #define pgprot_decrypted(prot)	__pgprot(cc_mkdec(pgprot_val(prot)))
 
 #ifdef CONFIG_DEBUG_WX
-#define debug_checkwx()		ptdump_walk_pgd_level_checkwx()
 #define debug_checkwx_user()	ptdump_walk_user_pgd_level_checkwx()
 #else
-#define debug_checkwx()		do { } while (0)
 #define debug_checkwx_user()	do { } while (0)
 #endif
 
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index e1b599ecbbc26d..0008524eebe9af 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -433,6 +433,9 @@ void ptdump_walk_user_pgd_level_checkwx(void)
 
 void ptdump_walk_pgd_level_checkwx(void)
 {
+	if (!(__supported_pte_mask & _PAGE_NX))
+		return;
+
 	ptdump_walk_pgd_level_core(NULL, &init_mm, INIT_PGD, true, false);
 }
 
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index b63403d7179df4..5c736b707caea0 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -800,6 +800,4 @@ void mark_rodata_ro(void)
 	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
 #endif
 	mark_nxdata_nx();
-	if (__supported_pte_mask & _PAGE_NX)
-		debug_checkwx();
 }
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index a0dffaca6d2bfc..ebdbcae48011d4 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1412,8 +1412,6 @@ void mark_rodata_ro(void)
 				(void *)text_end, (void *)rodata_start);
 	free_kernel_image_pages("unused kernel image (rodata/data gap)",
 				(void *)rodata_end, (void *)_sdata);
-
-	debug_checkwx();
 }
 
 /*
diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h
index 2a3a955864259a..c10513739bf951 100644
--- a/include/linux/ptdump.h
+++ b/include/linux/ptdump.h
@@ -19,5 +19,12 @@ struct ptdump_state {
 };
 
 void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd);
+void ptdump_check_wx(void);
+
+static inline void debug_checkwx(void)
+{
+	if (IS_ENABLED(CONFIG_DEBUG_WX))
+		ptdump_check_wx();
+}
 
 #endif /* _LINUX_PTDUMP_H */
diff --git a/init/main.c b/init/main.c
index e24b0780fdff7a..749a9f8d2c9b0d 100644
--- a/init/main.c
+++ b/init/main.c
@@ -99,6 +99,7 @@
 #include <linux/init_syscalls.h>
 #include <linux/stackdepot.h>
 #include <linux/randomize_kstack.h>
+#include <linux/ptdump.h>
 #include <net/net_namespace.h>
 
 #include <asm/io.h>
@@ -1408,6 +1409,7 @@ static void mark_readonly(void)
 		 */
 		rcu_barrier();
 		mark_rodata_ro();
+		debug_checkwx();
 		rodata_test();
 	} else
 		pr_info("Kernel memory protection disabled.\n");

From 938533d46be747f58fc495c04ef8cb1fa2b2f67c Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Tue, 30 Jan 2024 11:34:34 +0100
Subject: [PATCH 557/707] powerpc,s390: ptdump: define ptdump_check_wx()
 regardless of CONFIG_DEBUG_WX

Following patch will use ptdump_check_wx() regardless of CONFIG_DEBUG_WX,
so define it at all times on powerpc and s390 just like other
architectures.  Though keep the WARN_ON_ONCE() only when CONFIG_DEBUG_WX
is set.

Link: https://lkml.kernel.org/r/07bfb04c7fec58e84413e91d2533581be357a696.1706610398.git.christophe.leroy@csgroup.eu
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alexandre Ghiti <alexghiti@rivosinc.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: "Aneesh Kumar K.V (IBM)" <aneesh.kumar@kernel.org>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Greg KH <greg@kroah.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Phong Tran <tranmanphong@gmail.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Steven Price <steven.price@arm.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/powerpc/mm/ptdump/ptdump.c | 7 +++----
 arch/s390/mm/dump_pagetables.c  | 7 ++-----
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index 620d4917ebe8a6..b835c80371cd28 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -184,13 +184,14 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr)
 {
 	pte_t pte = __pte(st->current_flags);
 
-	if (!IS_ENABLED(CONFIG_DEBUG_WX) || !st->check_wx)
+	if (!st->check_wx)
 		return;
 
 	if (!pte_write(pte) || !pte_exec(pte))
 		return;
 
-	WARN_ONCE(1, "powerpc/mm: Found insecure W+X mapping at address %p/%pS\n",
+	WARN_ONCE(IS_ENABLED(CONFIG_DEBUG_WX),
+		  "powerpc/mm: Found insecure W+X mapping at address %p/%pS\n",
 		  (void *)st->start_address, (void *)st->start_address);
 
 	st->wx_pages += (addr - st->start_address) / PAGE_SIZE;
@@ -326,7 +327,6 @@ static void __init build_pgtable_complete_mask(void)
 				pg_level[i].mask |= pg_level[i].flag[j].mask;
 }
 
-#ifdef CONFIG_DEBUG_WX
 void ptdump_check_wx(void)
 {
 	struct pg_state st = {
@@ -354,7 +354,6 @@ void ptdump_check_wx(void)
 	else
 		pr_info("Checked W+X mappings: passed, no W+X pages found\n");
 }
-#endif
 
 static int __init ptdump_init(void)
 {
diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c
index 8dcb4e0c71bde6..99da5a5602a8ae 100644
--- a/arch/s390/mm/dump_pagetables.c
+++ b/arch/s390/mm/dump_pagetables.c
@@ -121,7 +121,6 @@ static void print_prot(struct seq_file *m, unsigned int pr, int level)
 
 static void note_prot_wx(struct pg_state *st, unsigned long addr)
 {
-#ifdef CONFIG_DEBUG_WX
 	if (!st->check_wx)
 		return;
 	if (st->current_prot & _PAGE_INVALID)
@@ -138,10 +137,10 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr)
 	 */
 	if (addr == PAGE_SIZE && (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)))
 		return;
-	WARN_ONCE(1, "s390/mm: Found insecure W+X mapping at address %pS\n",
+	WARN_ONCE(IS_ENABLED(CONFIG_DEBUG_WX),
+		  "s390/mm: Found insecure W+X mapping at address %pS\n",
 		  (void *)st->start_address);
 	st->wx_pages += (addr - st->start_address) / PAGE_SIZE;
-#endif /* CONFIG_DEBUG_WX */
 }
 
 static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, u64 val)
@@ -193,7 +192,6 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
 	}
 }
 
-#ifdef CONFIG_DEBUG_WX
 void ptdump_check_wx(void)
 {
 	struct pg_state st = {
@@ -226,7 +224,6 @@ void ptdump_check_wx(void)
 			(nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)) ?
 			"unexpected " : "");
 }
-#endif /* CONFIG_DEBUG_WX */
 
 #ifdef CONFIG_PTDUMP_DEBUGFS
 static int ptdump_show(struct seq_file *m, void *v)

From 7282bf6c3c75d9d06894a87e9e7783ada7e80467 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Tue, 30 Jan 2024 11:34:35 +0100
Subject: [PATCH 558/707] mm: ptdump: have ptdump_check_wx() return bool

Have ptdump_check_wx() return true when the check is successful or false
otherwise.

Link: https://lkml.kernel.org/r/7943149fe955458cb7b57cd483bf41a3aad94684.1706610398.git.christophe.leroy@csgroup.eu
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alexandre Ghiti <alexghiti@rivosinc.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: "Aneesh Kumar K.V (IBM)" <aneesh.kumar@kernel.org>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Greg KH <greg@kroah.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Phong Tran <tranmanphong@gmail.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Steven Price <steven.price@arm.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm64/mm/ptdump.c          | 11 ++++++++---
 arch/powerpc/mm/ptdump/ptdump.c | 13 +++++++++----
 arch/riscv/mm/ptdump.c          | 11 ++++++++---
 arch/s390/mm/dump_pagetables.c  | 13 +++++++++----
 arch/x86/include/asm/pgtable.h  |  2 +-
 arch/x86/mm/dump_pagetables.c   | 19 ++++++++++++-------
 include/linux/ptdump.h          |  2 +-
 7 files changed, 48 insertions(+), 23 deletions(-)

diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c
index e305b6593c4e23..696822f755827e 100644
--- a/arch/arm64/mm/ptdump.c
+++ b/arch/arm64/mm/ptdump.c
@@ -345,7 +345,7 @@ static struct ptdump_info kernel_ptdump_info = {
 	.base_addr	= PAGE_OFFSET,
 };
 
-void ptdump_check_wx(void)
+bool ptdump_check_wx(void)
 {
 	struct pg_state st = {
 		.seq = NULL,
@@ -366,11 +366,16 @@ void ptdump_check_wx(void)
 
 	ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
 
-	if (st.wx_pages || st.uxn_pages)
+	if (st.wx_pages || st.uxn_pages) {
 		pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found, %lu non-UXN pages found\n",
 			st.wx_pages, st.uxn_pages);
-	else
+
+		return false;
+	} else {
 		pr_info("Checked W+X mappings: passed, no W+X pages found\n");
+
+		return true;
+	}
 }
 
 static int __init ptdump_init(void)
diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index b835c80371cd28..9dc239967b77f7 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -327,7 +327,7 @@ static void __init build_pgtable_complete_mask(void)
 				pg_level[i].mask |= pg_level[i].flag[j].mask;
 }
 
-void ptdump_check_wx(void)
+bool ptdump_check_wx(void)
 {
 	struct pg_state st = {
 		.seq = NULL,
@@ -344,15 +344,20 @@ void ptdump_check_wx(void)
 	};
 
 	if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !mmu_has_feature(MMU_FTR_KERNEL_RO))
-		return;
+		return true;
 
 	ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
 
-	if (st.wx_pages)
+	if (st.wx_pages) {
 		pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n",
 			st.wx_pages);
-	else
+
+		return false;
+	} else {
 		pr_info("Checked W+X mappings: passed, no W+X pages found\n");
+
+		return true;
+	}
 }
 
 static int __init ptdump_init(void)
diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c
index 07526560331366..1289cc6d3700cd 100644
--- a/arch/riscv/mm/ptdump.c
+++ b/arch/riscv/mm/ptdump.c
@@ -335,7 +335,7 @@ static void ptdump_walk(struct seq_file *s, struct ptd_mm_info *pinfo)
 	ptdump_walk_pgd(&st.ptdump, pinfo->mm, NULL);
 }
 
-void ptdump_check_wx(void)
+bool ptdump_check_wx(void)
 {
 	struct pg_state st = {
 		.seq = NULL,
@@ -356,11 +356,16 @@ void ptdump_check_wx(void)
 
 	ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
 
-	if (st.wx_pages)
+	if (st.wx_pages) {
 		pr_warn("Checked W+X mappings: failed, %lu W+X pages found\n",
 			st.wx_pages);
-	else
+
+		return false;
+	} else {
 		pr_info("Checked W+X mappings: passed, no W+X pages found\n");
+
+		return true;
+	}
 }
 
 static int ptdump_show(struct seq_file *m, void *v)
diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c
index 99da5a5602a8ae..ffd07ed7b4af88 100644
--- a/arch/s390/mm/dump_pagetables.c
+++ b/arch/s390/mm/dump_pagetables.c
@@ -192,7 +192,7 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
 	}
 }
 
-void ptdump_check_wx(void)
+bool ptdump_check_wx(void)
 {
 	struct pg_state st = {
 		.ptdump = {
@@ -215,14 +215,19 @@ void ptdump_check_wx(void)
 	};
 
 	if (!MACHINE_HAS_NX)
-		return;
+		return true;
 	ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
-	if (st.wx_pages)
+	if (st.wx_pages) {
 		pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", st.wx_pages);
-	else
+
+		return false;
+	} else {
 		pr_info("Checked W+X mappings: passed, no %sW+X pages found\n",
 			(nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)) ?
 			"unexpected " : "");
+
+		return true;
+	}
 }
 
 #ifdef CONFIG_PTDUMP_DEBUGFS
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 6c979028e5212f..b50b2ef63672f4 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -31,7 +31,7 @@ struct seq_file;
 void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm);
 void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm,
 				   bool user);
-void ptdump_walk_pgd_level_checkwx(void);
+bool ptdump_walk_pgd_level_checkwx(void);
 #define ptdump_check_wx ptdump_walk_pgd_level_checkwx
 void ptdump_walk_user_pgd_level_checkwx(void);
 
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 0008524eebe9af..c58c01f560fd87 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -362,7 +362,7 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
 	}
 }
 
-static void ptdump_walk_pgd_level_core(struct seq_file *m,
+bool void ptdump_walk_pgd_level_core(struct seq_file *m,
 				       struct mm_struct *mm, pgd_t *pgd,
 				       bool checkwx, bool dmesg)
 {
@@ -391,12 +391,17 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m,
 	ptdump_walk_pgd(&st.ptdump, mm, pgd);
 
 	if (!checkwx)
-		return;
-	if (st.wx_pages)
+		return true;
+	if (st.wx_pages) {
 		pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n",
 			st.wx_pages);
-	else
+
+		return false;
+	} else {
 		pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n");
+
+		return true;
+	}
 }
 
 void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm)
@@ -431,12 +436,12 @@ void ptdump_walk_user_pgd_level_checkwx(void)
 #endif
 }
 
-void ptdump_walk_pgd_level_checkwx(void)
+bool ptdump_walk_pgd_level_checkwx(void)
 {
 	if (!(__supported_pte_mask & _PAGE_NX))
-		return;
+		return true;
 
-	ptdump_walk_pgd_level_core(NULL, &init_mm, INIT_PGD, true, false);
+	return ptdump_walk_pgd_level_core(NULL, &init_mm, INIT_PGD, true, false);
 }
 
 static int __init pt_dump_init(void)
diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h
index c10513739bf951..953b61696ccf7b 100644
--- a/include/linux/ptdump.h
+++ b/include/linux/ptdump.h
@@ -19,7 +19,7 @@ struct ptdump_state {
 };
 
 void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd);
-void ptdump_check_wx(void);
+bool ptdump_check_wx(void);
 
 static inline void debug_checkwx(void)
 {

From 24976a6ac9176a30e731cc0c2cd790d3336dd75c Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Wed, 31 Jan 2024 01:13:01 -0800
Subject: [PATCH 559/707] mm-ptdump-have-ptdump_check_wx-return-bool-fix

fix a couple of build issues (x86_64 allmodconfig)

Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alexandre Ghiti <alexghiti@rivosinc.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: "Aneesh Kumar K.V (IBM)" <aneesh.kumar@kernel.org>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Greg KH <greg@kroah.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Phong Tran <tranmanphong@gmail.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Steven Price <steven.price@arm.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/x86/mm/dump_pagetables.c | 6 +++---
 include/linux/ptdump.h        | 3 +++
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index c58c01f560fd87..35b2cfd4791418 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -362,9 +362,9 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
 	}
 }
 
-bool void ptdump_walk_pgd_level_core(struct seq_file *m,
-				       struct mm_struct *mm, pgd_t *pgd,
-				       bool checkwx, bool dmesg)
+bool ptdump_walk_pgd_level_core(struct seq_file *m,
+				struct mm_struct *mm, pgd_t *pgd,
+				bool checkwx, bool dmesg)
 {
 	const struct ptdump_range ptdump_ranges[] = {
 #ifdef CONFIG_X86_64
diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h
index 953b61696ccf7b..8dbd51ea862678 100644
--- a/include/linux/ptdump.h
+++ b/include/linux/ptdump.h
@@ -18,6 +18,9 @@ struct ptdump_state {
 	const struct ptdump_range *range;
 };
 
+bool ptdump_walk_pgd_level_core(struct seq_file *m,
+				struct mm_struct *mm, pgd_t *pgd,
+				bool checkwx, bool dmesg);
 void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd);
 bool ptdump_check_wx(void);
 

From b654b75b3060bc5ce65cda6fa110fc6a321de10c Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Tue, 30 Jan 2024 11:34:36 +0100
Subject: [PATCH 560/707] mm: ptdump: add check_wx_pages debugfs attribute

Add a readable attribute in debugfs to trigger a W^X pages check at any
time.

To trigger the test, just read /sys/kernel/debug/check_wx_pages It will
report FAILED if the test failed, SUCCESS otherwise.

Detailed result is provided into dmesg.

Link: https://lkml.kernel.org/r/e947fb1a9f3f5466344823e532d343ff194ae03d.1706610398.git.christophe.leroy@csgroup.eu
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alexandre Ghiti <alexghiti@rivosinc.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: "Aneesh Kumar K.V (IBM)" <aneesh.kumar@kernel.org>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Greg KH <greg@kroah.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Phong Tran <tranmanphong@gmail.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Steven Price <steven.price@arm.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/ptdump.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/mm/ptdump.c b/mm/ptdump.c
index 03c1bdae4a4368..106e1d66e9f9ee 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include <linux/pagewalk.h>
+#include <linux/debugfs.h>
 #include <linux/ptdump.h>
 #include <linux/kasan.h>
 
@@ -163,3 +164,24 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd)
 	/* Flush out the last page */
 	st->note_page(st, 0, -1, 0);
 }
+
+static int check_wx_show(struct seq_file *m, void *v)
+{
+	if (ptdump_check_wx())
+		seq_puts(m, "SUCCESS\n");
+	else
+		seq_puts(m, "FAILED\n");
+
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(check_wx);
+
+static int ptdump_debugfs_init(void)
+{
+	debugfs_create_file("check_wx_pages", 0400, NULL, NULL, &check_wx_fops);
+
+	return 0;
+}
+
+device_initcall(ptdump_debugfs_init);

From 2bcb578627a37b56e2fa2099be7bcd46f115bd5b Mon Sep 17 00:00:00 2001
From: Changbin Du <changbin.du@huawei.com>
Date: Mon, 29 Jan 2024 10:03:04 +0800
Subject: [PATCH 561/707] modules: wait do_free_init correctly

commit 1a7b7d922081 ("modules: Use vmalloc special flag") moves
do_free_init() into a global workqueue instead of call_rcu().  So now
rcu_barrier() can not ensure that do_free_init has completed.  We should
wait it via flush_work().

Without this fix, we still could encounter false positive reports in W+X
checking, and rcu synchronization is unnecessary.

Link: https://lkml.kernel.org/r/20240129020304.1981372-1-changbin.du@huawei.com
Fixes: 1a7b7d922081 ("modules: Use vmalloc special flag")
Signed-off-by: Changbin Du <changbin.du@huawei.com>
Cc: Xiaoyi Su <suxiaoyi@huawei.com>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/moduleloader.h | 8 ++++++++
 init/main.c                  | 5 +++--
 kernel/module/main.c         | 5 +++++
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h
index 001b2ce83832ed..89b1e0ed981144 100644
--- a/include/linux/moduleloader.h
+++ b/include/linux/moduleloader.h
@@ -115,6 +115,14 @@ int module_finalize(const Elf_Ehdr *hdr,
 		    const Elf_Shdr *sechdrs,
 		    struct module *mod);
 
+#ifdef CONFIG_MODULES
+void flush_module_init_free_work(void);
+#else
+static inline void flush_module_init_free_work(void)
+{
+}
+#endif
+
 /* Any cleanup needed when module leaves. */
 void module_arch_cleanup(struct module *mod);
 
diff --git a/init/main.c b/init/main.c
index 749a9f8d2c9b0d..504d417ab9f0f3 100644
--- a/init/main.c
+++ b/init/main.c
@@ -87,6 +87,7 @@
 #include <linux/sched/task.h>
 #include <linux/sched/task_stack.h>
 #include <linux/context_tracking.h>
+#include <linux/moduleloader.h>
 #include <linux/random.h>
 #include <linux/list.h>
 #include <linux/integrity.h>
@@ -1403,11 +1404,11 @@ static void mark_readonly(void)
 	if (rodata_enabled) {
 		/*
 		 * load_module() results in W+X mappings, which are cleaned
-		 * up with call_rcu().  Let's make sure that queued work is
+		 * up with init_free_wq. Let's make sure that queued work is
 		 * flushed so that we don't hit false positives looking for
 		 * insecure pages which are W+X.
 		 */
-		rcu_barrier();
+		flush_module_init_free_work();
 		mark_rodata_ro();
 		debug_checkwx();
 		rodata_test();
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 36681911c05acd..ea66b5c2a2a157 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -2489,6 +2489,11 @@ static void do_free_init(struct work_struct *w)
 	}
 }
 
+void flush_module_init_free_work(void)
+{
+	flush_work(&init_free_wq);
+}
+
 #undef MODULE_PARAM_PREFIX
 #define MODULE_PARAM_PREFIX "module."
 /* Default value for module->async_probe_requested */

From c387d996faff9bf2d105dfbfef337da82bff6c5c Mon Sep 17 00:00:00 2001
From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
Date: Thu, 11 May 2023 13:22:30 +0800
Subject: [PATCH 562/707] mm: optimization on page allocation when CMA enabled

According to current CMA utilization policy, an alloc_pages(GFP_USER)
could 'steal' UNMOVABLE & RECLAIMABLE page blocks via the help of CMA(pass
zone_watermark_ok by counting CMA in but use U&R in rmqueue), which could
lead to following alloc_pages(GFP_KERNEL) fail.  Solving this by
introducing second watermark checking for GFP_MOVABLE, which could have
the allocation use CMA when proper.

-- Free_pages(30MB)
|
|
-- WMARK_LOW(25MB)
|
-- Free_CMA(12MB)
|
|
--

Link: https://lkml.kernel.org/r/20231016071245.2865233-1-zhaoyang.huang@unisoc.com
Link: https://lkml.kernel.org/r/1683782550-25799-1-git-send-email-zhaoyang.huang@unisoc.com
Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: ke.wang <ke.wang@unisoc.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Zhaoyang Huang <huangzhaoyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/page_alloc.c | 44 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 40 insertions(+), 4 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 140c4f372db169..e6e2ac722a82fd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2084,6 +2084,43 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
 
 }
 
+#ifdef CONFIG_CMA
+/*
+ * GFP_MOVABLE allocation could drain UNMOVABLE & RECLAIMABLE page blocks via
+ * the help of CMA which makes GFP_KERNEL failed. Checking if zone_watermark_ok
+ * again without ALLOC_CMA to see if to use CMA first.
+ */
+static bool use_cma_first(struct zone *zone, unsigned int order, unsigned int alloc_flags)
+{
+	unsigned long watermark;
+	bool cma_first = false;
+
+	watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
+	/* check if GFP_MOVABLE pass previous zone_watermark_ok via the help of CMA */
+	if (zone_watermark_ok(zone, order, watermark, 0, alloc_flags & (~ALLOC_CMA))) {
+		/*
+		 * Balance movable allocations between regular and CMA areas by
+		 * allocating from CMA when over half of the zone's free memory
+		 * is in the CMA area.
+		 */
+		cma_first = (zone_page_state(zone, NR_FREE_CMA_PAGES) >
+				zone_page_state(zone, NR_FREE_PAGES) / 2);
+	} else {
+		/*
+		 * watermark failed means UNMOVABLE & RECLAIMBLE is not enough
+		 * now, we should use cma first to keep them stay around the
+		 * corresponding watermark
+		 */
+		cma_first = true;
+	}
+	return cma_first;
+}
+#else
+static bool use_cma_first(struct zone *zone, unsigned int order, unsigned int alloc_flags)
+{
+	return false;
+}
+#endif
 /*
  * Do the hard work of removing an element from the buddy allocator.
  * Call me with the zone->lock already held.
@@ -2097,12 +2134,11 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype,
 	if (IS_ENABLED(CONFIG_CMA)) {
 		/*
 		 * Balance movable allocations between regular and CMA areas by
-		 * allocating from CMA when over half of the zone's free memory
-		 * is in the CMA area.
+		 * allocating from CMA base on judging zone_watermark_ok again
+		 * to see if the latest check got pass via the help of CMA
 		 */
 		if (alloc_flags & ALLOC_CMA &&
-		    zone_page_state(zone, NR_FREE_CMA_PAGES) >
-		    zone_page_state(zone, NR_FREE_PAGES) / 2) {
+			use_cma_first(zone, order, alloc_flags)) {
 			page = __rmqueue_cma_fallback(zone, order);
 			if (page)
 				return page;

From 196694dc994059d47ed08c762981020ded1700e5 Mon Sep 17 00:00:00 2001
From: Dan Schatzberg <schatzberg.dan@gmail.com>
Date: Wed, 3 Jan 2024 08:48:36 -0800
Subject: [PATCH 563/707] mm: add defines for min/max swappiness

Patch series "Add swappiness argument to memory.reclaim", v6.

This patch proposes augmenting the memory.reclaim interface with a
swappiness=<val> argument that overrides the swappiness value for that
instance of proactive reclaim.

Userspace proactive reclaimers use the memory.reclaim interface to trigger
reclaim.  The memory.reclaim interface does not allow for any way to
effect the balance of file vs anon during proactive reclaim.  The only
approach is to adjust the vm.swappiness setting.  However, there are a few
reasons we look to control the balance of file vs anon during proactive
reclaim, separately from reactive reclaim:

* Swapout should be limited to manage SSD write endurance.  In near-OOM
  situations we are fine with lots of swap-out to avoid OOMs.  As these
  are typically rare events, they have relatively little impact on write
  endurance.  However, proactive reclaim runs continuously and so its
  impact on SSD write endurance is more significant.  Therefore it is
  desireable to control swap-out for proactive reclaim separately from
  reactive reclaim

* Some userspace OOM killers like systemd-oomd[1] support OOM killing on
  swap exhaustion.  This makes sense if the swap exhaustion is triggered
  due to reactive reclaim but less so if it is triggered due to proactive
  reclaim (e.g.  one could see OOMs when free memory is ample but anon is
  just particularly cold).  Therefore, it's desireable to have proactive
  reclaim reduce or stop swap-out before the threshold at which OOM
  killing occurs.

In the case of Meta's Senpai proactive reclaimer, we adjust vm.swappiness
before writes to memory.reclaim[2].  This has been in production for
nearly two years and has addressed our needs to control proactive vs
reactive reclaim behavior but is still not ideal for a number of reasons:

* vm.swappiness is a global setting, adjusting it can race/interfere
  with other system administration that wishes to control vm.swappiness.
  In our case, we need to disable Senpai before adjusting vm.swappiness.

* vm.swappiness is stateful - so a crash or restart of Senpai can leave
  a misconfigured setting.  This requires some additional management to
  record the "desired" setting and ensure Senpai always adjusts to it.

With this patch, we avoid these downsides of adjusting vm.swappiness
globally.

Previously, this exact interface addition was proposed by Yosry[3].  In
response, Roman proposed instead an interface to specify precise
file/anon/slab reclaim amounts[4].  More recently Huan also proposed this
as well[5] and others similarly questioned if this was the proper
interface.

Previous proposals sought to use this to allow proactive reclaimers to
effectively perform a custom reclaim algorithm by issuing proactive
reclaim with different settings to control file vs anon reclaim (e.g.  to
only reclaim anon from some applications).  Responses argued that
adjusting swappiness is a poor interface for custom reclaim.

In contrast, I argue in favor of a swappiness setting not as a way to
implement custom reclaim algorithms but rather to bias the balance of anon
vs file due to differences of proactive vs reactive reclaim.  In this
context, swappiness is the existing interface for controlling this balance
and this patch simply allows for it to be configured differently for
proactive vs reactive reclaim.

Specifying explicit amounts of anon vs file pages to reclaim feels
inappropriate for this prupose.  Proactive reclaimers are un-aware of the
relative age of file vs anon for a cgroup which makes it difficult to
manage proactive reclaim of different memory pools.  A proactive reclaimer
would need some amount of anon reclaim attempts separate from the amount
of file reclaim attempts which seems brittle given that it's difficult to
observe the impact.

[1]https://www.freedesktop.org/software/systemd/man/latest/systemd-oomd.service.html
[2]https://github.com/facebookincubator/oomd/blob/main/src/oomd/plugins/Senpai.cpp#L585-L598
[3]https://lore.kernel.org/linux-mm/CAJD7tkbDpyoODveCsnaqBBMZEkDvshXJmNdbk51yKSNgD7aGdg@mail.gmail.com/
[4]https://lore.kernel.org/linux-mm/YoPHtHXzpK51F%2F1Z@carbon/
[5]https://lore.kernel.org/lkml/20231108065818.19932-1-link@vivo.com/


This patch (of 2):

We use the constants 0 and 200 in a few places in the mm code when
referring to the min and max swappiness.  This patch adds MIN_SWAPPINESS
and MAX_SWAPPINESS #defines to improve clarity.  There are no functional
changes.

Link: https://lkml.kernel.org/r/20240103164841.2800183-1-schatzberg.dan@gmail.com
Link: https://lkml.kernel.org/r/20240103164841.2800183-2-schatzberg.dan@gmail.com
Signed-off-by: Dan Schatzberg <schatzberg.dan@gmail.com>
Acked-by: David Rientjes <rientjes@google.com>
Acked-by: Chris Li <chrisl@kernel.org>
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Yue Zhao <findns94@gmail.com>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/swap.h |  2 ++
 mm/memcontrol.c      |  2 +-
 mm/vmscan.c          | 14 +++++++-------
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 4db00ddad26169..d210a5dc43013c 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -404,6 +404,8 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 
 #define MEMCG_RECLAIM_MAY_SWAP (1 << 1)
 #define MEMCG_RECLAIM_PROACTIVE (1 << 2)
+#define MIN_SWAPPINESS 0
+#define MAX_SWAPPINESS 200
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 						  unsigned long nr_pages,
 						  gfp_t gfp_mask,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index eb8684269906e9..dcfe5189e3be60 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4384,7 +4384,7 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 
-	if (val > 200)
+	if (val > MAX_SWAPPINESS)
 		return -EINVAL;
 
 	if (!mem_cgroup_is_root(memcg))
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1f139830b26f6c..44b92fb3d09775 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -183,7 +183,7 @@ struct scan_control {
 #endif
 
 /*
- * From 0 .. 200.  Higher means more swappy.
+ * From 0 .. MAX_SWAPPINESS.  Higher means more swappy.
  */
 int vm_swappiness = 60;
 
@@ -2404,7 +2404,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	ap = swappiness * (total_cost + 1);
 	ap /= anon_cost + 1;
 
-	fp = (200 - swappiness) * (total_cost + 1);
+	fp = (MAX_SWAPPINESS - swappiness) * (total_cost + 1);
 	fp /= file_cost + 1;
 
 	fraction[0] = ap;
@@ -4422,7 +4422,7 @@ static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx
 {
 	int type, tier;
 	struct ctrl_pos sp, pv;
-	int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness };
+	int gain[ANON_AND_FILE] = { swappiness, MAX_SWAPPINESS - swappiness };
 
 	/*
 	 * Compare the first tier of anon with that of file to determine which
@@ -4458,7 +4458,7 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw
 	/*
 	 * Try to make the obvious choice first. When anon and file are both
 	 * available from the same generation, interpret swappiness 1 as file
-	 * first and 200 as anon first.
+	 * first and MAX_SWAPPINESS as anon first.
 	 */
 	if (!swappiness)
 		type = LRU_GEN_FILE;
@@ -4466,7 +4466,7 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw
 		type = LRU_GEN_ANON;
 	else if (swappiness == 1)
 		type = LRU_GEN_FILE;
-	else if (swappiness == 200)
+	else if (swappiness == MAX_SWAPPINESS)
 		type = LRU_GEN_ANON;
 	else
 		type = get_type_to_scan(lruvec, swappiness, &tier);
@@ -5408,9 +5408,9 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
 
 	lruvec = get_lruvec(memcg, nid);
 
-	if (swappiness < 0)
+	if (swappiness < MIN_SWAPPINESS)
 		swappiness = get_swappiness(lruvec, sc);
-	else if (swappiness > 200)
+	else if (swappiness > MAX_SWAPPINESS)
 		goto done;
 
 	switch (cmd) {

From 3a92c45e4ba694381c46994f3fde0d8544a2088b Mon Sep 17 00:00:00 2001
From: Dan Schatzberg <schatzberg.dan@gmail.com>
Date: Wed, 3 Jan 2024 08:48:37 -0800
Subject: [PATCH 564/707] mm: add swappiness= arg to memory.reclaim

Allow proactive reclaimers to submit an additional swappiness=<val>
argument to memory.reclaim.  This overrides the global or per-memcg
swappiness setting for that reclaim attempt.

For example:

echo "2M swappiness=0" > /sys/fs/cgroup/memory.reclaim

will perform reclaim on the rootcg with a swappiness setting of 0 (no
swap) regardless of the vm.swappiness sysctl setting.

Userspace proactive reclaimers use the memory.reclaim interface to trigger
reclaim.  The memory.reclaim interface does not allow for any way to
effect the balance of file vs anon during proactive reclaim.  The only
approach is to adjust the vm.swappiness setting.  However, there are a few
reasons we look to control the balance of file vs anon during proactive
reclaim, separately from reactive reclaim:

* Swapout should be limited to manage SSD write endurance.  In near-OOM
  situations we are fine with lots of swap-out to avoid OOMs.  As these
  are typically rare events, they have relatively little impact on write
  endurance.  However, proactive reclaim runs continuously and so its
  impact on SSD write endurance is more significant.  Therefore it is
  desireable to control swap-out for proactive reclaim separately from
  reactive reclaim

* Some userspace OOM killers like systemd-oomd[1] support OOM killing on
  swap exhaustion.  This makes sense if the swap exhaustion is triggered
  due to reactive reclaim but less so if it is triggered due to proactive
  reclaim (e.g.  one could see OOMs when free memory is ample but anon is
  just particularly cold).  Therefore, it's desireable to have proactive
  reclaim reduce or stop swap-out before the threshold at which OOM
  killing occurs.

In the case of Meta's Senpai proactive reclaimer, we adjust vm.swappiness
before writes to memory.reclaim[2].  This has been in production for
nearly two years and has addressed our needs to control proactive vs
reactive reclaim behavior but is still not ideal for a number of reasons:

* vm.swappiness is a global setting, adjusting it can race/interfere
  with other system administration that wishes to control vm.swappiness.
  In our case, we need to disable Senpai before adjusting vm.swappiness.

* vm.swappiness is stateful - so a crash or restart of Senpai can leave
  a misconfigured setting.  This requires some additional management to
  record the "desired" setting and ensure Senpai always adjusts to it.

With this patch, we avoid these downsides of adjusting vm.swappiness
globally.

[1]https://www.freedesktop.org/software/systemd/man/latest/systemd-oomd.service.html
[2]https://github.com/facebookincubator/oomd/blob/main/src/oomd/plugins/Senpai.cpp#L585-L598

Link: https://lkml.kernel.org/r/20240103164841.2800183-3-schatzberg.dan@gmail.com
Signed-off-by: Dan Schatzberg <schatzberg.dan@gmail.com>
Suggested-by: Yosry Ahmed <yosryahmed@google.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: David Rientjes <rientjes@google.com>
Acked-by: Chris Li <chrisl@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Yue Zhao <findns94@gmail.com>
Cc: Zefan Li <lizefan.x@bytedance.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/cgroup-v2.rst | 18 ++++----
 include/linux/swap.h                    |  3 +-
 mm/memcontrol.c                         | 56 ++++++++++++++++++++-----
 mm/vmscan.c                             | 25 +++++++++--
 4 files changed, 80 insertions(+), 22 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 17e6e956515640..0270517ade47cf 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1296,17 +1296,10 @@ PAGE_SIZE multiple when read back.
 	This is a simple interface to trigger memory reclaim in the
 	target cgroup.
 
-	This file accepts a single key, the number of bytes to reclaim.
-	No nested keys are currently supported.
-
 	Example::
 
 	  echo "1G" > memory.reclaim
 
-	The interface can be later extended with nested keys to
-	configure the reclaim behavior. For example, specify the
-	type of memory to reclaim from (anon, file, ..).
-
 	Please note that the kernel can over or under reclaim from
 	the target cgroup. If less bytes are reclaimed than the
 	specified amount, -EAGAIN is returned.
@@ -1318,6 +1311,17 @@ PAGE_SIZE multiple when read back.
 	This means that the networking layer will not adapt based on
 	reclaim induced by memory.reclaim.
 
+The following nested keys are defined.
+
+	  ==========            ================================
+	  swappiness            Swappiness value to reclaim with
+	  ==========            ================================
+
+	Specifying a swappiness value instructs the kernel to perform
+	the reclaim with that swappiness value. Note that this has the
+	same semantics as vm.swappiness applied to memcg reclaim with
+	all the existing limitations and potential future extensions.
+
   memory.peak
 	A read-only single value file which exists on non-root
 	cgroups.
diff --git a/include/linux/swap.h b/include/linux/swap.h
index d210a5dc43013c..41e4b484bc346d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -409,7 +409,8 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 						  unsigned long nr_pages,
 						  gfp_t gfp_mask,
-						  unsigned int reclaim_options);
+						  unsigned int reclaim_options,
+						  int *swappiness);
 extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
 						gfp_t gfp_mask, bool noswap,
 						pg_data_t *pgdat,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index dcfe5189e3be60..9ae6a11ecbf2ed 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -52,6 +52,7 @@
 #include <linux/sort.h>
 #include <linux/fs.h>
 #include <linux/seq_file.h>
+#include <linux/parser.h>
 #include <linux/vmpressure.h>
 #include <linux/memremap.h>
 #include <linux/mm_inline.h>
@@ -2475,7 +2476,8 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
 		psi_memstall_enter(&pflags);
 		nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
 							gfp_mask,
-							MEMCG_RECLAIM_MAY_SWAP);
+							MEMCG_RECLAIM_MAY_SWAP,
+							NULL);
 		psi_memstall_leave(&pflags);
 	} while ((memcg = parent_mem_cgroup(memcg)) &&
 		 !mem_cgroup_is_root(memcg));
@@ -2781,7 +2783,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
 
 	psi_memstall_enter(&pflags);
 	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
-						    gfp_mask, reclaim_options);
+						    gfp_mask, reclaim_options, NULL);
 	psi_memstall_leave(&pflags);
 
 	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
@@ -3707,7 +3709,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
 		}
 
 		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
-					memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) {
+					memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, NULL)) {
 			ret = -EBUSY;
 			break;
 		}
@@ -3821,7 +3823,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
 			return -EINTR;
 
 		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
-						  MEMCG_RECLAIM_MAY_SWAP))
+						  MEMCG_RECLAIM_MAY_SWAP, NULL))
 			nr_retries--;
 	}
 
@@ -6787,7 +6789,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
 		}
 
 		reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
-					GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP);
+					GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL);
 
 		if (!reclaimed && !nr_retries--)
 			break;
@@ -6836,7 +6838,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
 
 		if (nr_reclaims) {
 			if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
-					GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP))
+					GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL))
 				nr_reclaims--;
 			continue;
 		}
@@ -6966,19 +6968,50 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
 	return nbytes;
 }
 
+enum {
+	MEMORY_RECLAIM_SWAPPINESS = 0,
+	MEMORY_RECLAIM_NULL,
+};
+
+static const match_table_t tokens = {
+	{ MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"},
+	{ MEMORY_RECLAIM_NULL, NULL },
+};
+
 static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
 			      size_t nbytes, loff_t off)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
 	unsigned int nr_retries = MAX_RECLAIM_RETRIES;
 	unsigned long nr_to_reclaim, nr_reclaimed = 0;
+	int swappiness = -1;
 	unsigned int reclaim_options;
-	int err;
+	char *old_buf, *start;
+	substring_t args[MAX_OPT_ARGS];
 
 	buf = strstrip(buf);
-	err = page_counter_memparse(buf, "", &nr_to_reclaim);
-	if (err)
-		return err;
+
+	old_buf = buf;
+	nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE;
+	if (buf == old_buf)
+		return -EINVAL;
+
+	buf = strstrip(buf);
+
+	while ((start = strsep(&buf, " ")) != NULL) {
+		if (!strlen(start))
+			continue;
+		switch (match_token(start, tokens, args)) {
+		case MEMORY_RECLAIM_SWAPPINESS:
+			if (match_int(&args[0], &swappiness))
+				return -EINVAL;
+			if (swappiness < MIN_SWAPPINESS || swappiness > MAX_SWAPPINESS)
+				return -EINVAL;
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
 
 	reclaim_options	= MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
 	while (nr_reclaimed < nr_to_reclaim) {
@@ -6997,7 +7030,8 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
 
 		reclaimed = try_to_free_mem_cgroup_pages(memcg,
 					min(nr_to_reclaim - nr_reclaimed, SWAP_CLUSTER_MAX),
-					GFP_KERNEL, reclaim_options);
+					GFP_KERNEL, reclaim_options,
+					swappiness == -1 ? NULL : &swappiness);
 
 		if (!reclaimed && !nr_retries--)
 			return -EAGAIN;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 44b92fb3d09775..895f03e2d9327b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -92,6 +92,11 @@ struct scan_control {
 	unsigned long	anon_cost;
 	unsigned long	file_cost;
 
+#ifdef CONFIG_MEMCG
+	/* Swappiness value for proactive reclaim. Always use sc_swappiness()! */
+	int *proactive_swappiness;
+#endif
+
 	/* Can active folios be deactivated as part of reclaim? */
 #define DEACTIVATE_ANON 1
 #define DEACTIVATE_FILE 2
@@ -227,6 +232,13 @@ static bool writeback_throttling_sane(struct scan_control *sc)
 #endif
 	return false;
 }
+
+static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg)
+{
+	if (sc->proactive && sc->proactive_swappiness)
+		return *sc->proactive_swappiness;
+	return mem_cgroup_swappiness(memcg);
+}
 #else
 static bool cgroup_reclaim(struct scan_control *sc)
 {
@@ -242,6 +254,11 @@ static bool writeback_throttling_sane(struct scan_control *sc)
 {
 	return true;
 }
+
+static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg)
+{
+	return READ_ONCE(vm_swappiness);
+}
 #endif
 
 static void set_task_reclaim_state(struct task_struct *task,
@@ -2328,7 +2345,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 	unsigned long anon_cost, file_cost, total_cost;
-	int swappiness = mem_cgroup_swappiness(memcg);
+	int swappiness = sc_swappiness(sc, memcg);
 	u64 fraction[ANON_AND_FILE];
 	u64 denominator = 0;	/* gcc */
 	enum scan_balance scan_balance;
@@ -2609,7 +2626,7 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
 	    mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
 		return 0;
 
-	return mem_cgroup_swappiness(memcg);
+	return sc_swappiness(sc, memcg);
 }
 
 static int get_nr_gens(struct lruvec *lruvec, int type)
@@ -6488,12 +6505,14 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 					   unsigned long nr_pages,
 					   gfp_t gfp_mask,
-					   unsigned int reclaim_options)
+					   unsigned int reclaim_options,
+					   int *swappiness)
 {
 	unsigned long nr_reclaimed;
 	unsigned int noreclaim_flag;
 	struct scan_control sc = {
 		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
+		.proactive_swappiness = swappiness,
 		.gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
 				(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
 		.reclaim_idx = MAX_NR_ZONES - 1,

From fd8a03639636bd0b544b4108f1c1b216cce87156 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 10 Oct 2023 15:55:49 +0100
Subject: [PATCH 565/707] bounds: support non-power-of-two CONFIG_NR_CPUS

ilog2() rounds down, so for example when PowerPC 85xx sets CONFIG_NR_CPUS
to 24, we will only allocate 4 bits to store the number of CPUs instead of
5.  Use bits_per() instead, which rounds up.  Found by code inspection.
The effect of this would probably be a misaccounting when doing NUMA
balancing, so to a user, it would only be a performance penalty.  The
effects may be more wide-spread; it's hard to tell.

Link: https://lkml.kernel.org/r/20231010145549.1244748-1-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Fixes: 90572890d202 ("mm: numa: Change page last {nid,pid} into {cpu,pid}")
Reviewed-by: Rik van Riel <riel@surriel.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/bounds.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/bounds.c b/kernel/bounds.c
index b529182e8b04fc..c5a9fcd2d62281 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -19,7 +19,7 @@ int main(void)
 	DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
 	DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
 #ifdef CONFIG_SMP
-	DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
+	DEFINE(NR_CPUS_BITS, bits_per(CONFIG_NR_CPUS));
 #endif
 	DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
 #ifdef CONFIG_LRU_GEN

From e31a7064b797812bca319e35af6f113d66bf0137 Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan@kernel.org>
Date: Tue, 9 Jan 2024 15:16:30 -0700
Subject: [PATCH 566/707] arch and include: update LLVM Phabricator links

reviews.llvm.org was LLVM's Phabricator instances for code review.  It has
been abandoned in favor of GitHub pull requests.  While the majority of
links in the kernel sources still work because of the work Fangrui has
done turning the dynamic Phabricator instance into a static archive, there
are some issues with that work, so preemptively convert all the links in
the kernel sources to point to the commit on GitHub.

Most of the commits have the corresponding differential review link in the
commit message itself so there should not be any loss of fidelity in the
relevant information.

Link: https://discourse.llvm.org/t/update-on-github-pull-requests/71540/172
Link: https://lkml.kernel.org/r/20240109-update-llvm-links-v1-2-eb09b59db071@kernel.org
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: Conor Dooley <conor.dooley@microchip.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Acked-by: Fangrui Song <maskray@google.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Mykola Lysenko <mykolal@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm64/Kconfig              | 4 ++--
 arch/riscv/Kconfig              | 2 +-
 arch/riscv/include/asm/ftrace.h | 2 +-
 include/linux/compiler-clang.h  | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index aa7c1d43513968..5a8acca4dbf495 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -382,7 +382,7 @@ config BROKEN_GAS_INST
 config BUILTIN_RETURN_ADDRESS_STRIPS_PAC
 	bool
 	# Clang's __builtin_return_adddress() strips the PAC since 12.0.0
-	# https://reviews.llvm.org/D75044
+	# https://github.com/llvm/llvm-project/commit/2a96f47c5ffca84cd774ad402cacd137f4bf45e2
 	default y if CC_IS_CLANG && (CLANG_VERSION >= 120000)
 	# GCC's __builtin_return_address() strips the PAC since 11.1.0,
 	# and this was backported to 10.2.0, 9.4.0, 8.5.0, but not earlier
@@ -2222,7 +2222,7 @@ config STACKPROTECTOR_PER_TASK
 
 config UNWIND_PATCH_PAC_INTO_SCS
 	bool "Enable shadow call stack dynamically using code patching"
-	# needs Clang with https://reviews.llvm.org/D111780 incorporated
+	# needs Clang with https://github.com/llvm/llvm-project/commit/de07cde67b5d205d58690be012106022aea6d2b3 incorporated
 	depends on CC_IS_CLANG && CLANG_VERSION >= 150000
 	depends on ARM64_PTR_AUTH_KERNEL && CC_HAS_BRANCH_PROT_PAC_RET
 	depends on SHADOW_CALL_STACK
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index bffbd869a06828..69d24f51392206 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -312,7 +312,7 @@ config AS_HAS_INSN
 	def_bool $(as-instr,.insn r 51$(comma) 0$(comma) 0$(comma) t0$(comma) t0$(comma) zero)
 
 config AS_HAS_OPTION_ARCH
-	# https://reviews.llvm.org/D123515
+	# https://github.com/llvm/llvm-project/commit/9e8ed3403c191ab9c4903e8eeb8f732ff8a43cb4
 	def_bool y
 	depends on $(as-instr, .option arch$(comma) +m)
 	depends on !$(as-instr, .option arch$(comma) -i)
diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h
index 32917212295234..06874fb1311e5e 100644
--- a/arch/riscv/include/asm/ftrace.h
+++ b/arch/riscv/include/asm/ftrace.h
@@ -15,7 +15,7 @@
 
 /*
  * Clang prior to 13 had "mcount" instead of "_mcount":
- * https://reviews.llvm.org/D98881
+ * https://github.com/llvm/llvm-project/commit/ef58ae86ba778ed7d01cd3f6bd6d08f943abab44
  */
 #if defined(CONFIG_CC_IS_GCC) || CONFIG_CLANG_VERSION >= 130000
 #define MCOUNT_NAME _mcount
diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index ddab1ef22beef3..f0a47afef12581 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -9,7 +9,7 @@
  * Clang prior to 17 is being silly and considers many __cleanup() variables
  * as unused (because they are, their sole purpose is to go out of scope).
  *
- * https://reviews.llvm.org/D152180
+ * https://github.com/llvm/llvm-project/commit/877210faa447f4cc7db87812f8ed80e398fedd61
  */
 #undef __cleanup
 #define __cleanup(func) __maybe_unused __attribute__((__cleanup__(func)))

From 983fac0ad90f86dcc52496b10c04495b6e20c735 Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan@kernel.org>
Date: Tue, 9 Jan 2024 15:16:31 -0700
Subject: [PATCH 567/707] treewide: update LLVM Bugzilla links

LLVM moved their issue tracker from their own Bugzilla instance to GitHub
issues.  While all of the links are still valid, they may not necessarily
show the most up to date information around the issues, as all updates
will occur on GitHub, not Bugzilla.

Another complication is that the Bugzilla issue number is not always the
same as the GitHub issue number.  Thankfully, LLVM maintains this mapping
through two shortlinks:

  https://llvm.org/bz<num> -> https://bugs.llvm.org/show_bug.cgi?id=<num>
  https://llvm.org/pr<num> -> https://github.com/llvm/llvm-project/issues/<mapped_num>

Switch all "https://bugs.llvm.org/show_bug.cgi?id=<num>" links to the
"https://llvm.org/pr<num>" shortlink so that the links show the most up to
date information.  Each migrated issue links back to the Bugzilla entry,
so there should be no loss of fidelity of information here.

Link: https://lkml.kernel.org/r/20240109-update-llvm-links-v1-3-eb09b59db071@kernel.org
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Acked-by: Fangrui Song <maskray@google.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Mykola Lysenko <mykolal@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/powerpc/Makefile                           | 4 ++--
 arch/powerpc/kvm/book3s_hv_nested.c             | 2 +-
 arch/s390/include/asm/ftrace.h                  | 2 +-
 arch/x86/power/Makefile                         | 2 +-
 crypto/blake2b_generic.c                        | 2 +-
 drivers/firmware/efi/libstub/Makefile           | 2 +-
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c        | 2 +-
 drivers/media/test-drivers/vicodec/codec-fwht.c | 2 +-
 drivers/regulator/Kconfig                       | 2 +-
 include/asm-generic/vmlinux.lds.h               | 2 +-
 lib/Kconfig.kasan                               | 2 +-
 lib/raid6/Makefile                              | 2 +-
 lib/stackinit_kunit.c                           | 2 +-
 mm/slab_common.c                                | 2 +-
 net/bridge/br_multicast.c                       | 2 +-
 security/Kconfig                                | 2 +-
 16 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 051247027da0ba..457cee9b03ee04 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -144,11 +144,11 @@ CFLAGS-$(CONFIG_PPC64)	+= $(call cc-option,-mno-pointers-to-nested-functions)
 CFLAGS-$(CONFIG_PPC64)	+= $(call cc-option,-mlong-double-128)
 
 # Clang unconditionally reserves r2 on ppc32 and does not support the flag
-# https://bugs.llvm.org/show_bug.cgi?id=39555
+# https://llvm.org/pr39555
 CFLAGS-$(CONFIG_PPC32)	:= $(call cc-option, -ffixed-r2)
 
 # Clang doesn't support -mmultiple / -mno-multiple
-# https://bugs.llvm.org/show_bug.cgi?id=39556
+# https://llvm.org/pr39556
 CFLAGS-$(CONFIG_PPC32)	+= $(call cc-option, $(MULTIPLEWORD))
 
 CFLAGS-$(CONFIG_PPC32)	+= $(call cc-option,-mno-readonly-in-sdata)
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
index 5c375ec1a3c608..05f5220960c63b 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -55,7 +55,7 @@ void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
 	hr->dawrx1 = vcpu->arch.dawrx1;
 }
 
-/* Use noinline_for_stack due to https://bugs.llvm.org/show_bug.cgi?id=49610 */
+/* Use noinline_for_stack due to https://llvm.org/pr49610 */
 static noinline_for_stack void byteswap_pt_regs(struct pt_regs *regs)
 {
 	unsigned long *addr = (unsigned long *) regs;
diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h
index 5a82b08f03cd3e..621f23d5ae30a6 100644
--- a/arch/s390/include/asm/ftrace.h
+++ b/arch/s390/include/asm/ftrace.h
@@ -9,7 +9,7 @@
 #ifndef __ASSEMBLY__
 
 #ifdef CONFIG_CC_IS_CLANG
-/* https://bugs.llvm.org/show_bug.cgi?id=41424 */
+/* https://llvm.org/pr41424 */
 #define ftrace_return_address(n) 0UL
 #else
 #define ftrace_return_address(n) __builtin_return_address(n)
diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile
index 379777572bc9fe..e0cd7afd53022a 100644
--- a/arch/x86/power/Makefile
+++ b/arch/x86/power/Makefile
@@ -5,7 +5,7 @@
 CFLAGS_cpu.o	:= -fno-stack-protector
 
 # Clang may incorrectly inline functions with stack protector enabled into
-# __restore_processor_state(): https://bugs.llvm.org/show_bug.cgi?id=47479
+# __restore_processor_state(): https://llvm.org/pr47479
 CFLAGS_REMOVE_cpu.o := $(CC_FLAGS_LTO)
 
 obj-$(CONFIG_PM_SLEEP)		+= cpu.o
diff --git a/crypto/blake2b_generic.c b/crypto/blake2b_generic.c
index 6704c035588967..32e380b714b6cc 100644
--- a/crypto/blake2b_generic.c
+++ b/crypto/blake2b_generic.c
@@ -102,7 +102,7 @@ static void blake2b_compress_one_generic(struct blake2b_state *S,
 	ROUND(10);
 	ROUND(11);
 #ifdef CONFIG_CC_IS_CLANG
-#pragma nounroll /* https://bugs.llvm.org/show_bug.cgi?id=45803 */
+#pragma nounroll /* https://llvm.org/pr45803 */
 #endif
 	for (i = 0; i < 8; ++i)
 		S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];
diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
index 06964a3c130f6a..a223bd10564b1b 100644
--- a/drivers/firmware/efi/libstub/Makefile
+++ b/drivers/firmware/efi/libstub/Makefile
@@ -105,7 +105,7 @@ lib-y				:= $(patsubst %.o,%.stub.o,$(lib-y))
 # Even when -mbranch-protection=none is set, Clang will generate a
 # .note.gnu.property for code-less object files (like lib/ctype.c),
 # so work around this by explicitly removing the unwanted section.
-# https://bugs.llvm.org/show_bug.cgi?id=46480
+# https://llvm.org/pr46480
 STUBCOPY_FLAGS-y		+= --remove-section=.note.gnu.property
 
 STUBCOPY_RELOC-$(CONFIG_X86_32)	:= R_386_32
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 2d688dca26bedb..78a2773b74f2f0 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -610,7 +610,7 @@ static uint32_t sdma_v4_4_2_rb_cntl(struct amdgpu_ring *ring, uint32_t rb_cntl)
 	/* Set ring buffer size in dwords */
 	uint32_t rb_bufsz = order_base_2(ring->ring_size / 4);
 
-	barrier(); /* work around https://bugs.llvm.org/show_bug.cgi?id=42576 */
+	barrier(); /* work around https://llvm.org/pr42576 */
 	rb_cntl = REG_SET_FIELD(rb_cntl, SDMA_GFX_RB_CNTL, RB_SIZE, rb_bufsz);
 #ifdef __BIG_ENDIAN
 	rb_cntl = REG_SET_FIELD(rb_cntl, SDMA_GFX_RB_CNTL, RB_SWAP_ENABLE, 1);
diff --git a/drivers/media/test-drivers/vicodec/codec-fwht.c b/drivers/media/test-drivers/vicodec/codec-fwht.c
index 1ce682e1b85c32..fd75457d03b202 100644
--- a/drivers/media/test-drivers/vicodec/codec-fwht.c
+++ b/drivers/media/test-drivers/vicodec/codec-fwht.c
@@ -49,7 +49,7 @@ static const uint8_t zigzag[64] = {
 
 /*
  * noinline_for_stack to work around
- * https://bugs.llvm.org/show_bug.cgi?id=38809
+ * https://llvm.org/pr38809
  */
 static int noinline_for_stack
 rlc(const s16 *in, __be16 *output, int blocktype)
diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig
index 550145f82726e9..7db0a29b5b8dcd 100644
--- a/drivers/regulator/Kconfig
+++ b/drivers/regulator/Kconfig
@@ -288,7 +288,7 @@ config REGULATOR_CROS_EC
 config REGULATOR_DA903X
 	tristate "Dialog Semiconductor DA9030/DA9034 regulators"
 	depends on PMIC_DA903X
-	depends on !CC_IS_CLANG # https://bugs.llvm.org/show_bug.cgi?id=38789
+	depends on !CC_IS_CLANG # https://llvm.org/pr38789
 	help
 	  Say y here to support the BUCKs and LDOs regulators found on
 	  Dialog Semiconductor DA9030/DA9034 PMIC.
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 5dd3a61d673d4f..f7749d0f2562f1 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -984,7 +984,7 @@
  * -fsanitize=thread produce unwanted sections (.eh_frame
  * and .init_array.*), but CONFIG_CONSTRUCTORS wants to
  * keep any .init_array.* sections.
- * https://bugs.llvm.org/show_bug.cgi?id=46478
+ * https://llvm.org/pr46478
  */
 #ifdef CONFIG_UNWIND_TABLES
 #define DISCARD_EH_FRAME
diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan
index e6eda054ab275f..98016e137b7f09 100644
--- a/lib/Kconfig.kasan
+++ b/lib/Kconfig.kasan
@@ -158,7 +158,7 @@ config KASAN_STACK
 	  out-of-bounds bugs in stack variables.
 
 	  With Clang, stack instrumentation has a problem that causes excessive
-	  stack usage, see https://bugs.llvm.org/show_bug.cgi?id=38809. Thus,
+	  stack usage, see https://llvm.org/pr38809. Thus,
 	  with Clang, this option is deemed unsafe.
 
 	  This option is always disabled when compile-testing with Clang to
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index 1c5420ff254e84..385a94aa0b999b 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -21,7 +21,7 @@ altivec_flags += -isystem $(shell $(CC) -print-file-name=include)
 ifdef CONFIG_CC_IS_CLANG
 # clang ppc port does not yet support -maltivec when -msoft-float is
 # enabled. A future release of clang will resolve this
-# https://bugs.llvm.org/show_bug.cgi?id=31177
+# https://llvm.org/pr31177
 CFLAGS_REMOVE_altivec1.o  += -msoft-float
 CFLAGS_REMOVE_altivec2.o  += -msoft-float
 CFLAGS_REMOVE_altivec4.o  += -msoft-float
diff --git a/lib/stackinit_kunit.c b/lib/stackinit_kunit.c
index 05947a2feb93c0..7a10e1d1725817 100644
--- a/lib/stackinit_kunit.c
+++ b/lib/stackinit_kunit.c
@@ -404,7 +404,7 @@ static noinline int leaf_switch_2_none(unsigned long sp, bool fill,
  * These are expected to fail for most configurations because neither
  * GCC nor Clang have a way to perform initialization of variables in
  * non-code areas (i.e. in a switch statement before the first "case").
- * https://bugs.llvm.org/show_bug.cgi?id=44916
+ * https://llvm.org/pr44916
  */
 DEFINE_TEST_DRIVER(switch_1_none, uint64_t, SCALAR, ALWAYS_FAIL);
 DEFINE_TEST_DRIVER(switch_2_none, uint64_t, SCALAR, ALWAYS_FAIL);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 238293b1dbe14b..954af676d79ee8 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -651,7 +651,7 @@ static struct kmem_cache *__init create_kmalloc_cache(const char *name,
 
 struct kmem_cache *
 kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init =
-{ /* initialization for https://bugs.llvm.org/show_bug.cgi?id=42570 */ };
+{ /* initialization for https://llvm.org/pr42570 */ };
 EXPORT_SYMBOL(kmalloc_caches);
 
 #ifdef CONFIG_RANDOM_KMALLOC_CACHES
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index d7d021af102981..523f72ac9633d2 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -5043,7 +5043,7 @@ void br_multicast_uninit_stats(struct net_bridge *br)
 	free_percpu(br->mcast_stats);
 }
 
-/* noinline for https://bugs.llvm.org/show_bug.cgi?id=45802#c9 */
+/* noinline for https://llvm.org/pr45802#c9 */
 static noinline_for_stack void mcast_stats_add_dir(u64 *dst, u64 *src)
 {
 	dst[BR_MCAST_DIR_RX] += src[BR_MCAST_DIR_RX];
diff --git a/security/Kconfig b/security/Kconfig
index 52c9af08ad35d3..606a87c29a0170 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -142,7 +142,7 @@ config HARDENED_USERCOPY
 config FORTIFY_SOURCE
 	bool "Harden common str/mem functions against buffer overflows"
 	depends on ARCH_HAS_FORTIFY_SOURCE
-	# https://bugs.llvm.org/show_bug.cgi?id=41459
+	# https://llvm.org/pr41459
 	depends on !CC_IS_CLANG || CLANG_VERSION >= 120001
 	# https://github.com/llvm/llvm-project/issues/53645
 	depends on !CC_IS_CLANG || !X86_32

From 28dab03cca2028c790d15d7ad61c4b30c7f5d0d7 Mon Sep 17 00:00:00 2001
From: Wen Yang <wenyang.linux@foxmail.com>
Date: Mon, 8 Jan 2024 23:51:32 +0800
Subject: [PATCH 568/707] selftests: add eventfd selftests

This adds the promised selftest for eventfd.  It will verify the flags of
eventfd2, including EFD_CLOEXEC, EFD_NONBLOCK and EFD_SEMAPHORE.

Link: https://lkml.kernel.org/r/tencent_3C9A298878D22B5D8F79DC2FEE99BB4A8F05@qq.com
Signed-off-by: Wen Yang <wenyang.linux@foxmail.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Javier Martinez Canillas <javierm@redhat.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Pengfei Xu <pengfei.xu@intel.com>
Cc: Miklos Szeredi <mszeredi@redhat.com>
Cc: Andrei Vagin <avagin@google.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 .../selftests/filesystems/eventfd/.gitignore  |   2 +
 .../selftests/filesystems/eventfd/Makefile    |   7 +
 .../filesystems/eventfd/eventfd_test.c        | 186 ++++++++++++++++++
 3 files changed, 195 insertions(+)
 create mode 100644 tools/testing/selftests/filesystems/eventfd/.gitignore
 create mode 100644 tools/testing/selftests/filesystems/eventfd/Makefile
 create mode 100644 tools/testing/selftests/filesystems/eventfd/eventfd_test.c

diff --git a/tools/testing/selftests/filesystems/eventfd/.gitignore b/tools/testing/selftests/filesystems/eventfd/.gitignore
new file mode 100644
index 00000000000000..483faf59fe4adb
--- /dev/null
+++ b/tools/testing/selftests/filesystems/eventfd/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+eventfd_test
diff --git a/tools/testing/selftests/filesystems/eventfd/Makefile b/tools/testing/selftests/filesystems/eventfd/Makefile
new file mode 100644
index 00000000000000..0a8e3910df1572
--- /dev/null
+++ b/tools/testing/selftests/filesystems/eventfd/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+
+CFLAGS += $(KHDR_INCLUDES)
+LDLIBS += -lpthread
+TEST_GEN_PROGS := eventfd_test
+
+include ../../lib.mk
diff --git a/tools/testing/selftests/filesystems/eventfd/eventfd_test.c b/tools/testing/selftests/filesystems/eventfd/eventfd_test.c
new file mode 100644
index 00000000000000..f142a137526cda
--- /dev/null
+++ b/tools/testing/selftests/filesystems/eventfd/eventfd_test.c
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <asm/unistd.h>
+#include <linux/time_types.h>
+#include <unistd.h>
+#include <assert.h>
+#include <signal.h>
+#include <pthread.h>
+#include <sys/epoll.h>
+#include <sys/eventfd.h>
+#include "../../kselftest_harness.h"
+
+struct error {
+	int  code;
+	char msg[512];
+};
+
+static int error_set(struct error *err, int code, const char *fmt, ...)
+{
+	va_list args;
+	int r;
+
+	if (code == 0 || !err || err->code != 0)
+		return code;
+
+	err->code = code;
+	va_start(args, fmt);
+	r = vsnprintf(err->msg, sizeof(err->msg), fmt, args);
+	assert((size_t)r < sizeof(err->msg));
+	va_end(args);
+
+	return code;
+}
+
+static inline int sys_eventfd2(unsigned int count, int flags)
+{
+	return syscall(__NR_eventfd2, count, flags);
+}
+
+TEST(eventfd01)
+{
+	int fd, flags;
+
+	fd = sys_eventfd2(0, 0);
+	ASSERT_GE(fd, 0);
+
+	flags = fcntl(fd, F_GETFL);
+	// since the kernel automatically added O_RDWR.
+	EXPECT_EQ(flags, O_RDWR);
+
+	close(fd);
+}
+
+TEST(eventfd02)
+{
+	int fd, flags;
+
+	fd = sys_eventfd2(0, EFD_CLOEXEC);
+	ASSERT_GE(fd, 0);
+
+	flags = fcntl(fd, F_GETFD);
+	ASSERT_GT(flags, -1);
+	EXPECT_EQ(flags, FD_CLOEXEC);
+
+	close(fd);
+}
+
+TEST(eventfd03)
+{
+	int fd, flags;
+
+	fd = sys_eventfd2(0, EFD_NONBLOCK);
+	ASSERT_GE(fd, 0);
+
+	flags = fcntl(fd, F_GETFL);
+	ASSERT_GT(flags, -1);
+	EXPECT_EQ(flags & EFD_NONBLOCK, EFD_NONBLOCK);
+	EXPECT_EQ(flags & O_RDWR, O_RDWR);
+
+	close(fd);
+}
+
+TEST(eventfd04)
+{
+	int fd, flags;
+
+	fd = sys_eventfd2(0, EFD_CLOEXEC|EFD_NONBLOCK);
+	ASSERT_GE(fd, 0);
+
+	flags = fcntl(fd, F_GETFL);
+	ASSERT_GT(flags, -1);
+	EXPECT_EQ(flags & EFD_NONBLOCK, EFD_NONBLOCK);
+	EXPECT_EQ(flags & O_RDWR, O_RDWR);
+
+	flags = fcntl(fd, F_GETFD);
+	ASSERT_GT(flags, -1);
+	EXPECT_EQ(flags, FD_CLOEXEC);
+
+	close(fd);
+}
+
+static inline void trim_newline(char *str)
+{
+	char *pos = strrchr(str, '\n');
+
+	if (pos)
+		*pos = '\0';
+}
+
+static int verify_fdinfo(int fd, struct error *err, const char *prefix,
+		size_t prefix_len, const char *expect, ...)
+{
+	char buffer[512] = {0, };
+	char path[512] = {0, };
+	va_list args;
+	FILE *f;
+	char *line = NULL;
+	size_t n = 0;
+	int found = 0;
+	int r;
+
+	va_start(args, expect);
+	r = vsnprintf(buffer, sizeof(buffer), expect, args);
+	assert((size_t)r < sizeof(buffer));
+	va_end(args);
+
+	snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", fd);
+	f = fopen(path, "re");
+	if (!f)
+		return error_set(err, -1, "fdinfo open failed for %d", fd);
+
+	while (getline(&line, &n, f) != -1) {
+		char *val;
+
+		if (strncmp(line, prefix, prefix_len))
+			continue;
+
+		found = 1;
+
+		val = line + prefix_len;
+		r = strcmp(val, buffer);
+		if (r != 0) {
+			trim_newline(line);
+			trim_newline(buffer);
+			error_set(err, -1, "%s '%s' != '%s'",
+				  prefix, val, buffer);
+		}
+		break;
+	}
+
+	free(line);
+	fclose(f);
+
+	if (found == 0)
+		return error_set(err, -1, "%s not found for fd %d",
+				 prefix, fd);
+
+	return 0;
+}
+
+TEST(eventfd05)
+{
+	struct error err = {0};
+	int fd, ret;
+
+	fd = sys_eventfd2(0, EFD_SEMAPHORE);
+	ASSERT_GE(fd, 0);
+
+	ret = fcntl(fd, F_GETFL);
+	ASSERT_GT(ret, -1);
+	EXPECT_EQ(ret & O_RDWR, O_RDWR);
+
+	// The semaphore could only be obtained from fdinfo.
+	ret = verify_fdinfo(fd, &err, "eventfd-semaphore: ", 19, "1\n");
+	if (ret != 0)
+		ksft_print_msg("eventfd-semaphore check failed, msg: %s\n",
+				err.msg);
+	EXPECT_EQ(ret, 0);
+
+	close(fd);
+}
+
+TEST_HARNESS_MAIN

From bdb59f8a69909f583b41d6363ce70a6e50d52b85 Mon Sep 17 00:00:00 2001
From: Pierre Gondois <pierre.gondois@arm.com>
Date: Thu, 4 Jan 2024 17:49:33 +0100
Subject: [PATCH 569/707] list: add hlist_count_nodes()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a generic hlist_count_nodes() function and use it in two drivers.


This patch (of 3):

Add a function to count nodes in a hlist.  hlist_count_nodes() is similar
to list_count_nodes().

Link: https://lkml.kernel.org/r/20240104164937.424320-1-pierre.gondois@arm.com
Link: https://lkml.kernel.org/r/20240104164937.424320-2-pierre.gondois@arm.com
Signed-off-by: Pierre Gondois <pierre.gondois@arm.com>
Reviewed-by: Carlos Llamas <cmllamas@google.com>
Acked-by: Coly Li <colyli@suse.de>
Acked-by: Marco Elver <elver@google.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Arve Hjønnevåg <arve@android.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jani Nikula <jani.nikula@intel.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Martijn Coenen <maco@android.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Todd Kjos <tkjos@android.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/list.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/include/linux/list.h b/include/linux/list.h
index 059aa1fff41e9c..523b7c4d000a1f 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -1195,4 +1195,19 @@ static inline void hlist_splice_init(struct hlist_head *from,
 	     pos && ({ n = pos->member.next; 1; });			\
 	     pos = hlist_entry_safe(n, typeof(*pos), member))
 
+/**
+ * hlist_count_nodes - count nodes in the hlist
+ * @head:	the head for your hlist.
+ */
+static inline size_t hlist_count_nodes(struct hlist_head *head)
+{
+	struct hlist_node *pos;
+	size_t count = 0;
+
+	hlist_for_each(pos, head)
+		count++;
+
+	return count;
+}
+
 #endif

From 4490487dd154734cab35578e1e741cf134ff5a30 Mon Sep 17 00:00:00 2001
From: Pierre Gondois <pierre.gondois@arm.com>
Date: Thu, 4 Jan 2024 17:49:34 +0100
Subject: [PATCH 570/707] binder: use of hlist_count_nodes()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make use of the newly added hlist_count_nodes().

Link: https://lkml.kernel.org/r/20240104164937.424320-3-pierre.gondois@arm.com
Signed-off-by: Pierre Gondois <pierre.gondois@arm.com>
Acked-by: Carlos Llamas <cmllamas@google.com>
Acked-by: Marco Elver <elver@google.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Arve Hjønnevåg <arve@android.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Coly Li <colyli@suse.de>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jani Nikula <jani.nikula@intel.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Martijn Coenen <maco@android.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Todd Kjos <tkjos@android.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/android/binder.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 8dd23b19e99731..5a4f8d7aa05169 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -6076,9 +6076,7 @@ static void print_binder_node_nilocked(struct seq_file *m,
 	struct binder_work *w;
 	int count;
 
-	count = 0;
-	hlist_for_each_entry(ref, &node->refs, node_entry)
-		count++;
+	count = hlist_count_nodes(&node->refs);
 
 	seq_printf(m, "  node %d: u%016llx c%016llx hs %d hw %d ls %d lw %d is %d iw %d tr %d",
 		   node->debug_id, (u64)node->ptr, (u64)node->cookie,

From 7a2af157898260b4a323581a92a3b646dc2affad Mon Sep 17 00:00:00 2001
From: Pierre Gondois <pierre.gondois@arm.com>
Date: Thu, 4 Jan 2024 17:49:35 +0100
Subject: [PATCH 571/707] bcache: use of hlist_count_nodes()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make use of the newly added hlist_count_nodes().

Link: https://lkml.kernel.org/r/20240104164937.424320-4-pierre.gondois@arm.com
Signed-off-by: Pierre Gondois <pierre.gondois@arm.com>
Acked-by: Coly Li <colyli@suse.de>
Acked-by: Marco Elver <elver@google.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Arve Hjønnevåg <arve@android.com>
Cc: Carlos Llamas <cmllamas@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jani Nikula <jani.nikula@intel.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Martijn Coenen <maco@android.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Todd Kjos <tkjos@android.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/md/bcache/sysfs.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index a438efb660699b..6956beb55326f5 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -702,13 +702,7 @@ static unsigned int bch_cache_max_chain(struct cache_set *c)
 	for (h = c->bucket_hash;
 	     h < c->bucket_hash + (1 << BUCKET_HASH_BITS);
 	     h++) {
-		unsigned int i = 0;
-		struct hlist_node *p;
-
-		hlist_for_each(p, h)
-			i++;
-
-		ret = max(ret, i);
+		ret = max(ret, hlist_count_nodes(h));
 	}
 
 	mutex_unlock(&c->bucket_lock);

From 380111fbb447b4ad1dc4658a0741588e95179a41 Mon Sep 17 00:00:00 2001
From: Yongzhen Zhang <zhangyongzhen@kylinos.cn>
Date: Mon, 8 Jan 2024 09:56:04 +0800
Subject: [PATCH 572/707] ocfs2: Spelling fix

Modify reques to request in the comment.

Link: https://lkml.kernel.org/r/20240108015604.38377-1-zhangyongzhen@kylinos.cn
Signed-off-by: Yongzhen Zhang <zhangyongzhen@kylinos.cn>
Acked-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Gang He <ghe@suse.com>
Cc: Jun Piao <piaojun@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/ocfs2/dlmglue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 64a6ef638495c2..cb40cafbc06237 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1615,7 +1615,7 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb,
 unlock:
 	lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
 
-	/* ocfs2_unblock_lock reques on seeing OCFS2_LOCK_UPCONVERT_FINISHING */
+	/* ocfs2_unblock_lock request on seeing OCFS2_LOCK_UPCONVERT_FINISHING */
 	kick_dc = (lockres->l_flags & OCFS2_LOCK_BLOCKED);
 
 	spin_unlock_irqrestore(&lockres->l_lock, flags);

From 321d96d70dd5dea785e5ebde8e3a0cb4bbd63c55 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sun, 7 Jan 2024 14:01:55 -0800
Subject: [PATCH 573/707] lib/win_minmax: fix header comments

Don't use "/**" kernel-doc comment marker for non-kernel-doc
comment.

Correct the filename but omit the path since we know where it is
and it could change (but not likely).

Link: https://lkml.kernel.org/r/20240107220155.29013-1-rdunlap@infradead.org
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/win_minmax.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/win_minmax.h b/include/linux/win_minmax.h
index 4ca2842d2842d0..6a5bb052fcc27f 100644
--- a/include/linux/win_minmax.h
+++ b/include/linux/win_minmax.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/**
- * lib/minmax.c: windowed min/max tracker by Kathleen Nichols.
+/*
+ * win_minmax.h: windowed min/max tracker by Kathleen Nichols.
  *
  */
 #ifndef MINMAX_H

From 13f86266e3cd77a19ebabab9f28be63b566eaaee Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Sun, 7 Jan 2024 17:16:41 +0800
Subject: [PATCH 574/707] panic: suppress gnu_printf warning

with GCC 13.2.1 and W=1, there's compiling warning like this:

kernel/panic.c: In function `__warn':
kernel/panic.c:676:17: warning: function `__warn' might be a candidate for `gnu_printf' format attribute [-Wsuggest-attribute=format]
  676 |                 vprintk(args->fmt, args->args);
      |                 ^~~~~~~

The normal __printf(x,y) adding can't fix it. So add workaround which
disables -Wsuggest-attribute=format to mute it.

Link: https://lkml.kernel.org/r/20240107091641.579849-1-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/panic.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/kernel/panic.c b/kernel/panic.c
index 2807639aab51d1..d49b68184c563e 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -666,8 +666,13 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
 		pr_warn("WARNING: CPU: %d PID: %d at %pS\n",
 			raw_smp_processor_id(), current->pid, caller);
 
+#pragma GCC diagnostic push
+#ifndef __clang__
+#pragma GCC diagnostic ignored "-Wsuggest-attribute=format"
+#endif
 	if (args)
 		vprintk(args->fmt, args->args);
+#pragma GCC diagnostic pop
 
 	print_modules();
 

From 74f4422df6e27efcf42e7ffef9340fc3c0b7f402 Mon Sep 17 00:00:00 2001
From: Kuan-Wei Chiu <visitorckw@gmail.com>
Date: Wed, 10 Jan 2024 16:12:12 +0800
Subject: [PATCH 575/707] lib min_heap: optimize number of calls to
 min_heapify()

Patch series "lib min_heap: Min heap optimizations".

The purpose of this patch series is to enhance the existing min heap
implementation.  The optimization focuses on both the heap construction
process and the number of comparisons made during the heapify operation.


This patch (of 2):

Improve the heap construction process by reducing unnecessary heapify
operations.  Specifically, adjust the starting condition from n / 2 to n /
2 - 1 in the loop that iterates over all non-leaf elements.

Link: https://lkml.kernel.org/r/20240110081213.2289636-1-visitorckw@gmail.com
Link: https://lkml.kernel.org/r/20240110081213.2289636-2-visitorckw@gmail.com
Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
Acked-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/min_heap.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h
index 44077837385f89..18a581310eb350 100644
--- a/include/linux/min_heap.h
+++ b/include/linux/min_heap.h
@@ -70,7 +70,7 @@ void min_heapify_all(struct min_heap *heap,
 {
 	int i;
 
-	for (i = heap->nr / 2; i >= 0; i--)
+	for (i = heap->nr / 2 - 1; i >= 0; i--)
 		min_heapify(heap, i, func);
 }
 

From c145b97f2dcb45ad32f59c559c624c55a1c08069 Mon Sep 17 00:00:00 2001
From: Kuan-Wei Chiu <visitorckw@gmail.com>
Date: Wed, 10 Jan 2024 16:12:13 +0800
Subject: [PATCH 576/707] lib min_heap: optimize number of comparisons in
 min_heapify()

Optimize the min_heapify() function, resulting in a significant reduction
of approximately 50% in the number of comparisons for large random inputs,
while maintaining identical results.

The current implementation performs two comparisons per level to identify
the minimum among three elements.  In contrast, the proposed bottom-up
variation uses only one comparison per level to assess two children until
reaching the leaves.  Then, it sifts up until the correct position is
determined.

Typically, the process of sifting down proceeds to the leaf level,
resulting in O(1) secondary comparisons instead of log2(n).  This
optimization significantly reduces the number of costly indirect function
calls and improves overall performance.

Link: https://lkml.kernel.org/r/20240110081213.2289636-3-visitorckw@gmail.com
Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
Acked-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/min_heap.h | 42 +++++++++++++++++++++-------------------
 1 file changed, 22 insertions(+), 20 deletions(-)

diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h
index 18a581310eb350..d52daf45861b9a 100644
--- a/include/linux/min_heap.h
+++ b/include/linux/min_heap.h
@@ -35,31 +35,33 @@ static __always_inline
 void min_heapify(struct min_heap *heap, int pos,
 		const struct min_heap_callbacks *func)
 {
-	void *left, *right, *parent, *smallest;
+	void *left, *right;
 	void *data = heap->data;
+	void *root = data + pos * func->elem_size;
+	int i = pos, j;
 
+	/* Find the sift-down path all the way to the leaves. */
 	for (;;) {
-		if (pos * 2 + 1 >= heap->nr)
+		if (i * 2 + 2 >= heap->nr)
 			break;
+		left = data + (i * 2 + 1) * func->elem_size;
+		right = data + (i * 2 + 2) * func->elem_size;
+		i = func->less(left, right) ? i * 2 + 1 : i * 2 + 2;
+	}
 
-		left = data + ((pos * 2 + 1) * func->elem_size);
-		parent = data + (pos * func->elem_size);
-		smallest = parent;
-		if (func->less(left, smallest))
-			smallest = left;
-
-		if (pos * 2 + 2 < heap->nr) {
-			right = data + ((pos * 2 + 2) * func->elem_size);
-			if (func->less(right, smallest))
-				smallest = right;
-		}
-		if (smallest == parent)
-			break;
-		func->swp(smallest, parent);
-		if (smallest == left)
-			pos = (pos * 2) + 1;
-		else
-			pos = (pos * 2) + 2;
+	/* Special case for the last leaf with no sibling. */
+	if (i * 2 + 2 == heap->nr)
+		i = i * 2 + 1;
+
+	/* Backtrack to the correct location. */
+	while (i != pos && func->less(root, data + i * func->elem_size))
+		i = (i - 1) / 2;
+
+	/* Shift the element into its correct place. */
+	j = i;
+	while (i != pos) {
+		i = (i - 1) / 2;
+		func->swp(data + i * func->elem_size, data + j * func->elem_size);
 	}
 }
 

From 0df2b25ce9d89976fba0c800e5562148a3345a46 Mon Sep 17 00:00:00 2001
From: Alexey Gladkov <legion@kernel.org>
Date: Mon, 15 Jan 2024 15:46:41 +0000
Subject: [PATCH 577/707] sysctl: allow change system v ipc sysctls inside ipc
 namespace

Patch series "Allow to change ipc/mq sysctls inside ipc namespace", v3.

Right now ipc and mq limits count as per ipc namespace, but only real root
can change them.  By default, the current values of these limits are such
that it can only be reduced.  Since only root can change the values, it is
impossible to reduce these limits in the rootless container.

We can allow limit changes within ipc namespace because mq parameters are
limited by RLIMIT_MSGQUEUE and ipc parameters are not limited to anything
other than cgroups.


This patch (of 3):

Rootless containers are not allowed to modify kernel IPC parameters.

All default limits are set to such high values that in fact there are no
limits at all.  All limits are not inherited and are initialized to
default values when a new ipc_namespace is created.

For new ipc_namespace:

size_t       ipc_ns.shm_ctlmax = SHMMAX; // (ULONG_MAX - (1UL << 24))
size_t       ipc_ns.shm_ctlall = SHMALL; // (ULONG_MAX - (1UL << 24))
int          ipc_ns.shm_ctlmni = IPCMNI; // (1 << 15)
int          ipc_ns.shm_rmid_forced = 0;
unsigned int ipc_ns.msg_ctlmax = MSGMAX; // 8192
unsigned int ipc_ns.msg_ctlmni = MSGMNI; // 32000
unsigned int ipc_ns.msg_ctlmnb = MSGMNB; // 16384

The shm_tot (total amount of shared pages) has also ceased to be global,
it is located in ipc_namespace and is not inherited from anywhere.

In such conditions, it cannot be said that these limits limit anything.
The real limiter for them is cgroups.

If we allow rootless containers to change these parameters, then it can
only be reduced.

Link: https://lkml.kernel.org/r/cover.1705333426.git.legion@kernel.org
Link: https://lkml.kernel.org/r/d2f4603305cbfed58a24755aa61d027314b73a45.1705333426.git.legion@kernel.org
Signed-off-by: Alexey Gladkov <legion@kernel.org>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Link: https://lkml.kernel.org/r/e2d84d3ec0172cfff759e6065da84ce0cc2736f8.1663756794.git.legion@kernel.org
Cc: Christian Brauner <brauner@kernel.org>
Cc: Joel Granados <joel.granados@gmail.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Manfred Spraul <manfred@colorfullife.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 ipc/ipc_sysctl.c | 37 +++++++++++++++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)

diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index 8c62e443f78b3c..01c4a50d22b2d2 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -14,6 +14,7 @@
 #include <linux/ipc_namespace.h>
 #include <linux/msg.h>
 #include <linux/slab.h>
+#include <linux/cred.h>
 #include "util.h"
 
 static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write,
@@ -190,25 +191,57 @@ static int set_is_seen(struct ctl_table_set *set)
 	return &current->nsproxy->ipc_ns->ipc_set == set;
 }
 
+static void ipc_set_ownership(struct ctl_table_header *head,
+			      struct ctl_table *table,
+			      kuid_t *uid, kgid_t *gid)
+{
+	struct ipc_namespace *ns =
+		container_of(head->set, struct ipc_namespace, ipc_set);
+
+	kuid_t ns_root_uid = make_kuid(ns->user_ns, 0);
+	kgid_t ns_root_gid = make_kgid(ns->user_ns, 0);
+
+	*uid = uid_valid(ns_root_uid) ? ns_root_uid : GLOBAL_ROOT_UID;
+	*gid = gid_valid(ns_root_gid) ? ns_root_gid : GLOBAL_ROOT_GID;
+}
+
 static int ipc_permissions(struct ctl_table_header *head, struct ctl_table *table)
 {
 	int mode = table->mode;
 
 #ifdef CONFIG_CHECKPOINT_RESTORE
-	struct ipc_namespace *ns = current->nsproxy->ipc_ns;
+	struct ipc_namespace *ns =
+		container_of(head->set, struct ipc_namespace, ipc_set);
 
 	if (((table->data == &ns->ids[IPC_SEM_IDS].next_id) ||
 	     (table->data == &ns->ids[IPC_MSG_IDS].next_id) ||
 	     (table->data == &ns->ids[IPC_SHM_IDS].next_id)) &&
 	    checkpoint_restore_ns_capable(ns->user_ns))
 		mode = 0666;
+	else
 #endif
-	return mode;
+	{
+		kuid_t ns_root_uid;
+		kgid_t ns_root_gid;
+
+		ipc_set_ownership(head, table, &ns_root_uid, &ns_root_gid);
+
+		if (uid_eq(current_euid(), ns_root_uid))
+			mode >>= 6;
+
+		else if (in_egroup_p(ns_root_gid))
+			mode >>= 3;
+	}
+
+	mode &= 7;
+
+	return (mode << 6) | (mode << 3) | mode;
 }
 
 static struct ctl_table_root set_root = {
 	.lookup = set_lookup,
 	.permissions = ipc_permissions,
+	.set_ownership = ipc_set_ownership,
 };
 
 bool setup_ipc_sysctls(struct ipc_namespace *ns)

From 41d8f2d3ad4aa64c15752928a2edbe9af5470fec Mon Sep 17 00:00:00 2001
From: Alexey Gladkov <legion@kernel.org>
Date: Mon, 15 Jan 2024 15:46:42 +0000
Subject: [PATCH 578/707] docs: add information about ipc sysctls limitations

After 25b21cb2f6d6 ("[PATCH] IPC namespace core") and 4e9823111bdc
("[PATCH] IPC namespace - shm") the shared memory page count stopped being
global and started counting per ipc namespace.  The documentation and
shmget(2) still says that shmall is a global option.

shmget(2):

SHMALL System-wide limit on the total amount of shared memory, measured in
units of the system page size.  On Linux, this limit can be read and
modified via /proc/sys/kernel/shmall.

I think the changes made in 2006 should be documented.

Link: https://lkml.kernel.org/r/09e99911071766958af488beb4e8a728a4f12135.1705333426.git.legion@kernel.org
Signed-off-by: Alexey Gladkov <legion@kernel.org>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Link: https://lkml.kernel.org/r/ede20ddf7be48b93e8084c3be2e920841ee1a641.1663756794.git.legion@kernel.org
Cc: Christian Brauner <brauner@kernel.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Joel Granados <joel.granados@gmail.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Manfred Spraul <manfred@colorfullife.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/sysctl/kernel.rst | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index 6584a1f9bfe39d..bc578663619d6e 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -594,6 +594,9 @@ default (``MSGMNB``).
 ``msgmni`` is the maximum number of IPC queues. 32000 by default
 (``MSGMNI``).
 
+All of these parameters are set per ipc namespace. The maximum number of bytes
+in POSIX message queues is limited by ``RLIMIT_MSGQUEUE``. This limit is
+respected hierarchically in the each user namespace.
 
 msg_next_id, sem_next_id, and shm_next_id (System V IPC)
 ========================================================
@@ -1274,15 +1277,20 @@ are doing anyway :)
 shmall
 ======
 
-This parameter sets the total amount of shared memory pages that
-can be used system wide. Hence, ``shmall`` should always be at least
-``ceil(shmmax/PAGE_SIZE)``.
+This parameter sets the total amount of shared memory pages that can be used
+inside ipc namespace. The shared memory pages counting occurs for each ipc
+namespace separately and is not inherited. Hence, ``shmall`` should always be at
+least ``ceil(shmmax/PAGE_SIZE)``.
 
 If you are not sure what the default ``PAGE_SIZE`` is on your Linux
 system, you can run the following command::
 
 	# getconf PAGE_SIZE
 
+To reduce or disable the ability to allocate shared memory, you must create a
+new ipc namespace, set this parameter to the required value and prohibit the
+creation of a new ipc namespace in the current user namespace or cgroups can
+be used.
 
 shmmax
 ======

From 29dcc97276179e45688de7cb53b6b69c3d85d4c4 Mon Sep 17 00:00:00 2001
From: Alexey Gladkov <legion@kernel.org>
Date: Mon, 15 Jan 2024 15:46:43 +0000
Subject: [PATCH 579/707] sysctl: allow to change limits for posix messages
 queues

All parameters of posix messages queues (queues_max/msg_max/msgsize_max)
end up being limited by RLIMIT_MSGQUEUE.  The code in mqueue_get_inode is
where that limiting happens.

The RLIMIT_MSGQUEUE is bound to the user namespace and is counted
hierarchically.

We can allow root in the user namespace to modify the posix messages
queues parameters.

Link: https://lkml.kernel.org/r/6ad67f23d1459a4f4339f74aa73bac0ecf3995e1.1705333426.git.legion@kernel.org
Signed-off-by: Alexey Gladkov <legion@kernel.org>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Link: https://lkml.kernel.org/r/7eb21211c8622e91d226e63416b1b93c079f60ee.1663756794.git.legion@kernel.org
Cc: Christian Brauner <brauner@kernel.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Joel Granados <joel.granados@gmail.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Manfred Spraul <manfred@colorfullife.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 ipc/mq_sysctl.c | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c
index ebb5ed81c151a8..21fba3a6edaf7a 100644
--- a/ipc/mq_sysctl.c
+++ b/ipc/mq_sysctl.c
@@ -12,6 +12,7 @@
 #include <linux/stat.h>
 #include <linux/capability.h>
 #include <linux/slab.h>
+#include <linux/cred.h>
 
 static int msg_max_limit_min = MIN_MSGMAX;
 static int msg_max_limit_max = HARD_MSGMAX;
@@ -76,8 +77,43 @@ static int set_is_seen(struct ctl_table_set *set)
 	return &current->nsproxy->ipc_ns->mq_set == set;
 }
 
+static void mq_set_ownership(struct ctl_table_header *head,
+			     struct ctl_table *table,
+			     kuid_t *uid, kgid_t *gid)
+{
+	struct ipc_namespace *ns =
+		container_of(head->set, struct ipc_namespace, mq_set);
+
+	kuid_t ns_root_uid = make_kuid(ns->user_ns, 0);
+	kgid_t ns_root_gid = make_kgid(ns->user_ns, 0);
+
+	*uid = uid_valid(ns_root_uid) ? ns_root_uid : GLOBAL_ROOT_UID;
+	*gid = gid_valid(ns_root_gid) ? ns_root_gid : GLOBAL_ROOT_GID;
+}
+
+static int mq_permissions(struct ctl_table_header *head, struct ctl_table *table)
+{
+	int mode = table->mode;
+	kuid_t ns_root_uid;
+	kgid_t ns_root_gid;
+
+	mq_set_ownership(head, table, &ns_root_uid, &ns_root_gid);
+
+	if (uid_eq(current_euid(), ns_root_uid))
+		mode >>= 6;
+
+	else if (in_egroup_p(ns_root_gid))
+		mode >>= 3;
+
+	mode &= 7;
+
+	return (mode << 6) | (mode << 3) | mode;
+}
+
 static struct ctl_table_root set_root = {
 	.lookup = set_lookup,
+	.permissions = mq_permissions,
+	.set_ownership = mq_set_ownership,
 };
 
 bool setup_mq_sysctls(struct ipc_namespace *ns)

From 31607d64203b6dc464ab54d76f4d6e39e31a708b Mon Sep 17 00:00:00 2001
From: Li zeming <zeming@nfschina.com>
Date: Mon, 15 Jan 2024 14:25:19 +0800
Subject: [PATCH 580/707] user_namespace: Remove unnecessary NULL values from
 kbuf

kbuf is assigned first, so it does not need to initialize the assignment.

Link: https://lkml.kernel.org/r/20240115062519.31298-1-zeming@nfschina.com
Signed-off-by: Li zeming <zeming@nfschina.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/user_namespace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index ce4d99df5f0eb4..0b0b95418b16a7 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -931,7 +931,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 	struct uid_gid_map new_map;
 	unsigned idx;
 	struct uid_gid_extent extent;
-	char *kbuf = NULL, *pos, *next_line;
+	char *kbuf, *pos, *next_line;
 	ssize_t ret;
 
 	/* Only allow < page size writes at the beginning of the file */

From 7908dfd28df944dfbd26111ad0954fdd6e3fdba4 Mon Sep 17 00:00:00 2001
From: Kuan-Wei Chiu <visitorckw@gmail.com>
Date: Sat, 13 Jan 2024 11:13:51 +0800
Subject: [PATCH 581/707] lib/sort: optimize heapsort for equal elements in
 sift-down path

Patch series "lib/sort: Optimize the number of swaps and comparisons".

This patch series aims to optimize the heapsort algorithm, specifically
targeting a reduction in the number of swaps and comparisons required.


This patch (of 2):

Currently, when searching for the sift-down path and encountering equal
elements, the algorithm chooses the left child.  However, considering that
the height of the right subtree may be one less than that of the left
subtree, selecting the right child in such cases can potentially reduce
the number of comparisons and swaps.

For instance, when sorting an array of 10,000 identical elements, the
current implementation requires 247,209 comparisons.  With this patch, the
number of comparisons can be reduced to 227,241.

Link: https://lkml.kernel.org/r/20240113031352.2395118-1-visitorckw@gmail.com
Link: https://lkml.kernel.org/r/20240113031352.2395118-2-visitorckw@gmail.com
Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 lib/sort.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/sort.c b/lib/sort.c
index b399bf10d6759b..fe4efd4a1410f7 100644
--- a/lib/sort.c
+++ b/lib/sort.c
@@ -262,7 +262,7 @@ void sort_r(void *base, size_t num, size_t size,
 		 * average, 3/4 worst-case.)
 		 */
 		for (b = a; c = 2*b + size, (d = c + size) < n;)
-			b = do_cmp(base + c, base + d, cmp_func, priv) >= 0 ? c : d;
+			b = do_cmp(base + c, base + d, cmp_func, priv) > 0 ? c : d;
 		if (d == n)	/* Special case last leaf with no sibling */
 			b = c;
 

From 26a69239d7edbc74e50286dade3a99ca1da5446d Mon Sep 17 00:00:00 2001
From: Kuan-Wei Chiu <visitorckw@gmail.com>
Date: Sat, 13 Jan 2024 11:13:52 +0800
Subject: [PATCH 582/707] lib/sort: Optimize heapsort with double-pop variation

Instead of popping only the maximum element from the heap during each
iteration, we now pop the two largest elements at once.  Although this
introduces an additional comparison to determine the second largest
element, it enables a reduction in the height of the tree by one during
the heapify operations starting from root's left/right child.  This
reduction in tree height by one leads to a decrease of one comparison and
one swap.

This optimization results in saving approximately 0.5 * n swaps without
increasing the number of comparisons.  Additionally, the heap size during
heapify is now one less than the original size, offering a chance for
further reduction in comparisons and swaps.

The following experimental data is based on the array generated using
get_random_u32().

| N     | swaps (old) | swaps (new) | comparisons (old) | comparisons (new) |
|-------|-------------|-------------|-------------------|-------------------|
| 1000  | 9054        | 8569        | 10328             | 10320             |
| 2000  | 20137       | 19182       | 22634             | 22587             |
| 3000  | 32062       | 30623       | 35833             | 35752             |
| 4000  | 44274       | 42282       | 49332             | 49306             |
| 5000  | 57195       | 54676       | 63300             | 63294             |
| 6000  | 70205       | 67202       | 77599             | 77557             |
| 7000  | 83276       | 79831       | 92113             | 92032             |
| 8000  | 96630       | 92678       | 106635            | 106617            |
| 9000  | 110349      | 105883      | 121505            | 121404            |
| 10000 | 124165      | 119202      | 136628            | 136617            |


Link: https://lkml.kernel.org/r/20240113031352.2395118-3-visitorckw@gmail.com
Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
Cc: Ching-Chun (Jim) Huang <jserv@ccns.ncku.edu.tw>
Cc: George Spelvin <lkml@sdf.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 lib/sort.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/lib/sort.c b/lib/sort.c
index fe4efd4a1410f7..a0509088f82aa5 100644
--- a/lib/sort.c
+++ b/lib/sort.c
@@ -215,6 +215,7 @@ void sort_r(void *base, size_t num, size_t size,
 	/* pre-scale counters for performance */
 	size_t n = num * size, a = (num/2) * size;
 	const unsigned int lsbit = size & -size;  /* Used to find parent */
+	size_t shift = 0;
 
 	if (!a)		/* num < 2 || size == 0 */
 		return;
@@ -242,12 +243,21 @@ void sort_r(void *base, size_t num, size_t size,
 	for (;;) {
 		size_t b, c, d;
 
-		if (a)			/* Building heap: sift down --a */
-			a -= size;
-		else if (n -= size)	/* Sorting: Extract root to --n */
+		if (a)			/* Building heap: sift down a */
+			a -= size << shift;
+		else if (n > 3 * size) { /* Sorting: Extract two largest elements */
+			n -= size;
 			do_swap(base, base + n, size, swap_func, priv);
-		else			/* Sort complete */
+			shift = do_cmp(base + size, base + 2 * size, cmp_func, priv) <= 0;
+			a = size << shift;
+			n -= size;
+			do_swap(base + a, base + n, size, swap_func, priv);
+		} else if (n > size) {	/* Sorting: Extract root */
+			n -= size;
+			do_swap(base, base + n, size, swap_func, priv);
+		} else	{		/* Sort complete */
 			break;
+		}
 
 		/*
 		 * Sift element at "a" down into heap.  This is the

From ccef4769a428e3e2056f9698c3212677192c2a09 Mon Sep 17 00:00:00 2001
From: Chen Zhongjin <chenzhongjin@huawei.com>
Date: Wed, 17 Jan 2024 06:16:36 +0000
Subject: [PATCH 583/707] kprobes: use synchronize_rcu_tasks_rude in
 kprobe_optimizer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is a deadlock scenario in kprobe_optimizer():

pid A				pid B			pid C
kprobe_optimizer()		do_exit()		perf_kprobe_init()
mutex_lock(&kprobe_mutex)	exit_tasks_rcu_start()	mutex_lock(&kprobe_mutex)
synchronize_rcu_tasks()		zap_pid_ns_processes()	// waiting kprobe_mutex
// waiting tasks_rcu_exit_srcu	kernel_wait4()
				// waiting pid C exit

To avoid this deadlock loop, use synchronize_rcu_tasks_rude() in
kprobe_optimizer() rather than synchronize_rcu_tasks().
synchronize_rcu_tasks_rude() can also promise that all preempted tasks
have scheduled, but it will not wait tasks_rcu_exit_srcu.

Link: https://lkml.kernel.org/r/20240117061636.288412-1-chenzhongjin@huawei.com
Fixes: a30b85df7d59 ("kprobes: Use synchronize_rcu_tasks() for optprobe with CONFIG_PREEMPT=y")
Signed-off-by: Chen Zhongjin <chenzhongjin@huawei.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Douglas Anderson <dianders@chromium.org>
Cc: Eric DeVolder <eric.devolder@oracle.com>
Cc: Jakob Koschel <jkl820.git@gmail.com>
Cc: Juerg Haefliger <juerg.haefliger@canonical.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au> (powerpc)
Cc: Mickaël Salaün <mic@digikod.net>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Rick Edgecombe <rick.p.edgecombe@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yang Jihong <yangjihong1@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/Kconfig     | 2 +-
 kernel/kprobes.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index c91917b508736d..fe3942d9f0bf56 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -104,7 +104,7 @@ config STATIC_CALL_SELFTEST
 config OPTPROBES
 	def_bool y
 	depends on KPROBES && HAVE_OPTPROBES
-	select TASKS_RCU if PREEMPTION
+	select TASKS_RUDE_RCU
 
 config KPROBES_ON_FTRACE
 	def_bool y
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index d5a0ee40bf66c5..09056ae50c5832 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -623,7 +623,7 @@ static void kprobe_optimizer(struct work_struct *work)
 	 * Note that on non-preemptive kernel, this is transparently converted
 	 * to synchronoze_sched() to wait for all interrupts to have completed.
 	 */
-	synchronize_rcu_tasks();
+	synchronize_rcu_tasks_rude();
 
 	/* Step 3: Optimize kprobes after quiesence period */
 	do_optimize_kprobes();

From 7343ccc1e41c7bc75dad6fc79b45a37c19efa572 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Wed, 17 Jan 2024 12:33:11 -0800
Subject: [PATCH 584/707] 
 kprobes-use-synchronize_rcu_tasks_rude-in-kprobe_optimizer-fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

unrelated comment typo fix

Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: Chen Zhongjin <chenzhongjin@huawei.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Douglas Anderson <dianders@chromium.org>
Cc: Eric DeVolder <eric.devolder@oracle.com>
Cc: Jakob Koschel <jkl820.git@gmail.com>
Cc: Juerg Haefliger <juerg.haefliger@canonical.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Mickaël Salaün <mic@digikod.net>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Rick Edgecombe <rick.p.edgecombe@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yang Jihong <yangjihong1@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/kprobes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 09056ae50c5832..e8512177b8bb4a 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -621,7 +621,7 @@ static void kprobe_optimizer(struct work_struct *work)
 	 * instruction is preempted. In that case, such tasks can return
 	 * to 2nd-Nth byte of jump instruction. This wait is for avoiding it.
 	 * Note that on non-preemptive kernel, this is transparently converted
-	 * to synchronoze_sched() to wait for all interrupts to have completed.
+	 * to synchronize_sched() to wait for all interrupts to have completed.
 	 */
 	synchronize_rcu_tasks_rude();
 

From 24e8b9bdc03565824b4cfbc50c5a8dd61b374421 Mon Sep 17 00:00:00 2001
From: Kemeng Shi <shikemeng@huaweicloud.com>
Date: Fri, 19 Jan 2024 04:13:21 +0800
Subject: [PATCH 585/707] flex_proportions: remove unused fprop_local_single

The single variant of flex_proportions is not used.  Simply remove it.

Link: https://lkml.kernel.org/r/20240118201321.759174-1-shikemeng@huaweicloud.com
Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/flex_proportions.h | 32 -------------
 lib/flex_proportions.c           | 77 --------------------------------
 2 files changed, 109 deletions(-)

diff --git a/include/linux/flex_proportions.h b/include/linux/flex_proportions.h
index 3e378b1fb0bc82..e9a72fd0bfe78b 100644
--- a/include/linux/flex_proportions.h
+++ b/include/linux/flex_proportions.h
@@ -38,38 +38,6 @@ int fprop_global_init(struct fprop_global *p, gfp_t gfp);
 void fprop_global_destroy(struct fprop_global *p);
 bool fprop_new_period(struct fprop_global *p, int periods);
 
-/*
- *  ---- SINGLE ----
- */
-struct fprop_local_single {
-	/* the local events counter */
-	unsigned long events;
-	/* Period in which we last updated events */
-	unsigned int period;
-	raw_spinlock_t lock;	/* Protect period and numerator */
-};
-
-#define INIT_FPROP_LOCAL_SINGLE(name)			\
-{	.lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock),	\
-}
-
-int fprop_local_init_single(struct fprop_local_single *pl);
-void fprop_local_destroy_single(struct fprop_local_single *pl);
-void __fprop_inc_single(struct fprop_global *p, struct fprop_local_single *pl);
-void fprop_fraction_single(struct fprop_global *p,
-	struct fprop_local_single *pl, unsigned long *numerator,
-	unsigned long *denominator);
-
-static inline
-void fprop_inc_single(struct fprop_global *p, struct fprop_local_single *pl)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	__fprop_inc_single(p, pl);
-	local_irq_restore(flags);
-}
-
 /*
  * ---- PERCPU ----
  */
diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c
index 83332fefa6f42e..84ecccddc77182 100644
--- a/lib/flex_proportions.c
+++ b/lib/flex_proportions.c
@@ -83,83 +83,6 @@ bool fprop_new_period(struct fprop_global *p, int periods)
 	return true;
 }
 
-/*
- * ---- SINGLE ----
- */
-
-int fprop_local_init_single(struct fprop_local_single *pl)
-{
-	pl->events = 0;
-	pl->period = 0;
-	raw_spin_lock_init(&pl->lock);
-	return 0;
-}
-
-void fprop_local_destroy_single(struct fprop_local_single *pl)
-{
-}
-
-static void fprop_reflect_period_single(struct fprop_global *p,
-					struct fprop_local_single *pl)
-{
-	unsigned int period = p->period;
-	unsigned long flags;
-
-	/* Fast path - period didn't change */
-	if (pl->period == period)
-		return;
-	raw_spin_lock_irqsave(&pl->lock, flags);
-	/* Someone updated pl->period while we were spinning? */
-	if (pl->period >= period) {
-		raw_spin_unlock_irqrestore(&pl->lock, flags);
-		return;
-	}
-	/* Aging zeroed our fraction? */
-	if (period - pl->period < BITS_PER_LONG)
-		pl->events >>= period - pl->period;
-	else
-		pl->events = 0;
-	pl->period = period;
-	raw_spin_unlock_irqrestore(&pl->lock, flags);
-}
-
-/* Event of type pl happened */
-void __fprop_inc_single(struct fprop_global *p, struct fprop_local_single *pl)
-{
-	fprop_reflect_period_single(p, pl);
-	pl->events++;
-	percpu_counter_add(&p->events, 1);
-}
-
-/* Return fraction of events of type pl */
-void fprop_fraction_single(struct fprop_global *p,
-			   struct fprop_local_single *pl,
-			   unsigned long *numerator, unsigned long *denominator)
-{
-	unsigned int seq;
-	s64 num, den;
-
-	do {
-		seq = read_seqcount_begin(&p->sequence);
-		fprop_reflect_period_single(p, pl);
-		num = pl->events;
-		den = percpu_counter_read_positive(&p->events);
-	} while (read_seqcount_retry(&p->sequence, seq));
-
-	/*
-	 * Make fraction <= 1 and denominator > 0 even in presence of percpu
-	 * counter errors
-	 */
-	if (den <= num) {
-		if (num)
-			den = num;
-		else
-			den = 1;
-	}
-	*denominator = den;
-	*numerator = num;
-}
-
 /*
  * ---- PERCPU ----
  */

From 8b088f4def0cc472369ac39bb3a3a393160a5c51 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 22 Jan 2024 18:16:31 +0100
Subject: [PATCH 586/707] ptrace_attach: shift send(SIGSTOP) into
 ptrace_set_stopped()

Turn send_sig_info(SIGSTOP) into send_signal_locked(SIGSTOP) and move it
from ptrace_attach() to ptrace_set_stopped().

This looks more logical and avoids lock(siglock) right after unlock().

Link: https://lkml.kernel.org/r/20240122171631.GA29844@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/ptrace.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 2fabd497d65988..d5f89f9ef29f65 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -375,10 +375,13 @@ static int check_ptrace_options(unsigned long data)
 	return 0;
 }
 
-static inline void ptrace_set_stopped(struct task_struct *task)
+static inline void ptrace_set_stopped(struct task_struct *task, bool seize)
 {
 	guard(spinlock)(&task->sighand->siglock);
 
+	/* SEIZE doesn't trap tracee on attach */
+	if (!seize)
+		send_signal_locked(SIGSTOP, SEND_SIG_PRIV, task, PIDTYPE_PID);
 	/*
 	 * If the task is already STOPPED, set JOBCTL_TRAP_STOP and
 	 * TRAPPING, and kick it so that it transits to TRACED.  TRAPPING
@@ -457,14 +460,8 @@ static int ptrace_attach(struct task_struct *task, long request,
 				return -EPERM;
 
 			task->ptrace = flags;
-
 			ptrace_link(task, current);
-
-			/* SEIZE doesn't trap tracee on attach */
-			if (!seize)
-				send_sig_info(SIGSTOP, SEND_SIG_PRIV, task);
-
-			ptrace_set_stopped(task);
+			ptrace_set_stopped(task, seize);
 		}
 	}
 

From 939f96166af258188876a5d720b0aa3ed97072e0 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Mon, 22 Jan 2024 15:50:43 +0100
Subject: [PATCH 587/707] lib: dhry: remove unneeded <linux/mutex.h>

Patch series "lib: dhry: miscellaneous cleanups".

This patch series contains a few miscellaneous cleanups for the
Dhrystone benchmark test.


This patch (of 3):

The Dhrystone benchmark test does not use mutexes.

Link: https://lkml.kernel.org/r/cover.1705934853.git.geert+renesas@glider.be
Link: https://lkml.kernel.org/r/cf8fafaedccf96143f1513745c43a457480bfc24.1705934853.git.geert+renesas@glider.be
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 lib/dhry_run.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lib/dhry_run.c b/lib/dhry_run.c
index f15ac666e9d38b..e6a279dabf848e 100644
--- a/lib/dhry_run.c
+++ b/lib/dhry_run.c
@@ -10,7 +10,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
-#include <linux/mutex.h>
 #include <linux/smp.h>
 
 #define DHRY_VAX	1757

From 99de0f4878a9c624b4a06f6e9546ec95a3bfa24e Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Mon, 22 Jan 2024 15:50:44 +0100
Subject: [PATCH 588/707] lib: dhry: use ktime_ms_delta() helper

Use the existing ktime_ms_delta() helper instead of open-coding the same
operation.

Link: https://lkml.kernel.org/r/bb43c67a7580de6152f5e6eb225071166d33b6e4.1705934853.git.geert+renesas@glider.be
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 lib/dhry_1.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/dhry_1.c b/lib/dhry_1.c
index 08edbbb19f573f..ca6c87232c5809 100644
--- a/lib/dhry_1.c
+++ b/lib/dhry_1.c
@@ -277,7 +277,7 @@ int dhry(int n)
 	dhry_assert_string_eq(Str_1_Loc, "DHRYSTONE PROGRAM, 1'ST STRING");
 	dhry_assert_string_eq(Str_2_Loc, "DHRYSTONE PROGRAM, 2'ND STRING");
 
-	User_Time = ktime_to_ms(ktime_sub(End_Time, Begin_Time));
+	User_Time = ktime_ms_delta(End_Time, Begin_Time);
 
 	kfree(Ptr_Glob);
 	kfree(Next_Ptr_Glob);

From e495a4c40a4122d495664709823e966d3e0b1916 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Mon, 22 Jan 2024 15:50:45 +0100
Subject: [PATCH 589/707] lib: dhry: add missing closing parenthesis

The help text for the Dhrystone benchmark test lacks a matching closing
parenthesis.

Link: https://lkml.kernel.org/r/772b43271bcb3dd17a6aae671b2084f08c05b079.1705934853.git.geert+renesas@glider.be
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 lib/Kconfig.debug | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 975a07f9f1cc08..8f502f15dc7fb5 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2142,7 +2142,7 @@ config TEST_DHRY
 
 	  To run the benchmark, it needs to be enabled explicitly, either from
 	  the kernel command line (when built-in), or from userspace (when
-	  built-in or modular.
+	  built-in or modular).
 
 	  Run once during kernel boot:
 

From 3fa0dc95b4a2d0d3901b019c087b920c63f1668f Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Date: Mon, 22 Jan 2024 23:01:49 +0900
Subject: [PATCH 590/707] nilfs2: convert segment buffer to use kmap_local

In the segment buffer code used for log writing, a CRC calculation routine
uses the deprecated kmap_atomic(), so convert it to use kmap_local.

Link: https://lkml.kernel.org/r/20240122140202.6950-3-konishi.ryusuke@gmail.com
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/nilfs2/segbuf.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 6e59dc19a73249..dc431b4c34c96c 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -220,9 +220,9 @@ static void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
 		crc = crc32_le(crc, bh->b_data, bh->b_size);
 	}
 	list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
-		kaddr = kmap_atomic(bh->b_page);
+		kaddr = kmap_local_page(bh->b_page);
 		crc = crc32_le(crc, kaddr + bh_offset(bh), bh->b_size);
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 	}
 	raw_sum->ss_datasum = cpu_to_le32(crc);
 }

From 3f0169197563cc5d0efbf4b20cb7cacf9ef58034 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Date: Mon, 22 Jan 2024 23:01:50 +0900
Subject: [PATCH 591/707] nilfs2: convert nilfs_copy_buffer() to use kmap_local

The routine nilfs_copy_buffer() that copies a block buffer still uses the
deprecated kmap_atomic(), so convert it to use kmap_local.

Link: https://lkml.kernel.org/r/20240122140202.6950-4-konishi.ryusuke@gmail.com
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/nilfs2/page.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 5c2eba1987bd70..14e470fb88706a 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -103,11 +103,11 @@ void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh)
 	struct page *spage = sbh->b_page, *dpage = dbh->b_page;
 	struct buffer_head *bh;
 
-	kaddr0 = kmap_atomic(spage);
-	kaddr1 = kmap_atomic(dpage);
+	kaddr0 = kmap_local_page(spage);
+	kaddr1 = kmap_local_page(dpage);
 	memcpy(kaddr1 + bh_offset(dbh), kaddr0 + bh_offset(sbh), sbh->b_size);
-	kunmap_atomic(kaddr1);
-	kunmap_atomic(kaddr0);
+	kunmap_local(kaddr1);
+	kunmap_local(kaddr0);
 
 	dbh->b_state = sbh->b_state & NILFS_BUFFER_INHERENT_BITS;
 	dbh->b_blocknr = sbh->b_blocknr;

From 1319083353489215d23d8cdedd5fa09f740e3020 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Date: Mon, 22 Jan 2024 23:01:51 +0900
Subject: [PATCH 592/707] nilfs2: convert metadata file common code to use
 kmap_local

In the common code of metadata files, the new block creation routine
nilfs_mdt_insert_new_block() still uses the deprecated kmap_atomic(), so
convert it to use kmap_local.

Link: https://lkml.kernel.org/r/20240122140202.6950-5-konishi.ryusuke@gmail.com
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/nilfs2/mdt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index e45c01a559c013..4f792a0ad0f0ff 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -47,12 +47,12 @@ nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,
 
 	set_buffer_mapped(bh);
 
-	kaddr = kmap_atomic(bh->b_page);
+	kaddr = kmap_local_page(bh->b_page);
 	memset(kaddr + bh_offset(bh), 0, i_blocksize(inode));
 	if (init_block)
 		init_block(inode, bh, kaddr);
 	flush_dcache_page(bh->b_page);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	set_buffer_uptodate(bh);
 	mark_buffer_dirty(bh);

From 690bb2b2b6d4cf3e35ad5c44bad1b46d547ff2e7 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Date: Mon, 22 Jan 2024 23:01:52 +0900
Subject: [PATCH 593/707] nilfs2: convert sufile to use kmap_local

Concerning the code of the metadata file sufile for segment management,
convert all parts that uses the deprecated kmap_atomic() to use
kmap_local.  All transformations are directly possible here.

Link: https://lkml.kernel.org/r/20240122140202.6950-6-konishi.ryusuke@gmail.com
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/nilfs2/sufile.c | 86 +++++++++++++++++++++++-----------------------
 1 file changed, 43 insertions(+), 43 deletions(-)

diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 0a8119456c2136..abf05dc5750c79 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -107,11 +107,11 @@ static void nilfs_sufile_mod_counter(struct buffer_head *header_bh,
 	struct nilfs_sufile_header *header;
 	void *kaddr;
 
-	kaddr = kmap_atomic(header_bh->b_page);
+	kaddr = kmap_local_page(header_bh->b_page);
 	header = kaddr + bh_offset(header_bh);
 	le64_add_cpu(&header->sh_ncleansegs, ncleanadd);
 	le64_add_cpu(&header->sh_ndirtysegs, ndirtyadd);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	mark_buffer_dirty(header_bh);
 }
@@ -315,10 +315,10 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
 	ret = nilfs_sufile_get_header_block(sufile, &header_bh);
 	if (ret < 0)
 		goto out_sem;
-	kaddr = kmap_atomic(header_bh->b_page);
+	kaddr = kmap_local_page(header_bh->b_page);
 	header = kaddr + bh_offset(header_bh);
 	last_alloc = le64_to_cpu(header->sh_last_alloc);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	nsegments = nilfs_sufile_get_nsegments(sufile);
 	maxsegnum = sui->allocmax;
@@ -352,7 +352,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
 							   &su_bh);
 		if (ret < 0)
 			goto out_header;
-		kaddr = kmap_atomic(su_bh->b_page);
+		kaddr = kmap_local_page(su_bh->b_page);
 		su = nilfs_sufile_block_get_segment_usage(
 			sufile, segnum, su_bh, kaddr);
 
@@ -363,14 +363,14 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
 				continue;
 			/* found a clean segment */
 			nilfs_segment_usage_set_dirty(su);
-			kunmap_atomic(kaddr);
+			kunmap_local(kaddr);
 
-			kaddr = kmap_atomic(header_bh->b_page);
+			kaddr = kmap_local_page(header_bh->b_page);
 			header = kaddr + bh_offset(header_bh);
 			le64_add_cpu(&header->sh_ncleansegs, -1);
 			le64_add_cpu(&header->sh_ndirtysegs, 1);
 			header->sh_last_alloc = cpu_to_le64(segnum);
-			kunmap_atomic(kaddr);
+			kunmap_local(kaddr);
 
 			sui->ncleansegs--;
 			mark_buffer_dirty(header_bh);
@@ -384,7 +384,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
 			goto out_header;
 		}
 
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		brelse(su_bh);
 	}
 
@@ -406,16 +406,16 @@ void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum,
 	struct nilfs_segment_usage *su;
 	void *kaddr;
 
-	kaddr = kmap_atomic(su_bh->b_page);
+	kaddr = kmap_local_page(su_bh->b_page);
 	su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
 	if (unlikely(!nilfs_segment_usage_clean(su))) {
 		nilfs_warn(sufile->i_sb, "%s: segment %llu must be clean",
 			   __func__, (unsigned long long)segnum);
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		return;
 	}
 	nilfs_segment_usage_set_dirty(su);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	nilfs_sufile_mod_counter(header_bh, -1, 1);
 	NILFS_SUI(sufile)->ncleansegs--;
@@ -432,11 +432,11 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum,
 	void *kaddr;
 	int clean, dirty;
 
-	kaddr = kmap_atomic(su_bh->b_page);
+	kaddr = kmap_local_page(su_bh->b_page);
 	su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
 	if (su->su_flags == cpu_to_le32(BIT(NILFS_SEGMENT_USAGE_DIRTY)) &&
 	    su->su_nblocks == cpu_to_le32(0)) {
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		return;
 	}
 	clean = nilfs_segment_usage_clean(su);
@@ -446,7 +446,7 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum,
 	su->su_lastmod = cpu_to_le64(0);
 	su->su_nblocks = cpu_to_le32(0);
 	su->su_flags = cpu_to_le32(BIT(NILFS_SEGMENT_USAGE_DIRTY));
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1);
 	NILFS_SUI(sufile)->ncleansegs -= clean;
@@ -463,12 +463,12 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
 	void *kaddr;
 	int sudirty;
 
-	kaddr = kmap_atomic(su_bh->b_page);
+	kaddr = kmap_local_page(su_bh->b_page);
 	su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
 	if (nilfs_segment_usage_clean(su)) {
 		nilfs_warn(sufile->i_sb, "%s: segment %llu is already clean",
 			   __func__, (unsigned long long)segnum);
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		return;
 	}
 	if (unlikely(nilfs_segment_usage_error(su)))
@@ -481,7 +481,7 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
 			   (unsigned long long)segnum);
 
 	nilfs_segment_usage_set_clean(su);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 	mark_buffer_dirty(su_bh);
 
 	nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0);
@@ -509,12 +509,12 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum)
 	if (ret)
 		goto out_sem;
 
-	kaddr = kmap_atomic(bh->b_page);
+	kaddr = kmap_local_page(bh->b_page);
 	su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
 	if (unlikely(nilfs_segment_usage_error(su))) {
 		struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
 
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		brelse(bh);
 		if (nilfs_segment_is_active(nilfs, segnum)) {
 			nilfs_error(sufile->i_sb,
@@ -532,7 +532,7 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum)
 		ret = -EIO;
 	} else {
 		nilfs_segment_usage_set_dirty(su);
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		mark_buffer_dirty(bh);
 		nilfs_mdt_mark_dirty(sufile);
 		brelse(bh);
@@ -562,7 +562,7 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
 	if (ret < 0)
 		goto out_sem;
 
-	kaddr = kmap_atomic(bh->b_page);
+	kaddr = kmap_local_page(bh->b_page);
 	su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
 	if (modtime) {
 		/*
@@ -573,7 +573,7 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
 		su->su_lastmod = cpu_to_le64(modtime);
 	}
 	su->su_nblocks = cpu_to_le32(nblocks);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	mark_buffer_dirty(bh);
 	nilfs_mdt_mark_dirty(sufile);
@@ -614,7 +614,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
 	if (ret < 0)
 		goto out_sem;
 
-	kaddr = kmap_atomic(header_bh->b_page);
+	kaddr = kmap_local_page(header_bh->b_page);
 	header = kaddr + bh_offset(header_bh);
 	sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile);
 	sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs);
@@ -624,7 +624,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
 	spin_lock(&nilfs->ns_last_segment_lock);
 	sustat->ss_prot_seq = nilfs->ns_prot_seq;
 	spin_unlock(&nilfs->ns_last_segment_lock);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 	brelse(header_bh);
 
  out_sem:
@@ -640,15 +640,15 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
 	void *kaddr;
 	int suclean;
 
-	kaddr = kmap_atomic(su_bh->b_page);
+	kaddr = kmap_local_page(su_bh->b_page);
 	su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
 	if (nilfs_segment_usage_error(su)) {
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		return;
 	}
 	suclean = nilfs_segment_usage_clean(su);
 	nilfs_segment_usage_set_error(su);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	if (suclean) {
 		nilfs_sufile_mod_counter(header_bh, -1, 0);
@@ -717,7 +717,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile,
 			/* hole */
 			continue;
 		}
-		kaddr = kmap_atomic(su_bh->b_page);
+		kaddr = kmap_local_page(su_bh->b_page);
 		su = nilfs_sufile_block_get_segment_usage(
 			sufile, segnum, su_bh, kaddr);
 		su2 = su;
@@ -726,7 +726,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile,
 			     ~BIT(NILFS_SEGMENT_USAGE_ERROR)) ||
 			    nilfs_segment_is_active(nilfs, segnum + j)) {
 				ret = -EBUSY;
-				kunmap_atomic(kaddr);
+				kunmap_local(kaddr);
 				brelse(su_bh);
 				goto out_header;
 			}
@@ -738,7 +738,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile,
 				nc++;
 			}
 		}
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		if (nc > 0) {
 			mark_buffer_dirty(su_bh);
 			ncleaned += nc;
@@ -823,10 +823,10 @@ int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs)
 		sui->allocmin = 0;
 	}
 
-	kaddr = kmap_atomic(header_bh->b_page);
+	kaddr = kmap_local_page(header_bh->b_page);
 	header = kaddr + bh_offset(header_bh);
 	header->sh_ncleansegs = cpu_to_le64(sui->ncleansegs);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	mark_buffer_dirty(header_bh);
 	nilfs_mdt_mark_dirty(sufile);
@@ -891,7 +891,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
 			continue;
 		}
 
-		kaddr = kmap_atomic(su_bh->b_page);
+		kaddr = kmap_local_page(su_bh->b_page);
 		su = nilfs_sufile_block_get_segment_usage(
 			sufile, segnum, su_bh, kaddr);
 		for (j = 0; j < n;
@@ -904,7 +904,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
 				si->sui_flags |=
 					BIT(NILFS_SEGMENT_USAGE_ACTIVE);
 		}
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		brelse(su_bh);
 	}
 	ret = nsegs;
@@ -973,7 +973,7 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf,
 		goto out_header;
 
 	for (;;) {
-		kaddr = kmap_atomic(bh->b_page);
+		kaddr = kmap_local_page(bh->b_page);
 		su = nilfs_sufile_block_get_segment_usage(
 			sufile, sup->sup_segnum, bh, kaddr);
 
@@ -1010,7 +1010,7 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf,
 			su->su_flags = cpu_to_le32(sup->sup_sui.sui_flags);
 		}
 
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 
 		sup = (void *)sup + supsz;
 		if (sup >= supend)
@@ -1115,7 +1115,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
 			continue;
 		}
 
-		kaddr = kmap_atomic(su_bh->b_page);
+		kaddr = kmap_local_page(su_bh->b_page);
 		su = nilfs_sufile_block_get_segment_usage(sufile, segnum,
 				su_bh, kaddr);
 		for (i = 0; i < n; ++i, ++segnum, su = (void *)su + susz) {
@@ -1145,7 +1145,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
 			}
 
 			if (nblocks >= minlen) {
-				kunmap_atomic(kaddr);
+				kunmap_local(kaddr);
 
 				ret = blkdev_issue_discard(nilfs->ns_bdev,
 						start * sects_per_block,
@@ -1157,7 +1157,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
 				}
 
 				ndiscarded += nblocks;
-				kaddr = kmap_atomic(su_bh->b_page);
+				kaddr = kmap_local_page(su_bh->b_page);
 				su = nilfs_sufile_block_get_segment_usage(
 					sufile, segnum, su_bh, kaddr);
 			}
@@ -1166,7 +1166,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
 			start = seg_start;
 			nblocks = seg_end - seg_start + 1;
 		}
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		put_bh(su_bh);
 	}
 
@@ -1246,10 +1246,10 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize,
 		goto failed;
 
 	sui = NILFS_SUI(sufile);
-	kaddr = kmap_atomic(header_bh->b_page);
+	kaddr = kmap_local_page(header_bh->b_page);
 	header = kaddr + bh_offset(header_bh);
 	sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 	brelse(header_bh);
 
 	sui->allocmax = nilfs_sufile_get_nsegments(sufile) - 1;

From 3ce4276b207eda048014dabc2295bf056278fdb9 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Date: Mon, 22 Jan 2024 23:01:53 +0900
Subject: [PATCH 594/707] nilfs2: convert persistent object allocator to use
 kmap_local

Regarding the allocator code that is commonly used in the ondisk inode
metadata file ifile and the disk address translation metadata file DAT,
convert the parts that use the deprecated kmap_atomic() and kmap() to use
kmap_local.

Most can be converted directly, but only
nilfs_palloc_prepare_alloc_entry() needs to be rewritten to change mapping
sections so that multiple kmap_local/kunmap_local calls are nested and
disk I/O can be avoided within the mapping sections.

Link: https://lkml.kernel.org/r/20240122140202.6950-7-konishi.ryusuke@gmail.com
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/nilfs2/alloc.c | 91 ++++++++++++++++++++++++-----------------------
 1 file changed, 46 insertions(+), 45 deletions(-)

diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 7342de296ec3c6..89caef7513db35 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -525,54 +525,55 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
 		ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh);
 		if (ret < 0)
 			return ret;
-		desc_kaddr = kmap(desc_bh->b_page);
+		desc_kaddr = kmap_local_page(desc_bh->b_page);
 		desc = nilfs_palloc_block_get_group_desc(
 			inode, group, desc_bh, desc_kaddr);
 		n = nilfs_palloc_rest_groups_in_desc_block(inode, group,
 							   maxgroup);
-		for (j = 0; j < n; j++, desc++, group++) {
+		for (j = 0; j < n; j++, desc++, group++, group_offset = 0) {
 			lock = nilfs_mdt_bgl_lock(inode, group);
-			if (nilfs_palloc_group_desc_nfrees(desc, lock) > 0) {
-				ret = nilfs_palloc_get_bitmap_block(
-					inode, group, 1, &bitmap_bh);
-				if (ret < 0)
-					goto out_desc;
-				bitmap_kaddr = kmap(bitmap_bh->b_page);
-				bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
-				pos = nilfs_palloc_find_available_slot(
-					bitmap, group_offset,
-					entries_per_group, lock);
-				if (pos >= 0) {
-					/* found a free entry */
-					nilfs_palloc_group_desc_add_entries(
-						desc, lock, -1);
-					req->pr_entry_nr =
-						entries_per_group * group + pos;
-					kunmap(desc_bh->b_page);
-					kunmap(bitmap_bh->b_page);
-
-					req->pr_desc_bh = desc_bh;
-					req->pr_bitmap_bh = bitmap_bh;
-					return 0;
-				}
-				kunmap(bitmap_bh->b_page);
-				brelse(bitmap_bh);
+			if (nilfs_palloc_group_desc_nfrees(desc, lock) == 0)
+				continue;
+
+			kunmap_local(desc_kaddr);
+			ret = nilfs_palloc_get_bitmap_block(inode, group, 1,
+							    &bitmap_bh);
+			if (unlikely(ret < 0)) {
+				brelse(desc_bh);
+				return ret;
 			}
 
-			group_offset = 0;
+			desc_kaddr = kmap_local_page(desc_bh->b_page);
+			desc = nilfs_palloc_block_get_group_desc(
+				inode, group, desc_bh, desc_kaddr);
+
+			bitmap_kaddr = kmap_local_page(bitmap_bh->b_page);
+			bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
+			pos = nilfs_palloc_find_available_slot(
+				bitmap, group_offset, entries_per_group, lock);
+			kunmap_local(bitmap_kaddr);
+			if (pos >= 0)
+				goto found;
+
+			brelse(bitmap_bh);
 		}
 
-		kunmap(desc_bh->b_page);
+		kunmap_local(desc_kaddr);
 		brelse(desc_bh);
 	}
 
 	/* no entries left */
 	return -ENOSPC;
 
- out_desc:
-	kunmap(desc_bh->b_page);
-	brelse(desc_bh);
-	return ret;
+found:
+	/* found a free entry */
+	nilfs_palloc_group_desc_add_entries(desc, lock, -1);
+	req->pr_entry_nr = entries_per_group * group + pos;
+	kunmap_local(desc_kaddr);
+
+	req->pr_desc_bh = desc_bh;
+	req->pr_bitmap_bh = bitmap_bh;
+	return 0;
 }
 
 /**
@@ -606,10 +607,10 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
 	spinlock_t *lock;
 
 	group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
-	desc_kaddr = kmap(req->pr_desc_bh->b_page);
+	desc_kaddr = kmap_local_page(req->pr_desc_bh->b_page);
 	desc = nilfs_palloc_block_get_group_desc(inode, group,
 						 req->pr_desc_bh, desc_kaddr);
-	bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
+	bitmap_kaddr = kmap_local_page(req->pr_bitmap_bh->b_page);
 	bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
 	lock = nilfs_mdt_bgl_lock(inode, group);
 
@@ -621,8 +622,8 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
 	else
 		nilfs_palloc_group_desc_add_entries(desc, lock, 1);
 
-	kunmap(req->pr_bitmap_bh->b_page);
-	kunmap(req->pr_desc_bh->b_page);
+	kunmap_local(bitmap_kaddr);
+	kunmap_local(desc_kaddr);
 
 	mark_buffer_dirty(req->pr_desc_bh);
 	mark_buffer_dirty(req->pr_bitmap_bh);
@@ -647,10 +648,10 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
 	spinlock_t *lock;
 
 	group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
-	desc_kaddr = kmap(req->pr_desc_bh->b_page);
+	desc_kaddr = kmap_local_page(req->pr_desc_bh->b_page);
 	desc = nilfs_palloc_block_get_group_desc(inode, group,
 						 req->pr_desc_bh, desc_kaddr);
-	bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
+	bitmap_kaddr = kmap_local_page(req->pr_bitmap_bh->b_page);
 	bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
 	lock = nilfs_mdt_bgl_lock(inode, group);
 
@@ -662,8 +663,8 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
 	else
 		nilfs_palloc_group_desc_add_entries(desc, lock, 1);
 
-	kunmap(req->pr_bitmap_bh->b_page);
-	kunmap(req->pr_desc_bh->b_page);
+	kunmap_local(bitmap_kaddr);
+	kunmap_local(desc_kaddr);
 
 	brelse(req->pr_bitmap_bh);
 	brelse(req->pr_desc_bh);
@@ -755,7 +756,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
 		/* Get the first entry number of the group */
 		group_min_nr = (__u64)group * epg;
 
-		bitmap_kaddr = kmap(bitmap_bh->b_page);
+		bitmap_kaddr = kmap_local_page(bitmap_bh->b_page);
 		bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
 		lock = nilfs_mdt_bgl_lock(inode, group);
 
@@ -801,7 +802,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
 			entry_start = rounddown(group_offset, epb);
 		} while (true);
 
-		kunmap(bitmap_bh->b_page);
+		kunmap_local(bitmap_kaddr);
 		mark_buffer_dirty(bitmap_bh);
 		brelse(bitmap_bh);
 
@@ -815,11 +816,11 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
 					   inode->i_ino);
 		}
 
-		desc_kaddr = kmap_atomic(desc_bh->b_page);
+		desc_kaddr = kmap_local_page(desc_bh->b_page);
 		desc = nilfs_palloc_block_get_group_desc(
 			inode, group, desc_bh, desc_kaddr);
 		nfree = nilfs_palloc_group_desc_add_entries(desc, lock, n);
-		kunmap_atomic(desc_kaddr);
+		kunmap_local(desc_kaddr);
 		mark_buffer_dirty(desc_bh);
 		nilfs_mdt_mark_dirty(inode);
 		brelse(desc_bh);

From 676cc3b2e60e1b183026a3ecc0e92d3449d62946 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Date: Mon, 22 Jan 2024 23:01:54 +0900
Subject: [PATCH 595/707] nilfs2: convert DAT to use kmap_local

Concerning the code of the metadata file DAT for disk address translation,
convert all parts that use the deprecated kmap_atomic to use kmap_local.
All transformations are directly possible.

Link: https://lkml.kernel.org/r/20240122140202.6950-8-konishi.ryusuke@gmail.com
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/nilfs2/dat.c | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 9cf6ba58f5859f..8f71f8b0e2188b 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -91,13 +91,13 @@ void nilfs_dat_commit_alloc(struct inode *dat, struct nilfs_palloc_req *req)
 	struct nilfs_dat_entry *entry;
 	void *kaddr;
 
-	kaddr = kmap_atomic(req->pr_entry_bh->b_page);
+	kaddr = kmap_local_page(req->pr_entry_bh->b_page);
 	entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
 					     req->pr_entry_bh, kaddr);
 	entry->de_start = cpu_to_le64(NILFS_CNO_MIN);
 	entry->de_end = cpu_to_le64(NILFS_CNO_MAX);
 	entry->de_blocknr = cpu_to_le64(0);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	nilfs_palloc_commit_alloc_entry(dat, req);
 	nilfs_dat_commit_entry(dat, req);
@@ -115,13 +115,13 @@ static void nilfs_dat_commit_free(struct inode *dat,
 	struct nilfs_dat_entry *entry;
 	void *kaddr;
 
-	kaddr = kmap_atomic(req->pr_entry_bh->b_page);
+	kaddr = kmap_local_page(req->pr_entry_bh->b_page);
 	entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
 					     req->pr_entry_bh, kaddr);
 	entry->de_start = cpu_to_le64(NILFS_CNO_MIN);
 	entry->de_end = cpu_to_le64(NILFS_CNO_MIN);
 	entry->de_blocknr = cpu_to_le64(0);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	nilfs_dat_commit_entry(dat, req);
 
@@ -145,12 +145,12 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,
 	struct nilfs_dat_entry *entry;
 	void *kaddr;
 
-	kaddr = kmap_atomic(req->pr_entry_bh->b_page);
+	kaddr = kmap_local_page(req->pr_entry_bh->b_page);
 	entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
 					     req->pr_entry_bh, kaddr);
 	entry->de_start = cpu_to_le64(nilfs_mdt_cno(dat));
 	entry->de_blocknr = cpu_to_le64(blocknr);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	nilfs_dat_commit_entry(dat, req);
 }
@@ -167,12 +167,12 @@ int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
 	if (ret < 0)
 		return ret;
 
-	kaddr = kmap_atomic(req->pr_entry_bh->b_page);
+	kaddr = kmap_local_page(req->pr_entry_bh->b_page);
 	entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
 					     req->pr_entry_bh, kaddr);
 	start = le64_to_cpu(entry->de_start);
 	blocknr = le64_to_cpu(entry->de_blocknr);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	if (blocknr == 0) {
 		ret = nilfs_palloc_prepare_free_entry(dat, req);
@@ -202,7 +202,7 @@ void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req,
 	sector_t blocknr;
 	void *kaddr;
 
-	kaddr = kmap_atomic(req->pr_entry_bh->b_page);
+	kaddr = kmap_local_page(req->pr_entry_bh->b_page);
 	entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
 					     req->pr_entry_bh, kaddr);
 	end = start = le64_to_cpu(entry->de_start);
@@ -212,7 +212,7 @@ void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req,
 	}
 	entry->de_end = cpu_to_le64(end);
 	blocknr = le64_to_cpu(entry->de_blocknr);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	if (blocknr == 0)
 		nilfs_dat_commit_free(dat, req);
@@ -227,12 +227,12 @@ void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req)
 	sector_t blocknr;
 	void *kaddr;
 
-	kaddr = kmap_atomic(req->pr_entry_bh->b_page);
+	kaddr = kmap_local_page(req->pr_entry_bh->b_page);
 	entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
 					     req->pr_entry_bh, kaddr);
 	start = le64_to_cpu(entry->de_start);
 	blocknr = le64_to_cpu(entry->de_blocknr);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	if (start == nilfs_mdt_cno(dat) && blocknr == 0)
 		nilfs_palloc_abort_free_entry(dat, req);
@@ -362,7 +362,7 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
 		}
 	}
 
-	kaddr = kmap_atomic(entry_bh->b_page);
+	kaddr = kmap_local_page(entry_bh->b_page);
 	entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
 	if (unlikely(entry->de_blocknr == cpu_to_le64(0))) {
 		nilfs_crit(dat->i_sb,
@@ -370,13 +370,13 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
 			   __func__, (unsigned long long)vblocknr,
 			   (unsigned long long)le64_to_cpu(entry->de_start),
 			   (unsigned long long)le64_to_cpu(entry->de_end));
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		brelse(entry_bh);
 		return -EINVAL;
 	}
 	WARN_ON(blocknr == 0);
 	entry->de_blocknr = cpu_to_le64(blocknr);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	mark_buffer_dirty(entry_bh);
 	nilfs_mdt_mark_dirty(dat);
@@ -426,7 +426,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
 		}
 	}
 
-	kaddr = kmap_atomic(entry_bh->b_page);
+	kaddr = kmap_local_page(entry_bh->b_page);
 	entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
 	blocknr = le64_to_cpu(entry->de_blocknr);
 	if (blocknr == 0) {
@@ -436,7 +436,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
 	*blocknrp = blocknr;
 
  out:
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 	brelse(entry_bh);
 	return ret;
 }
@@ -457,7 +457,7 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned int visz,
 						   0, &entry_bh);
 		if (ret < 0)
 			return ret;
-		kaddr = kmap_atomic(entry_bh->b_page);
+		kaddr = kmap_local_page(entry_bh->b_page);
 		/* last virtual block number in this block */
 		first = vinfo->vi_vblocknr;
 		do_div(first, entries_per_block);
@@ -473,7 +473,7 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned int visz,
 			vinfo->vi_end = le64_to_cpu(entry->de_end);
 			vinfo->vi_blocknr = le64_to_cpu(entry->de_blocknr);
 		}
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		brelse(entry_bh);
 	}
 

From 5705f6e6e8b5c0a35324d4596e93624fc57df291 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Date: Mon, 22 Jan 2024 23:01:55 +0900
Subject: [PATCH 596/707] nilfs2: move nilfs_bmap_write call out of
 nilfs_write_inode_common

Before converting the disk inode management metadata file ifile, the call
to nilfs_bmap_write(), the i_device_code setting, and the zero-fill code
for inodes on the super root block are moved from
nilfs_write_inode_common() to its callers.

This cleanup simplifies the role and arguments of
nilfs_write_inode_common() and collects calls to nilfs_bmap_write() to the
log writing code.

Also, add and use a new helper nilfs_write_root_mdt_inode() to avoid code
duplication in the data export routine nilfs_segctor_fill_in_super_root()
to the super root block's buffer.

Link: https://lkml.kernel.org/r/20240122140202.6950-9-konishi.ryusuke@gmail.com
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/nilfs2/inode.c   | 38 +++++++++++++++---------------------
 fs/nilfs2/nilfs.h   |  3 ++-
 fs/nilfs2/segment.c | 47 +++++++++++++++++++++++++++++++++++++--------
 3 files changed, 57 insertions(+), 31 deletions(-)

diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 9c334c722fc1c1..b9d40f5e94d32a 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -759,8 +759,18 @@ struct inode *nilfs_iget_for_shadow(struct inode *inode)
 	return s_inode;
 }
 
+/**
+ * nilfs_write_inode_common - export common inode information to on-disk inode
+ * @inode:     inode object
+ * @raw_inode: on-disk inode
+ *
+ * This function writes standard information from the on-memory inode @inode
+ * to @raw_inode on ifile, cpfile or a super root block.  Since inode bmap
+ * data is not exported, nilfs_bmap_write() must be called separately during
+ * log writing.
+ */
 void nilfs_write_inode_common(struct inode *inode,
-			      struct nilfs_inode *raw_inode, int has_bmap)
+			      struct nilfs_inode *raw_inode)
 {
 	struct nilfs_inode_info *ii = NILFS_I(inode);
 
@@ -778,21 +788,6 @@ void nilfs_write_inode_common(struct inode *inode,
 	raw_inode->i_flags = cpu_to_le32(ii->i_flags);
 	raw_inode->i_generation = cpu_to_le32(inode->i_generation);
 
-	if (NILFS_ROOT_METADATA_FILE(inode->i_ino)) {
-		struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
-
-		/* zero-fill unused portion in the case of super root block */
-		raw_inode->i_xattr = 0;
-		raw_inode->i_pad = 0;
-		memset((void *)raw_inode + sizeof(*raw_inode), 0,
-		       nilfs->ns_inode_size - sizeof(*raw_inode));
-	}
-
-	if (has_bmap)
-		nilfs_bmap_write(ii->i_bmap, raw_inode);
-	else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
-		raw_inode->i_device_code =
-			cpu_to_le64(huge_encode_dev(inode->i_rdev));
 	/*
 	 * When extending inode, nilfs->ns_inode_size should be checked
 	 * for substitutions of appended fields.
@@ -813,12 +808,11 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags)
 	if (flags & I_DIRTY_DATASYNC)
 		set_bit(NILFS_I_INODE_SYNC, &ii->i_state);
 
-	nilfs_write_inode_common(inode, raw_inode, 0);
-		/*
-		 * XXX: call with has_bmap = 0 is a workaround to avoid
-		 * deadlock of bmap.  This delays update of i_bmap to just
-		 * before writing.
-		 */
+	nilfs_write_inode_common(inode, raw_inode);
+
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
+		raw_inode->i_device_code =
+			cpu_to_le64(huge_encode_dev(inode->i_rdev));
 
 	nilfs_ifile_unmap_inode(ifile, ino, ibh);
 }
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 98cffaf0ac1277..2e29b98ba8bab2 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -256,7 +256,8 @@ extern struct inode *nilfs_new_inode(struct inode *, umode_t);
 extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
 extern void nilfs_set_inode_flags(struct inode *);
 extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *);
-extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int);
+void nilfs_write_inode_common(struct inode *inode,
+			      struct nilfs_inode *raw_inode);
 struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root,
 			    unsigned long ino);
 struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root,
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 2590a0860eab02..b512e772846568 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -913,6 +913,7 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
 	struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
 	struct buffer_head *bh_cp;
 	struct nilfs_checkpoint *raw_cp;
+	struct inode *ifile;
 	int err;
 
 	err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 0,
@@ -941,8 +942,10 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
 	else
 		nilfs_checkpoint_set_minor(raw_cp);
 
-	nilfs_write_inode_common(sci->sc_root->ifile,
-				 &raw_cp->cp_ifile_inode, 1);
+	ifile = sci->sc_root->ifile;
+	nilfs_write_inode_common(ifile, &raw_cp->cp_ifile_inode);
+	nilfs_bmap_write(NILFS_I(ifile)->i_bmap, &raw_cp->cp_ifile_inode);
+
 	nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
 	return 0;
 
@@ -977,6 +980,33 @@ static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci)
 	}
 }
 
+/**
+ * nilfs_write_root_mdt_inode - export root metadata inode information to
+ *                              the on-disk inode
+ * @inode:     inode object of the root metadata file
+ * @raw_inode: on-disk inode
+ *
+ * nilfs_write_root_mdt_inode() writes inode information and bmap data of
+ * @inode to the inode area of the metadata file allocated on the super root
+ * block created to finalize the log.  Since super root blocks are configured
+ * each time, this function zero-fills the unused area of @raw_inode.
+ */
+static void nilfs_write_root_mdt_inode(struct inode *inode,
+				       struct nilfs_inode *raw_inode)
+{
+	struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
+
+	nilfs_write_inode_common(inode, raw_inode);
+
+	/* zero-fill unused portion of raw_inode */
+	raw_inode->i_xattr = 0;
+	raw_inode->i_pad = 0;
+	memset((void *)raw_inode + sizeof(*raw_inode), 0,
+	       nilfs->ns_inode_size - sizeof(*raw_inode));
+
+	nilfs_bmap_write(NILFS_I(inode)->i_bmap, raw_inode);
+}
+
 static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
 					     struct the_nilfs *nilfs)
 {
@@ -998,12 +1028,13 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
 			      nilfs->ns_nongc_ctime : sci->sc_seg_ctime);
 	raw_sr->sr_flags = 0;
 
-	nilfs_write_inode_common(nilfs->ns_dat, (void *)raw_sr +
-				 NILFS_SR_DAT_OFFSET(isz), 1);
-	nilfs_write_inode_common(nilfs->ns_cpfile, (void *)raw_sr +
-				 NILFS_SR_CPFILE_OFFSET(isz), 1);
-	nilfs_write_inode_common(nilfs->ns_sufile, (void *)raw_sr +
-				 NILFS_SR_SUFILE_OFFSET(isz), 1);
+	nilfs_write_root_mdt_inode(nilfs->ns_dat, (void *)raw_sr +
+				   NILFS_SR_DAT_OFFSET(isz));
+	nilfs_write_root_mdt_inode(nilfs->ns_cpfile, (void *)raw_sr +
+				   NILFS_SR_CPFILE_OFFSET(isz));
+	nilfs_write_root_mdt_inode(nilfs->ns_sufile, (void *)raw_sr +
+				   NILFS_SR_SUFILE_OFFSET(isz));
+
 	memset((void *)raw_sr + srsz, 0, nilfs->ns_blocksize - srsz);
 	set_buffer_uptodate(bh_sr);
 	unlock_buffer(bh_sr);

From f422ef6111d54c192bcc7e0c2d381a5a90299583 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Date: Mon, 22 Jan 2024 23:01:56 +0900
Subject: [PATCH 597/707] nilfs2: do not acquire rwsem in nilfs_bmap_write()

It is now clear that nilfs_bmap_write() is only used to finalize logs
written to disk.  Concurrent bmap modification operations are not
performed on bmaps in this context.  Additionally, this function does not
modify data used in read-only operations such as bmap lookups.

Therefore, there is no need to acquire bmap->b_sem in nilfs_bmap_write(),
so delete it.

Link: https://lkml.kernel.org/r/20240122140202.6950-10-konishi.ryusuke@gmail.com
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/nilfs2/bmap.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 7a8f166f2c8d84..383f0afa2cea36 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -548,13 +548,10 @@ int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode)
  */
 void nilfs_bmap_write(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode)
 {
-	down_write(&bmap->b_sem);
 	memcpy(raw_inode->i_bmap, bmap->b_u.u_data,
 	       NILFS_INODE_BMAP_SIZE * sizeof(__le64));
 	if (bmap->b_inode->i_ino == NILFS_DAT_INO)
 		bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT;
-
-	up_write(&bmap->b_sem);
 }
 
 void nilfs_bmap_init_gc(struct nilfs_bmap *bmap)

From 2d97cae8cc4853f622ca6a05ac7d63d455bd58f1 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Date: Mon, 22 Jan 2024 23:01:57 +0900
Subject: [PATCH 598/707] nilfs2: convert ifile to use kmap_local

Convert deprecated kmap() and kmap_atomic() to use kmap_local for the
ifile metadata file used to manage disk inodes.

In some usages, calls to kmap_local and kunmap_local are split into
different helpers, but those usages can be safely changed to local thread
kmap.

Link: https://lkml.kernel.org/r/20240122140202.6950-11-konishi.ryusuke@gmail.com
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/nilfs2/ifile.c   | 4 ++--
 fs/nilfs2/ifile.h   | 7 +++----
 fs/nilfs2/inode.c   | 6 +++---
 fs/nilfs2/segment.c | 2 +-
 4 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index a8a4bc8490b4d8..e9538fa46ff27d 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -115,11 +115,11 @@ int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino)
 		return ret;
 	}
 
-	kaddr = kmap_atomic(req.pr_entry_bh->b_page);
+	kaddr = kmap_local_page(req.pr_entry_bh->b_page);
 	raw_inode = nilfs_palloc_block_get_entry(ifile, req.pr_entry_nr,
 						 req.pr_entry_bh, kaddr);
 	raw_inode->i_flags = 0;
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	mark_buffer_dirty(req.pr_entry_bh);
 	brelse(req.pr_entry_bh);
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index 35c5273f48219b..b71ab0a81dc45e 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -21,15 +21,14 @@
 static inline struct nilfs_inode *
 nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh)
 {
-	void *kaddr = kmap(ibh->b_page);
+	void *kaddr = kmap_local_page(ibh->b_page);
 
 	return nilfs_palloc_block_get_entry(ifile, ino, ibh, kaddr);
 }
 
-static inline void nilfs_ifile_unmap_inode(struct inode *ifile, ino_t ino,
-					   struct buffer_head *ibh)
+static inline void nilfs_ifile_unmap_inode(struct nilfs_inode *raw_inode)
 {
-	kunmap(ibh->b_page);
+	kunmap_local(raw_inode);
 }
 
 int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **);
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index b9d40f5e94d32a..a475095a5e80b7 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -520,7 +520,7 @@ static int __nilfs_read_inode(struct super_block *sb,
 			inode, inode->i_mode,
 			huge_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
 	}
-	nilfs_ifile_unmap_inode(root->ifile, ino, bh);
+	nilfs_ifile_unmap_inode(raw_inode);
 	brelse(bh);
 	up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
 	nilfs_set_inode_flags(inode);
@@ -529,7 +529,7 @@ static int __nilfs_read_inode(struct super_block *sb,
 	return 0;
 
  failed_unmap:
-	nilfs_ifile_unmap_inode(root->ifile, ino, bh);
+	nilfs_ifile_unmap_inode(raw_inode);
 	brelse(bh);
 
  bad_inode:
@@ -814,7 +814,7 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags)
 		raw_inode->i_device_code =
 			cpu_to_le64(huge_encode_dev(inode->i_rdev));
 
-	nilfs_ifile_unmap_inode(ifile, ino, ibh);
+	nilfs_ifile_unmap_inode(raw_inode);
 }
 
 #define NILFS_MAX_TRUNCATE_BLOCKS	16384  /* 64MB for 4KB block */
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index b512e772846568..3657918328ea6f 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -966,7 +966,7 @@ static void nilfs_fill_in_file_bmap(struct inode *ifile,
 		raw_inode = nilfs_ifile_map_inode(ifile, ii->vfs_inode.i_ino,
 						  ibh);
 		nilfs_bmap_write(ii->i_bmap, raw_inode);
-		nilfs_ifile_unmap_inode(ifile, ii->vfs_inode.i_ino, ibh);
+		nilfs_ifile_unmap_inode(raw_inode);
 	}
 }
 

From 5c8436d4b3c119a00a98926106f894638f8c3cbb Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Date: Mon, 22 Jan 2024 23:01:58 +0900
Subject: [PATCH 599/707] nilfs2: localize highmem mapping for checkpoint
 creation within cpfile

In order to convert kmap() used in cpfile to kmap_local, first move the
checkpoint creation routine, which is one of the places where kmap is
used, to the cpfile side and make the page mapping local and temporary.
And use kmap_local instead of kmap to access the checkpoint entry page
(and header block page) when generating a checkpoint.

Link: https://lkml.kernel.org/r/20240122140202.6950-12-konishi.ryusuke@gmail.com
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/nilfs2/cpfile.c  | 74 +++++++++++++++++++++++++++++++++++++++++++++
 fs/nilfs2/cpfile.h  |  1 +
 fs/nilfs2/segment.c | 31 ++-----------------
 3 files changed, 77 insertions(+), 29 deletions(-)

diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 39136637f7155b..f62da80e530a73 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -272,6 +272,80 @@ int nilfs_cpfile_get_checkpoint(struct inode *cpfile,
 	return ret;
 }
 
+/**
+ * nilfs_cpfile_create_checkpoint - create a checkpoint entry on cpfile
+ * @cpfile: checkpoint file inode
+ * @cno:    number of checkpoint to set up
+ *
+ * This function creates a checkpoint with the number specified by @cno on
+ * cpfile.  If the specified checkpoint entry already exists due to a past
+ * failure, it will be reused without returning an error.
+ * In either case, the buffer of the block containing the checkpoint entry
+ * and the cpfile inode are made dirty for inclusion in the write log.
+ *
+ * Return: 0 on success, or the following negative error code on failure.
+ * * %-ENOMEM	- Insufficient memory available.
+ * * %-EIO	- I/O error (including metadata corruption).
+ * * %-EROFS	- Read only filesystem
+ */
+int nilfs_cpfile_create_checkpoint(struct inode *cpfile, __u64 cno)
+{
+	struct buffer_head *header_bh, *cp_bh;
+	struct nilfs_cpfile_header *header;
+	struct nilfs_checkpoint *cp;
+	void *kaddr;
+	int ret;
+
+	if (WARN_ON_ONCE(cno < 1))
+		return -EIO;
+
+	down_write(&NILFS_MDT(cpfile)->mi_sem);
+	ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
+	if (unlikely(ret < 0)) {
+		if (ret == -ENOENT) {
+			nilfs_error(cpfile->i_sb,
+				    "checkpoint creation failed due to metadata corruption.");
+			ret = -EIO;
+		}
+		goto out_sem;
+	}
+	ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 1, &cp_bh);
+	if (unlikely(ret < 0))
+		goto out_header;
+
+	kaddr = kmap_local_page(cp_bh->b_page);
+	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
+	if (nilfs_checkpoint_invalid(cp)) {
+		/* a newly-created checkpoint */
+		nilfs_checkpoint_clear_invalid(cp);
+		if (!nilfs_cpfile_is_in_first(cpfile, cno))
+			nilfs_cpfile_block_add_valid_checkpoints(cpfile, cp_bh,
+								 kaddr, 1);
+		kunmap_local(kaddr);
+
+		kaddr = kmap_local_page(header_bh->b_page);
+		header = nilfs_cpfile_block_get_header(cpfile, header_bh,
+						       kaddr);
+		le64_add_cpu(&header->ch_ncheckpoints, 1);
+		kunmap_local(kaddr);
+		mark_buffer_dirty(header_bh);
+	} else {
+		kunmap_local(kaddr);
+	}
+
+	/* Force the buffer and the inode to become dirty */
+	mark_buffer_dirty(cp_bh);
+	brelse(cp_bh);
+	nilfs_mdt_mark_dirty(cpfile);
+
+out_header:
+	brelse(header_bh);
+
+out_sem:
+	up_write(&NILFS_MDT(cpfile)->mi_sem);
+	return ret;
+}
+
 /**
  * nilfs_cpfile_put_checkpoint - put a checkpoint
  * @cpfile: inode of checkpoint file
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index edabb2dc57567c..fcb1a94097b3f9 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -19,6 +19,7 @@
 int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int,
 				struct nilfs_checkpoint **,
 				struct buffer_head **);
+int nilfs_cpfile_create_checkpoint(struct inode *cpfile, __u64 cno);
 void nilfs_cpfile_put_checkpoint(struct inode *, __u64, struct buffer_head *);
 int nilfs_cpfile_delete_checkpoints(struct inode *, __u64, __u64);
 int nilfs_cpfile_delete_checkpoint(struct inode *, __u64);
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 3657918328ea6f..37d06eacec6394 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -880,34 +880,6 @@ static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
 	nilfs_mdt_clear_dirty(nilfs->ns_dat);
 }
 
-static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
-{
-	struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
-	struct buffer_head *bh_cp;
-	struct nilfs_checkpoint *raw_cp;
-	int err;
-
-	/* XXX: this interface will be changed */
-	err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 1,
-					  &raw_cp, &bh_cp);
-	if (likely(!err)) {
-		/*
-		 * The following code is duplicated with cpfile.  But, it is
-		 * needed to collect the checkpoint even if it was not newly
-		 * created.
-		 */
-		mark_buffer_dirty(bh_cp);
-		nilfs_mdt_mark_dirty(nilfs->ns_cpfile);
-		nilfs_cpfile_put_checkpoint(
-			nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
-	} else if (err == -EINVAL || err == -ENOENT) {
-		nilfs_error(sci->sc_super,
-			    "checkpoint creation failed due to metadata corruption.");
-		err = -EIO;
-	}
-	return err;
-}
-
 static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
 {
 	struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
@@ -1261,7 +1233,8 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
 			break;
 		nilfs_sc_cstage_inc(sci);
 		/* Creating a checkpoint */
-		err = nilfs_segctor_create_checkpoint(sci);
+		err = nilfs_cpfile_create_checkpoint(nilfs->ns_cpfile,
+						     nilfs->ns_cno);
 		if (unlikely(err))
 			break;
 		fallthrough;

From f477587f6cedd74ebbd92f25c28a551893014cdd Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Date: Mon, 22 Jan 2024 23:01:59 +0900
Subject: [PATCH 600/707] nilfs2: localize highmem mapping for checkpoint
 finalization within cpfile

Move the checkpoint finalization routine to the cpfile side, and make the
page mapping local and temporary.  And use kmap_local instead of kmap to
access the checkpoint entry page when finalizing a checkpoint.

In this conversion, some of the information on the checkpoint entry being
rewritten is passed through the arguments of the newly added method
nilfs_cpfile_finalize_checkpoint().

Link: https://lkml.kernel.org/r/20240122140202.6950-13-konishi.ryusuke@gmail.com
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/nilfs2/cpfile.c  | 74 +++++++++++++++++++++++++++++++++++++++++++++
 fs/nilfs2/cpfile.h  |  3 ++
 fs/nilfs2/segment.c | 51 +++----------------------------
 3 files changed, 82 insertions(+), 46 deletions(-)

diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index f62da80e530a73..3af77252e08141 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -363,6 +363,80 @@ void nilfs_cpfile_put_checkpoint(struct inode *cpfile, __u64 cno,
 	brelse(bh);
 }
 
+/**
+ * nilfs_cpfile_finalize_checkpoint - fill in a checkpoint entry in cpfile
+ * @cpfile: checkpoint file inode
+ * @cno:    checkpoint number
+ * @root:   nilfs root object
+ * @blkinc: number of blocks added by this checkpoint
+ * @ctime:  checkpoint creation time
+ * @minor:  minor checkpoint flag
+ *
+ * This function completes the checkpoint entry numbered by @cno in the
+ * cpfile with the data given by the arguments @root, @blkinc, @ctime, and
+ * @minor.
+ *
+ * Return: 0 on success, or the following negative error code on failure.
+ * * %-ENOMEM	- Insufficient memory available.
+ * * %-EIO	- I/O error (including metadata corruption).
+ */
+int nilfs_cpfile_finalize_checkpoint(struct inode *cpfile, __u64 cno,
+				     struct nilfs_root *root, __u64 blkinc,
+				     time64_t ctime, bool minor)
+{
+	struct buffer_head *cp_bh;
+	struct nilfs_checkpoint *cp;
+	void *kaddr;
+	int ret;
+
+	if (WARN_ON_ONCE(cno < 1))
+		return -EIO;
+
+	down_write(&NILFS_MDT(cpfile)->mi_sem);
+	ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
+	if (unlikely(ret < 0)) {
+		if (ret == -ENOENT)
+			goto error;
+		goto out_sem;
+	}
+
+	kaddr = kmap_local_page(cp_bh->b_page);
+	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
+	if (unlikely(nilfs_checkpoint_invalid(cp))) {
+		kunmap_local(kaddr);
+		brelse(cp_bh);
+		goto error;
+	}
+
+	cp->cp_snapshot_list.ssl_next = 0;
+	cp->cp_snapshot_list.ssl_prev = 0;
+	cp->cp_inodes_count = cpu_to_le64(atomic64_read(&root->inodes_count));
+	cp->cp_blocks_count = cpu_to_le64(atomic64_read(&root->blocks_count));
+	cp->cp_nblk_inc = cpu_to_le64(blkinc);
+	cp->cp_create = cpu_to_le64(ctime);
+	cp->cp_cno = cpu_to_le64(cno);
+
+	if (minor)
+		nilfs_checkpoint_set_minor(cp);
+	else
+		nilfs_checkpoint_clear_minor(cp);
+
+	nilfs_write_inode_common(root->ifile, &cp->cp_ifile_inode);
+	nilfs_bmap_write(NILFS_I(root->ifile)->i_bmap, &cp->cp_ifile_inode);
+
+	kunmap_local(kaddr);
+	brelse(cp_bh);
+out_sem:
+	up_write(&NILFS_MDT(cpfile)->mi_sem);
+	return ret;
+
+error:
+	nilfs_error(cpfile->i_sb,
+		    "checkpoint finalization failed due to metadata corruption.");
+	ret = -EIO;
+	goto out_sem;
+}
+
 /**
  * nilfs_cpfile_delete_checkpoints - delete checkpoints
  * @cpfile: inode of checkpoint file
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index fcb1a94097b3f9..aa1408a3af010e 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -21,6 +21,9 @@ int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int,
 				struct buffer_head **);
 int nilfs_cpfile_create_checkpoint(struct inode *cpfile, __u64 cno);
 void nilfs_cpfile_put_checkpoint(struct inode *, __u64, struct buffer_head *);
+int nilfs_cpfile_finalize_checkpoint(struct inode *cpfile, __u64 cno,
+				     struct nilfs_root *root, __u64 blkinc,
+				     time64_t ctime, bool minor);
 int nilfs_cpfile_delete_checkpoints(struct inode *, __u64, __u64);
 int nilfs_cpfile_delete_checkpoint(struct inode *, __u64);
 int nilfs_cpfile_change_cpmode(struct inode *, __u64, int);
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 37d06eacec6394..ecf778f44bbc85 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -880,51 +880,6 @@ static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
 	nilfs_mdt_clear_dirty(nilfs->ns_dat);
 }
 
-static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
-{
-	struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
-	struct buffer_head *bh_cp;
-	struct nilfs_checkpoint *raw_cp;
-	struct inode *ifile;
-	int err;
-
-	err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 0,
-					  &raw_cp, &bh_cp);
-	if (unlikely(err)) {
-		if (err == -EINVAL || err == -ENOENT) {
-			nilfs_error(sci->sc_super,
-				    "checkpoint finalization failed due to metadata corruption.");
-			err = -EIO;
-		}
-		goto failed_ibh;
-	}
-	raw_cp->cp_snapshot_list.ssl_next = 0;
-	raw_cp->cp_snapshot_list.ssl_prev = 0;
-	raw_cp->cp_inodes_count =
-		cpu_to_le64(atomic64_read(&sci->sc_root->inodes_count));
-	raw_cp->cp_blocks_count =
-		cpu_to_le64(atomic64_read(&sci->sc_root->blocks_count));
-	raw_cp->cp_nblk_inc =
-		cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc);
-	raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime);
-	raw_cp->cp_cno = cpu_to_le64(nilfs->ns_cno);
-
-	if (test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags))
-		nilfs_checkpoint_clear_minor(raw_cp);
-	else
-		nilfs_checkpoint_set_minor(raw_cp);
-
-	ifile = sci->sc_root->ifile;
-	nilfs_write_inode_common(ifile, &raw_cp->cp_ifile_inode);
-	nilfs_bmap_write(NILFS_I(ifile)->i_bmap, &raw_cp->cp_ifile_inode);
-
-	nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
-	return 0;
-
- failed_ibh:
-	return err;
-}
-
 static void nilfs_fill_in_file_bmap(struct inode *ifile,
 				    struct nilfs_inode_info *ii)
 
@@ -2103,7 +2058,11 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 
 		if (mode == SC_LSEG_SR &&
 		    nilfs_sc_cstage_get(sci) >= NILFS_ST_CPFILE) {
-			err = nilfs_segctor_fill_in_checkpoint(sci);
+			err = nilfs_cpfile_finalize_checkpoint(
+				nilfs->ns_cpfile, nilfs->ns_cno, sci->sc_root,
+				sci->sc_nblk_inc + sci->sc_nblk_this_inc,
+				sci->sc_seg_ctime,
+				!test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags));
 			if (unlikely(err))
 				goto failed_to_write;
 

From 5fb49887991cdfb5d8284c97cc27a4ac9d8722da Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Date: Mon, 22 Jan 2024 23:02:00 +0900
Subject: [PATCH 601/707] nilfs2: localize highmem mapping for checkpoint
 reading within cpfile

Move the code for reading from a checkpoint entry that is performed in
nilfs_attach_checkpoint() to the cpfile side, and make the page mapping
local and temporary.  And use kmap_local instead of kmap to access the
checkpoint entry page.

In order to load the ifile inode information included in the checkpoint
entry within the inode lock section of nilfs_ifile_read(), the newly added
checkpoint reading method nilfs_cpfile_read_checkpoint() is called
indirectly via nilfs_ifile_read() instead of from
nilfs_attach_checkpoint().

Link: https://lkml.kernel.org/r/20240122140202.6950-14-konishi.ryusuke@gmail.com
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/nilfs2/cpfile.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/nilfs2/cpfile.h |  2 ++
 fs/nilfs2/ifile.c  | 17 ++++++++----
 fs/nilfs2/ifile.h  |  3 +-
 fs/nilfs2/super.c  | 31 ++++-----------------
 5 files changed, 87 insertions(+), 34 deletions(-)

diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 3af77252e08141..56e38843536b96 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -186,6 +186,74 @@ static inline int nilfs_cpfile_delete_checkpoint_block(struct inode *cpfile,
 				      nilfs_cpfile_get_blkoff(cpfile, cno));
 }
 
+/**
+ * nilfs_cpfile_read_checkpoint - read a checkpoint entry in cpfile
+ * @cpfile: checkpoint file inode
+ * @cno:    number of checkpoint entry to read
+ * @root:   nilfs root object
+ * @ifile:  ifile's inode to read and attach to @root
+ *
+ * This function imports checkpoint information from the checkpoint file and
+ * stores it to the inode file given by @ifile and the nilfs root object
+ * given by @root.
+ *
+ * Return: 0 on success, or the following negative error code on failure.
+ * * %-EINVAL	- Invalid checkpoint.
+ * * %-ENOMEM	- Insufficient memory available.
+ * * %-EIO	- I/O error (including metadata corruption).
+ */
+int nilfs_cpfile_read_checkpoint(struct inode *cpfile, __u64 cno,
+				 struct nilfs_root *root, struct inode *ifile)
+{
+	struct buffer_head *cp_bh;
+	struct nilfs_checkpoint *cp;
+	void *kaddr;
+	int ret;
+
+	if (cno < 1 || cno > nilfs_mdt_cno(cpfile))
+		return -EINVAL;
+
+	down_read(&NILFS_MDT(cpfile)->mi_sem);
+	ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
+	if (unlikely(ret < 0)) {
+		if (ret == -ENOENT)
+			ret = -EINVAL;
+		goto out_sem;
+	}
+
+	kaddr = kmap_local_page(cp_bh->b_page);
+	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
+	if (nilfs_checkpoint_invalid(cp)) {
+		ret = -EINVAL;
+		goto put_cp;
+	}
+
+	ret = nilfs_read_inode_common(ifile, &cp->cp_ifile_inode);
+	if (unlikely(ret)) {
+		/*
+		 * Since this inode is on a checkpoint entry, treat errors
+		 * as metadata corruption.
+		 */
+		nilfs_err(cpfile->i_sb,
+			  "ifile inode (checkpoint number=%llu) corrupted",
+			  (unsigned long long)cno);
+		ret = -EIO;
+		goto put_cp;
+	}
+
+	/* Configure the nilfs root object */
+	atomic64_set(&root->inodes_count, le64_to_cpu(cp->cp_inodes_count));
+	atomic64_set(&root->blocks_count, le64_to_cpu(cp->cp_blocks_count));
+	root->ifile = ifile;
+
+put_cp:
+	kunmap_local(kaddr);
+	brelse(cp_bh);
+out_sem:
+	up_read(&NILFS_MDT(cpfile)->mi_sem);
+	return ret;
+}
+
 /**
  * nilfs_cpfile_get_checkpoint - get a checkpoint
  * @cpfile: inode of checkpoint file
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index aa1408a3af010e..2cfa14011bc832 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -19,6 +19,8 @@
 int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int,
 				struct nilfs_checkpoint **,
 				struct buffer_head **);
+int nilfs_cpfile_read_checkpoint(struct inode *cpfile, __u64 cno,
+				 struct nilfs_root *root, struct inode *ifile);
 int nilfs_cpfile_create_checkpoint(struct inode *cpfile, __u64 cno);
 void nilfs_cpfile_put_checkpoint(struct inode *, __u64, struct buffer_head *);
 int nilfs_cpfile_finalize_checkpoint(struct inode *cpfile, __u64 cno,
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index e9538fa46ff27d..612e609158b520 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -15,6 +15,7 @@
 #include "mdt.h"
 #include "alloc.h"
 #include "ifile.h"
+#include "cpfile.h"
 
 /**
  * struct nilfs_ifile_info - on-memory private data of ifile
@@ -173,14 +174,18 @@ int nilfs_ifile_count_free_inodes(struct inode *ifile,
  * nilfs_ifile_read - read or get ifile inode
  * @sb: super block instance
  * @root: root object
+ * @cno: number of checkpoint entry to read
  * @inode_size: size of an inode
- * @raw_inode: on-disk ifile inode
- * @inodep: buffer to store the inode
+ *
+ * Return: 0 on success, or the following negative error code on failure.
+ * * %-EINVAL	- Invalid checkpoint.
+ * * %-ENOMEM	- Insufficient memory available.
+ * * %-EIO	- I/O error (including metadata corruption).
  */
 int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
-		     size_t inode_size, struct nilfs_inode *raw_inode,
-		     struct inode **inodep)
+		     __u64 cno, size_t inode_size)
 {
+	struct the_nilfs *nilfs;
 	struct inode *ifile;
 	int err;
 
@@ -201,13 +206,13 @@ int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
 
 	nilfs_palloc_setup_cache(ifile, &NILFS_IFILE_I(ifile)->palloc_cache);
 
-	err = nilfs_read_inode_common(ifile, raw_inode);
+	nilfs = sb->s_fs_info;
+	err = nilfs_cpfile_read_checkpoint(nilfs->ns_cpfile, cno, root, ifile);
 	if (err)
 		goto failed;
 
 	unlock_new_inode(ifile);
  out:
-	*inodep = ifile;
 	return 0;
  failed:
 	iget_failed(ifile);
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index b71ab0a81dc45e..625545cc2a989f 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -38,7 +38,6 @@ int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **);
 int nilfs_ifile_count_free_inodes(struct inode *, u64 *, u64 *);
 
 int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
-		     size_t inode_size, struct nilfs_inode *raw_inode,
-		     struct inode **inodep);
+		     __u64 cno, size_t inode_size);
 
 #endif	/* _NILFS_IFILE_H */
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index df8674173b2202..5e630c179a1e29 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -544,8 +544,6 @@ int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
 {
 	struct the_nilfs *nilfs = sb->s_fs_info;
 	struct nilfs_root *root;
-	struct nilfs_checkpoint *raw_cp;
-	struct buffer_head *bh_cp;
 	int err = -ENOMEM;
 
 	root = nilfs_find_or_create_root(
@@ -557,38 +555,19 @@ int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
 		goto reuse; /* already attached checkpoint */
 
 	down_read(&nilfs->ns_segctor_sem);
-	err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
-					  &bh_cp);
+	err = nilfs_ifile_read(sb, root, cno, nilfs->ns_inode_size);
 	up_read(&nilfs->ns_segctor_sem);
-	if (unlikely(err)) {
-		if (err == -ENOENT || err == -EINVAL) {
-			nilfs_err(sb,
-				  "Invalid checkpoint (checkpoint number=%llu)",
-				  (unsigned long long)cno);
-			err = -EINVAL;
-		}
+	if (unlikely(err))
 		goto failed;
-	}
-
-	err = nilfs_ifile_read(sb, root, nilfs->ns_inode_size,
-			       &raw_cp->cp_ifile_inode, &root->ifile);
-	if (err)
-		goto failed_bh;
-
-	atomic64_set(&root->inodes_count,
-			le64_to_cpu(raw_cp->cp_inodes_count));
-	atomic64_set(&root->blocks_count,
-			le64_to_cpu(raw_cp->cp_blocks_count));
-
-	nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
 
  reuse:
 	*rootp = root;
 	return 0;
 
- failed_bh:
-	nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
  failed:
+	if (err == -EINVAL)
+		nilfs_err(sb, "Invalid checkpoint (checkpoint number=%llu)",
+			  (unsigned long long)cno);
 	nilfs_put_root(root);
 
 	return err;

From 71488bdc373d2884b4323c7427ebc316b4eaab6a Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Date: Mon, 22 Jan 2024 23:02:01 +0900
Subject: [PATCH 602/707] nilfs2: remove nilfs_cpfile_{get,put}_checkpoint()

All calls to nilfs_cpfile_get_checkpoint() and
nilfs_cpfile_put_checkpoint() that call kmap() and kunmap() separately are
now gone, so remove these methods.

Link: https://lkml.kernel.org/r/20240122140202.6950-15-konishi.ryusuke@gmail.com
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/nilfs2/cpfile.c | 103 ---------------------------------------------
 fs/nilfs2/cpfile.h |   4 --
 2 files changed, 107 deletions(-)

diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 56e38843536b96..b5bad332d630c5 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -254,92 +254,6 @@ int nilfs_cpfile_read_checkpoint(struct inode *cpfile, __u64 cno,
 	return ret;
 }
 
-/**
- * nilfs_cpfile_get_checkpoint - get a checkpoint
- * @cpfile: inode of checkpoint file
- * @cno: checkpoint number
- * @create: create flag
- * @cpp: pointer to a checkpoint
- * @bhp: pointer to a buffer head
- *
- * Description: nilfs_cpfile_get_checkpoint() acquires the checkpoint
- * specified by @cno. A new checkpoint will be created if @cno is the current
- * checkpoint number and @create is nonzero.
- *
- * Return Value: On success, 0 is returned, and the checkpoint and the
- * buffer head of the buffer on which the checkpoint is located are stored in
- * the place pointed by @cpp and @bhp, respectively. On error, one of the
- * following negative error codes is returned.
- *
- * %-EIO - I/O error.
- *
- * %-ENOMEM - Insufficient amount of memory available.
- *
- * %-ENOENT - No such checkpoint.
- *
- * %-EINVAL - invalid checkpoint.
- */
-int nilfs_cpfile_get_checkpoint(struct inode *cpfile,
-				__u64 cno,
-				int create,
-				struct nilfs_checkpoint **cpp,
-				struct buffer_head **bhp)
-{
-	struct buffer_head *header_bh, *cp_bh;
-	struct nilfs_cpfile_header *header;
-	struct nilfs_checkpoint *cp;
-	void *kaddr;
-	int ret;
-
-	if (unlikely(cno < 1 || cno > nilfs_mdt_cno(cpfile) ||
-		     (cno < nilfs_mdt_cno(cpfile) && create)))
-		return -EINVAL;
-
-	down_write(&NILFS_MDT(cpfile)->mi_sem);
-
-	ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
-	if (ret < 0)
-		goto out_sem;
-	ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, create, &cp_bh);
-	if (ret < 0)
-		goto out_header;
-	kaddr = kmap(cp_bh->b_page);
-	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
-	if (nilfs_checkpoint_invalid(cp)) {
-		if (!create) {
-			kunmap(cp_bh->b_page);
-			brelse(cp_bh);
-			ret = -ENOENT;
-			goto out_header;
-		}
-		/* a newly-created checkpoint */
-		nilfs_checkpoint_clear_invalid(cp);
-		if (!nilfs_cpfile_is_in_first(cpfile, cno))
-			nilfs_cpfile_block_add_valid_checkpoints(cpfile, cp_bh,
-								 kaddr, 1);
-		mark_buffer_dirty(cp_bh);
-
-		kaddr = kmap_atomic(header_bh->b_page);
-		header = nilfs_cpfile_block_get_header(cpfile, header_bh,
-						       kaddr);
-		le64_add_cpu(&header->ch_ncheckpoints, 1);
-		kunmap_atomic(kaddr);
-		mark_buffer_dirty(header_bh);
-		nilfs_mdt_mark_dirty(cpfile);
-	}
-
-	if (cpp != NULL)
-		*cpp = cp;
-	*bhp = cp_bh;
-
- out_header:
-	brelse(header_bh);
-
- out_sem:
-	up_write(&NILFS_MDT(cpfile)->mi_sem);
-	return ret;
-}
-
 /**
  * nilfs_cpfile_create_checkpoint - create a checkpoint entry on cpfile
  * @cpfile: checkpoint file inode
@@ -414,23 +328,6 @@ int nilfs_cpfile_create_checkpoint(struct inode *cpfile, __u64 cno)
 	return ret;
 }
 
-/**
- * nilfs_cpfile_put_checkpoint - put a checkpoint
- * @cpfile: inode of checkpoint file
- * @cno: checkpoint number
- * @bh: buffer head
- *
- * Description: nilfs_cpfile_put_checkpoint() releases the checkpoint
- * specified by @cno. @bh must be the buffer head which has been returned by
- * a previous call to nilfs_cpfile_get_checkpoint() with @cno.
- */
-void nilfs_cpfile_put_checkpoint(struct inode *cpfile, __u64 cno,
-				 struct buffer_head *bh)
-{
-	kunmap(bh->b_page);
-	brelse(bh);
-}
-
 /**
  * nilfs_cpfile_finalize_checkpoint - fill in a checkpoint entry in cpfile
  * @cpfile: checkpoint file inode
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index 2cfa14011bc832..f5b1d59289ebf8 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -16,13 +16,9 @@
 #include <linux/nilfs2_ondisk.h>	/* nilfs_inode, nilfs_checkpoint */
 
 
-int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int,
-				struct nilfs_checkpoint **,
-				struct buffer_head **);
 int nilfs_cpfile_read_checkpoint(struct inode *cpfile, __u64 cno,
 				 struct nilfs_root *root, struct inode *ifile);
 int nilfs_cpfile_create_checkpoint(struct inode *cpfile, __u64 cno);
-void nilfs_cpfile_put_checkpoint(struct inode *, __u64, struct buffer_head *);
 int nilfs_cpfile_finalize_checkpoint(struct inode *cpfile, __u64 cno,
 				     struct nilfs_root *root, __u64 blkinc,
 				     time64_t ctime, bool minor);

From ab52d122692776d5e0a29997f7d9e2d0b6d9f509 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Date: Mon, 22 Jan 2024 23:02:02 +0900
Subject: [PATCH 603/707] nilfs2: convert cpfile to use kmap_local

Convert all remaining usages of kmap_atomic in cpfile to kmap_local.

Link: https://lkml.kernel.org/r/20240122140202.6950-16-konishi.ryusuke@gmail.com
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/nilfs2/cpfile.c | 90 +++++++++++++++++++++++-----------------------
 1 file changed, 45 insertions(+), 45 deletions(-)

diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index b5bad332d630c5..2c57132584de74 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -460,7 +460,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
 			continue;
 		}
 
-		kaddr = kmap_atomic(cp_bh->b_page);
+		kaddr = kmap_local_page(cp_bh->b_page);
 		cp = nilfs_cpfile_block_get_checkpoint(
 			cpfile, cno, cp_bh, kaddr);
 		nicps = 0;
@@ -482,7 +482,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
 						cpfile, cp_bh, kaddr, nicps);
 				if (count == 0) {
 					/* make hole */
-					kunmap_atomic(kaddr);
+					kunmap_local(kaddr);
 					brelse(cp_bh);
 					ret =
 					  nilfs_cpfile_delete_checkpoint_block(
@@ -497,18 +497,18 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
 			}
 		}
 
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		brelse(cp_bh);
 	}
 
 	if (tnicps > 0) {
-		kaddr = kmap_atomic(header_bh->b_page);
+		kaddr = kmap_local_page(header_bh->b_page);
 		header = nilfs_cpfile_block_get_header(cpfile, header_bh,
 						       kaddr);
 		le64_add_cpu(&header->ch_ncheckpoints, -(u64)tnicps);
 		mark_buffer_dirty(header_bh);
 		nilfs_mdt_mark_dirty(cpfile);
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 	}
 
 	brelse(header_bh);
@@ -560,7 +560,7 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
 		}
 		ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno);
 
-		kaddr = kmap_atomic(bh->b_page);
+		kaddr = kmap_local_page(bh->b_page);
 		cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
 		for (i = 0; i < ncps && n < nci; i++, cp = (void *)cp + cpsz) {
 			if (!nilfs_checkpoint_invalid(cp)) {
@@ -570,7 +570,7 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
 				n++;
 			}
 		}
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		brelse(bh);
 	}
 
@@ -604,10 +604,10 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
 		ret = nilfs_cpfile_get_header_block(cpfile, &bh);
 		if (ret < 0)
 			goto out;
-		kaddr = kmap_atomic(bh->b_page);
+		kaddr = kmap_local_page(bh->b_page);
 		header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);
 		curr = le64_to_cpu(header->ch_snapshot_list.ssl_next);
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		brelse(bh);
 		if (curr == 0) {
 			ret = 0;
@@ -625,7 +625,7 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
 			ret = 0; /* No snapshots (started from a hole block) */
 		goto out;
 	}
-	kaddr = kmap_atomic(bh->b_page);
+	kaddr = kmap_local_page(bh->b_page);
 	while (n < nci) {
 		cp = nilfs_cpfile_block_get_checkpoint(cpfile, curr, bh, kaddr);
 		curr = ~(__u64)0; /* Terminator */
@@ -641,7 +641,7 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
 
 		next_blkoff = nilfs_cpfile_get_blkoff(cpfile, next);
 		if (curr_blkoff != next_blkoff) {
-			kunmap_atomic(kaddr);
+			kunmap_local(kaddr);
 			brelse(bh);
 			ret = nilfs_cpfile_get_checkpoint_block(cpfile, next,
 								0, &bh);
@@ -649,12 +649,12 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
 				WARN_ON(ret == -ENOENT);
 				goto out;
 			}
-			kaddr = kmap_atomic(bh->b_page);
+			kaddr = kmap_local_page(bh->b_page);
 		}
 		curr = next;
 		curr_blkoff = next_blkoff;
 	}
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 	brelse(bh);
 	*cnop = curr;
 	ret = n;
@@ -763,24 +763,24 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno)
 	ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
 	if (ret < 0)
 		goto out_sem;
-	kaddr = kmap_atomic(cp_bh->b_page);
+	kaddr = kmap_local_page(cp_bh->b_page);
 	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
 	if (nilfs_checkpoint_invalid(cp)) {
 		ret = -ENOENT;
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		goto out_cp;
 	}
 	if (nilfs_checkpoint_snapshot(cp)) {
 		ret = 0;
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		goto out_cp;
 	}
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
 	if (ret < 0)
 		goto out_cp;
-	kaddr = kmap_atomic(header_bh->b_page);
+	kaddr = kmap_local_page(header_bh->b_page);
 	header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
 	list = &header->ch_snapshot_list;
 	curr_bh = header_bh;
@@ -792,13 +792,13 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno)
 		prev_blkoff = nilfs_cpfile_get_blkoff(cpfile, prev);
 		curr = prev;
 		if (curr_blkoff != prev_blkoff) {
-			kunmap_atomic(kaddr);
+			kunmap_local(kaddr);
 			brelse(curr_bh);
 			ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr,
 								0, &curr_bh);
 			if (ret < 0)
 				goto out_header;
-			kaddr = kmap_atomic(curr_bh->b_page);
+			kaddr = kmap_local_page(curr_bh->b_page);
 		}
 		curr_blkoff = prev_blkoff;
 		cp = nilfs_cpfile_block_get_checkpoint(
@@ -806,7 +806,7 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno)
 		list = &cp->cp_snapshot_list;
 		prev = le64_to_cpu(list->ssl_prev);
 	}
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	if (prev != 0) {
 		ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0,
@@ -818,29 +818,29 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno)
 		get_bh(prev_bh);
 	}
 
-	kaddr = kmap_atomic(curr_bh->b_page);
+	kaddr = kmap_local_page(curr_bh->b_page);
 	list = nilfs_cpfile_block_get_snapshot_list(
 		cpfile, curr, curr_bh, kaddr);
 	list->ssl_prev = cpu_to_le64(cno);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
-	kaddr = kmap_atomic(cp_bh->b_page);
+	kaddr = kmap_local_page(cp_bh->b_page);
 	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
 	cp->cp_snapshot_list.ssl_next = cpu_to_le64(curr);
 	cp->cp_snapshot_list.ssl_prev = cpu_to_le64(prev);
 	nilfs_checkpoint_set_snapshot(cp);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
-	kaddr = kmap_atomic(prev_bh->b_page);
+	kaddr = kmap_local_page(prev_bh->b_page);
 	list = nilfs_cpfile_block_get_snapshot_list(
 		cpfile, prev, prev_bh, kaddr);
 	list->ssl_next = cpu_to_le64(cno);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
-	kaddr = kmap_atomic(header_bh->b_page);
+	kaddr = kmap_local_page(header_bh->b_page);
 	header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
 	le64_add_cpu(&header->ch_nsnapshots, 1);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	mark_buffer_dirty(prev_bh);
 	mark_buffer_dirty(curr_bh);
@@ -881,23 +881,23 @@ static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno)
 	ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
 	if (ret < 0)
 		goto out_sem;
-	kaddr = kmap_atomic(cp_bh->b_page);
+	kaddr = kmap_local_page(cp_bh->b_page);
 	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
 	if (nilfs_checkpoint_invalid(cp)) {
 		ret = -ENOENT;
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		goto out_cp;
 	}
 	if (!nilfs_checkpoint_snapshot(cp)) {
 		ret = 0;
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		goto out_cp;
 	}
 
 	list = &cp->cp_snapshot_list;
 	next = le64_to_cpu(list->ssl_next);
 	prev = le64_to_cpu(list->ssl_prev);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
 	if (ret < 0)
@@ -921,29 +921,29 @@ static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno)
 		get_bh(prev_bh);
 	}
 
-	kaddr = kmap_atomic(next_bh->b_page);
+	kaddr = kmap_local_page(next_bh->b_page);
 	list = nilfs_cpfile_block_get_snapshot_list(
 		cpfile, next, next_bh, kaddr);
 	list->ssl_prev = cpu_to_le64(prev);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
-	kaddr = kmap_atomic(prev_bh->b_page);
+	kaddr = kmap_local_page(prev_bh->b_page);
 	list = nilfs_cpfile_block_get_snapshot_list(
 		cpfile, prev, prev_bh, kaddr);
 	list->ssl_next = cpu_to_le64(next);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
-	kaddr = kmap_atomic(cp_bh->b_page);
+	kaddr = kmap_local_page(cp_bh->b_page);
 	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
 	cp->cp_snapshot_list.ssl_next = cpu_to_le64(0);
 	cp->cp_snapshot_list.ssl_prev = cpu_to_le64(0);
 	nilfs_checkpoint_clear_snapshot(cp);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
-	kaddr = kmap_atomic(header_bh->b_page);
+	kaddr = kmap_local_page(header_bh->b_page);
 	header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
 	le64_add_cpu(&header->ch_nsnapshots, -1);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	mark_buffer_dirty(next_bh);
 	mark_buffer_dirty(prev_bh);
@@ -1002,13 +1002,13 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
 	ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh);
 	if (ret < 0)
 		goto out;
-	kaddr = kmap_atomic(bh->b_page);
+	kaddr = kmap_local_page(bh->b_page);
 	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
 	if (nilfs_checkpoint_invalid(cp))
 		ret = -ENOENT;
 	else
 		ret = nilfs_checkpoint_snapshot(cp);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 	brelse(bh);
 
  out:
@@ -1085,12 +1085,12 @@ int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat)
 	ret = nilfs_cpfile_get_header_block(cpfile, &bh);
 	if (ret < 0)
 		goto out_sem;
-	kaddr = kmap_atomic(bh->b_page);
+	kaddr = kmap_local_page(bh->b_page);
 	header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);
 	cpstat->cs_cno = nilfs_mdt_cno(cpfile);
 	cpstat->cs_ncps = le64_to_cpu(header->ch_ncheckpoints);
 	cpstat->cs_nsss = le64_to_cpu(header->ch_nsnapshots);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 	brelse(bh);
 
  out_sem:

From 834ec1d23052b3cc69ed409793e4d370fcda0ebb Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan@kernel.org>
Date: Thu, 25 Jan 2024 15:55:07 -0700
Subject: [PATCH 604/707] kbuild: raise the minimum supported version of LLVM
 to 13.0.1

Patch series "Bump the minimum supported version of LLVM to 13.0.1".

This series bumps the minimum supported version of LLVM for building the
kernel to 13.0.1.  The first patch does the bump and all subsequent
patches clean up all the various workarounds and checks for earlier
versions.

Quoting the first patch's commit message for those that were only on CC
for the clean ups:

  When __builtin_mul_overflow() has arguments that differ in terms of
  signedness and width, LLVM may generate a libcall to __muloti4 because
  it performs the checks in terms of 65-bit multiplication. This issue
  becomes harder to hit (but still possible) after LLVM 12.0.0, which
  includes a special case for matching widths but different signs.

  To gain access to this special case, which the kernel can take advantage
  of when calls to __muloti4 appear, bump the minimum supported version of
  LLVM for building the kernel to 13.0.1. 13.0.1 was chosen because there
  is minimal impact to distribution support while allowing a few more
  workarounds to be dropped in the kernel source than if 12.0.0 were
  chosen. Looking at container images of up to date distribution versions:

    archlinux:latest              clang version 16.0.6
    debian:oldoldstable-slim      clang version 7.0.1-8+deb10u2 (tags/RELEASE_701/final)
    debian:oldstable-slim         Debian clang version 11.0.1-2
    debian:stable-slim            Debian clang version 14.0.6
    debian:testing-slim           Debian clang version 16.0.6 (19)
    debian:unstable-slim          Debian clang version 16.0.6 (19)
    fedora:38                     clang version 16.0.6 (Fedora 16.0.6-3.fc38)
    fedora:latest                 clang version 17.0.6 (Fedora 17.0.6-1.fc39)
    fedora:rawhide                clang version 17.0.6 (Fedora 17.0.6-1.fc40)
    opensuse/leap:latest          clang version 15.0.7
    opensuse/tumbleweed:latest    clang version 17.0.6
    ubuntu:focal                  clang version 10.0.0-4ubuntu1
    ubuntu:latest                 Ubuntu clang version 14.0.0-1ubuntu1.1
    ubuntu:rolling                Ubuntu clang version 16.0.6 (15)
    ubuntu:devel                  Ubuntu clang version 17.0.6 (3)

  The only distribution that gets left behind is Debian Bullseye, as the
  default version is 11.0.1; other distributions either have a newer
  version than 13.0.1 or one older than the current minimum of 11.0.0.
  Debian has easy access to more recent LLVM versions through
  apt.llvm.org, so this is not as much of a concern. There are also the
  kernel.org LLVM toolchains, which should work with distributions with
  glibc 2.28 and newer.

  Another benefit of slimming up the number of supported versions of LLVM
  for building the kernel is reducing the build capacity needed to support
  a matrix that builds with each supported version, which allows a matrix
  to reallocate the freed up build capacity towards something else, such
  as more configuration combinations.

This passes my build matrix with all supported versions.

This is based on Andrew's mm-nonmm-unstable to avoid trivial conflicts
with my series to update the LLVM links across the repository [1] but I
can easily rebase it to linux-kbuild if Masahiro would rather these
patches go through there (and defer the conflict resolution to the merge
window).

[1]: https://lore.kernel.org/20240109-update-llvm-links-v1-0-eb09b59db071@kernel.org/


This patch (of 11):

When __builtin_mul_overflow() has arguments that differ in terms of
signedness and width, LLVM may generate a libcall to __muloti4 because it
performs the checks in terms of 65-bit multiplication.  This issue becomes
harder to hit (but still possible) after LLVM 12.0.0, which includes a
special case for matching widths but different signs.

To gain access to this special case, which the kernel can take advantage
of when calls to __muloti4 appear, bump the minimum supported version of
LLVM for building the kernel to 13.0.1.  13.0.1 was chosen because there
is minimal impact to distribution support while allowing a few more
workarounds to be dropped in the kernel source than if 12.0.0 were chosen.
Looking at container images of up to date distribution versions:

  archlinux:latest              clang version 16.0.6
  debian:oldoldstable-slim      clang version 7.0.1-8+deb10u2 (tags/RELEASE_701/final)
  debian:oldstable-slim         Debian clang version 11.0.1-2
  debian:stable-slim            Debian clang version 14.0.6
  debian:testing-slim           Debian clang version 16.0.6 (19)
  debian:unstable-slim          Debian clang version 16.0.6 (19)
  fedora:38                     clang version 16.0.6 (Fedora 16.0.6-3.fc38)
  fedora:latest                 clang version 17.0.6 (Fedora 17.0.6-1.fc39)
  fedora:rawhide                clang version 17.0.6 (Fedora 17.0.6-1.fc40)
  opensuse/leap:latest          clang version 15.0.7
  opensuse/tumbleweed:latest    clang version 17.0.6
  ubuntu:focal                  clang version 10.0.0-4ubuntu1
  ubuntu:latest                 Ubuntu clang version 14.0.0-1ubuntu1.1
  ubuntu:rolling                Ubuntu clang version 16.0.6 (15)
  ubuntu:devel                  Ubuntu clang version 17.0.6 (3)

The only distribution that gets left behind is Debian Bullseye, as the
default version is 11.0.1; other distributions either have a newer version
than 13.0.1 or one older than the current minimum of 11.0.0.  Debian has
easy access to more recent LLVM versions through apt.llvm.org, so this is
not as much of a concern.  There are also the kernel.org LLVM toolchains,
which should work with distributions with glibc 2.28 and newer.

Another benefit of slimming up the number of supported versions of LLVM
for building the kernel is reducing the build capacity needed to support a
matrix that builds with each supported version, which allows a matrix to
reallocate the freed up build capacity towards something else, such as
more configuration combinations.

Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-0-f5ff9bda41c5@kernel.org
Closes: https://github.com/ClangBuiltLinux/linux/issues/1975
Link: https://github.com/llvm/llvm-project/issues/38013
Link: https://github.com/llvm/llvm-project/commit/3203143f1356a4e4e3ada231156fc6da6e1a9f9d
Link: https://mirrors.edge.kernel.org/pub/tools/llvm/
Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-1-f5ff9bda41c5@kernel.org
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: "Aneesh Kumar K.V (IBM)" <aneesh.kumar@kernel.org>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Conor Dooley <conor@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Nicolas Schier <nicolas@fjasle.eu>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/process/changes.rst | 2 +-
 scripts/min-tool-version.sh       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst
index 50b3d1cb11159b..d7306b8cad1378 100644
--- a/Documentation/process/changes.rst
+++ b/Documentation/process/changes.rst
@@ -30,7 +30,7 @@ you probably needn't concern yourself with pcmciautils.
         Program        Minimal version       Command to check the version
 ====================== ===============  ========================================
 GNU C                  5.1              gcc --version
-Clang/LLVM (optional)  11.0.0           clang --version
+Clang/LLVM (optional)  13.0.1           clang --version
 Rust (optional)        1.74.1           rustc --version
 bindgen (optional)     0.65.1           bindgen --version
 GNU make               3.82             make --version
diff --git a/scripts/min-tool-version.sh b/scripts/min-tool-version.sh
index 9faa4d3d91e358..5d17022ee1f6f8 100755
--- a/scripts/min-tool-version.sh
+++ b/scripts/min-tool-version.sh
@@ -29,7 +29,7 @@ llvm)
 	elif [ "$SRCARCH" = loongarch ]; then
 		echo 18.0.0
 	else
-		echo 11.0.0
+		echo 13.0.1
 	fi
 	;;
 rustc)

From 8f20632b4f573e5063fec43000119ed47ac76d81 Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan@kernel.org>
Date: Thu, 25 Jan 2024 15:55:08 -0700
Subject: [PATCH 605/707] Makefile: drop warn-stack-size plugin opt

Now that the minimum supported version of LLVM for building the kernel has
been bumped to 13.0.1, the inner ifeq statement is always false, as the
build will fail during the configuration stage for older LLVM versions.

This effectively reverts commit 24845dcb170e ("Makefile: LTO: have linker
check -Wframe-larger-than") and its follow up fix, commit 0236526d76b8
("Makefile: lto: Pass -warn-stack-size only on LLD < 13.0.0").

Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-2-f5ff9bda41c5@kernel.org
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: "Aneesh Kumar K.V (IBM)" <aneesh.kumar@kernel.org>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Conor Dooley <conor@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Nicolas Schier <nicolas@fjasle.eu>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Makefile | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/Makefile b/Makefile
index 9869f57c3fb3e6..885b2940e20da2 100644
--- a/Makefile
+++ b/Makefile
@@ -951,14 +951,6 @@ CC_FLAGS_LTO	+= -fvisibility=hidden
 
 # Limit inlining across translation units to reduce binary size
 KBUILD_LDFLAGS += -mllvm -import-instr-limit=5
-
-# Check for frame size exceeding threshold during prolog/epilog insertion
-# when using lld < 13.0.0.
-ifneq ($(CONFIG_FRAME_WARN),0)
-ifeq ($(call test-lt, $(CONFIG_LLD_VERSION), 130000),y)
-KBUILD_LDFLAGS	+= -plugin-opt=-warn-stack-size=$(CONFIG_FRAME_WARN)
-endif
-endif
 endif
 
 ifdef CONFIG_LTO

From e3dee7f334341775b6641e70efd382d1359f43af Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan@kernel.org>
Date: Thu, 25 Jan 2024 15:55:09 -0700
Subject: [PATCH 606/707] x86: drop stack-alignment plugin opt

Now that the minimum supported version of LLVM for building the kernel has
been bumped to 13.0.1, the inner ifeq statement is always false, as the
build will fail during the configuration stage for older LLVM versions.

This effectively reverts part of commit b33fff07e3e3 ("x86, build: allow
LTO to be selected") and its follow up fix, commit 2398ce80152a ("x86,
lto: Pass -stack-alignment only on LLD < 13.0.0").

Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-3-f5ff9bda41c5@kernel.org
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: "Aneesh Kumar K.V (IBM)" <aneesh.kumar@kernel.org>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Conor Dooley <conor@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Nicolas Schier <nicolas@fjasle.eu>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/x86/Makefile | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 1a068de12a564f..de30a8b35c41cf 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -217,12 +217,6 @@ endif
 
 KBUILD_LDFLAGS += -m elf_$(UTS_MACHINE)
 
-ifdef CONFIG_LTO_CLANG
-ifeq ($(call test-lt, $(CONFIG_LLD_VERSION), 130000),y)
-KBUILD_LDFLAGS	+= -plugin-opt=-stack-alignment=$(if $(CONFIG_X86_32),4,8)
-endif
-endif
-
 ifdef CONFIG_X86_NEED_RELOCS
 LDFLAGS_vmlinux := --emit-relocs --discard-none
 else

From a18f79c4145fbb59c90d8c9eca540cb514d43392 Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan@kernel.org>
Date: Thu, 25 Jan 2024 15:55:10 -0700
Subject: [PATCH 607/707] ARM: remove Thumb2 __builtin_thread_pointer
 workaround for Clang

Now that the minimum supported version of LLVM for building the kernel has
been bumped to 13.0.1, the conditional expression added to get_current()
by commit c1e42efacb9b ("ARM: 9151/1: Thumb2: avoid
__builtin_thread_pointer() on Clang") is always true, as the build will
fail during the configuration stage for older LLVM versions.  Remove it,
effectively reverting the aforementioned change.

Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-4-f5ff9bda41c5@kernel.org
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: "Aneesh Kumar K.V (IBM)" <aneesh.kumar@kernel.org>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Conor Dooley <conor@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Nicolas Schier <nicolas@fjasle.eu>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm/include/asm/current.h | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/arch/arm/include/asm/current.h b/arch/arm/include/asm/current.h
index 1e1178bf176da6..5225cb1c803b16 100644
--- a/arch/arm/include/asm/current.h
+++ b/arch/arm/include/asm/current.h
@@ -18,18 +18,12 @@ static __always_inline __attribute_const__ struct task_struct *get_current(void)
 {
 	struct task_struct *cur;
 
-#if __has_builtin(__builtin_thread_pointer) && \
-    defined(CONFIG_CURRENT_POINTER_IN_TPIDRURO) && \
-    !(defined(CONFIG_THUMB2_KERNEL) && \
-      defined(CONFIG_CC_IS_CLANG) && CONFIG_CLANG_VERSION < 130001)
+#if __has_builtin(__builtin_thread_pointer) && defined(CONFIG_CURRENT_POINTER_IN_TPIDRURO)
 	/*
 	 * Use the __builtin helper when available - this results in better
 	 * code, especially when using GCC in combination with the per-task
 	 * stack protector, as the compiler will recognize that it needs to
 	 * load the TLS register only once in every function.
-	 *
-	 * Clang < 13.0.1 gets this wrong for Thumb2 builds:
-	 * https://github.com/ClangBuiltLinux/linux/issues/1485
 	 */
 	cur = __builtin_thread_pointer();
 #elif defined(CONFIG_CURRENT_POINTER_IN_TPIDRURO) || defined(CONFIG_SMP)

From 1633a42d2cf19ebe8b35d076ea72f6069d53e495 Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan@kernel.org>
Date: Thu, 25 Jan 2024 15:55:11 -0700
Subject: [PATCH 608/707] arm64: Kconfig: clean up tautological LLVM version
 checks

Now that the minimum supported version of LLVM for building the kernel has
been bumped to 13.0.1, several conditions become tautologies, as they will
always be true because the build will fail during the configuration stage
for older LLVM versions.  Drop them, as they are unnecessary.

Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-5-f5ff9bda41c5@kernel.org
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: "Aneesh Kumar K.V (IBM)" <aneesh.kumar@kernel.org>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Conor Dooley <conor@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Nicolas Schier <nicolas@fjasle.eu>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm64/Kconfig | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 5a8acca4dbf495..cb34e7d780c090 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -383,7 +383,7 @@ config BUILTIN_RETURN_ADDRESS_STRIPS_PAC
 	bool
 	# Clang's __builtin_return_adddress() strips the PAC since 12.0.0
 	# https://github.com/llvm/llvm-project/commit/2a96f47c5ffca84cd774ad402cacd137f4bf45e2
-	default y if CC_IS_CLANG && (CLANG_VERSION >= 120000)
+	default y if CC_IS_CLANG
 	# GCC's __builtin_return_address() strips the PAC since 11.1.0,
 	# and this was backported to 10.2.0, 9.4.0, 8.5.0, but not earlier
 	# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94891
@@ -1387,7 +1387,6 @@ choice
 
 config CPU_BIG_ENDIAN
 	bool "Build big-endian kernel"
-	depends on !LD_IS_LLD || LLD_VERSION >= 130000
 	# https://github.com/llvm/llvm-project/commit/1379b150991f70a5782e9a143c2ba5308da1161c
 	depends on AS_IS_GNU || AS_VERSION >= 150000
 	help
@@ -2018,8 +2017,6 @@ config ARM64_BTI_KERNEL
 	depends on !CC_IS_GCC || GCC_VERSION >= 100100
 	# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106671
 	depends on !CC_IS_GCC
-	# https://github.com/llvm/llvm-project/commit/a88c722e687e6780dcd6a58718350dc76fcc4cc9
-	depends on !CC_IS_CLANG || CLANG_VERSION >= 120000
 	depends on (!FUNCTION_GRAPH_TRACER || DYNAMIC_FTRACE_WITH_ARGS)
 	help
 	  Build the kernel with Branch Target Identification annotations

From c9be957f1135c039151f2483cfa5924f3cad53ff Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan@kernel.org>
Date: Thu, 25 Jan 2024 15:55:12 -0700
Subject: [PATCH 609/707] powerpc: Kconfig: remove tautology in CONFIG_COMPAT

This reverts commit 6fcb574125e6 ("powerpc: Kconfig: disable
CONFIG_COMPAT for clang < 12").

Now that the minimum supported version of LLVM for building the kernel has
been bumped to 13.0.1, this condition is always true, as the build will
fail during the configuration stage for older LLVM versions.  Remove it.

Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-6-f5ff9bda41c5@kernel.org
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: "Aneesh Kumar K.V (IBM)" <aneesh.kumar@kernel.org>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Conor Dooley <conor@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Nicolas Schier <nicolas@fjasle.eu>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/powerpc/Kconfig | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index b9fc064d38d281..86da0d01365a75 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -333,7 +333,6 @@ config PANIC_TIMEOUT
 config COMPAT
 	bool "Enable support for 32bit binaries"
 	depends on PPC64
-	depends on !CC_IS_CLANG || CLANG_VERSION >= 120000
 	default y if !CPU_LITTLE_ENDIAN
 	select ARCH_WANT_OLD_COMPAT_IPC
 	select COMPAT_OLD_SIGACTION

From 944e827678dd77e8dbc89d9fe7111aec830757f3 Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan@kernel.org>
Date: Thu, 25 Jan 2024 15:55:13 -0700
Subject: [PATCH 610/707] riscv: remove MCOUNT_NAME workaround

Now that the minimum supported version of LLVM for building the kernel has
been bumped to 13.0.1, the condition for using _mcount as MCOUNT_NAME is
always true, as the build will fail during the configuration stage for
older LLVM versions.  Replace MCOUNT_NAME with _mcount directly.

This effectively reverts commit 7ce047715030 ("riscv: Workaround mcount
name prior to clang-13").

Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-7-f5ff9bda41c5@kernel.org
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: "Aneesh Kumar K.V (IBM)" <aneesh.kumar@kernel.org>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Conor Dooley <conor@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Nicolas Schier <nicolas@fjasle.eu>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/riscv/include/asm/ftrace.h | 14 ++------------
 arch/riscv/kernel/mcount.S      | 10 +++++-----
 scripts/recordmcount.pl         |  2 +-
 3 files changed, 8 insertions(+), 18 deletions(-)

diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h
index 06874fb1311e5e..cf5b63e789fa7c 100644
--- a/arch/riscv/include/asm/ftrace.h
+++ b/arch/riscv/include/asm/ftrace.h
@@ -13,19 +13,9 @@
 #endif
 #define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
 
-/*
- * Clang prior to 13 had "mcount" instead of "_mcount":
- * https://github.com/llvm/llvm-project/commit/ef58ae86ba778ed7d01cd3f6bd6d08f943abab44
- */
-#if defined(CONFIG_CC_IS_GCC) || CONFIG_CLANG_VERSION >= 130000
-#define MCOUNT_NAME _mcount
-#else
-#define MCOUNT_NAME mcount
-#endif
-
 #define ARCH_SUPPORTS_FTRACE_OPS 1
 #ifndef __ASSEMBLY__
-void MCOUNT_NAME(void);
+void _mcount(void);
 static inline unsigned long ftrace_call_adjust(unsigned long addr)
 {
 	return addr;
@@ -75,7 +65,7 @@ struct dyn_arch_ftrace {
  * both auipc and jalr at the same time.
  */
 
-#define MCOUNT_ADDR		((unsigned long)MCOUNT_NAME)
+#define MCOUNT_ADDR		((unsigned long)_mcount)
 #define JALR_SIGN_MASK		(0x00000800)
 #define JALR_OFFSET_MASK	(0x00000fff)
 #define AUIPC_OFFSET_MASK	(0xfffff000)
diff --git a/arch/riscv/kernel/mcount.S b/arch/riscv/kernel/mcount.S
index d7ec69ac6910c6..3a42f6287909d0 100644
--- a/arch/riscv/kernel/mcount.S
+++ b/arch/riscv/kernel/mcount.S
@@ -50,8 +50,8 @@
 
 SYM_TYPED_FUNC_START(ftrace_stub)
 #ifdef CONFIG_DYNAMIC_FTRACE
-       .global MCOUNT_NAME
-       .set    MCOUNT_NAME, ftrace_stub
+       .global _mcount
+       .set    _mcount, ftrace_stub
 #endif
 	ret
 SYM_FUNC_END(ftrace_stub)
@@ -80,7 +80,7 @@ SYM_FUNC_END(return_to_handler)
 #endif
 
 #ifndef CONFIG_DYNAMIC_FTRACE
-SYM_FUNC_START(MCOUNT_NAME)
+SYM_FUNC_START(_mcount)
 	la	t4, ftrace_stub
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 	la	t0, ftrace_graph_return
@@ -126,6 +126,6 @@ SYM_FUNC_START(MCOUNT_NAME)
 	jalr	t5
 	RESTORE_ABI_STATE
 	ret
-SYM_FUNC_END(MCOUNT_NAME)
+SYM_FUNC_END(_mcount)
 #endif
-EXPORT_SYMBOL(MCOUNT_NAME)
+EXPORT_SYMBOL(_mcount)
diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl
index f84df9e383fd0a..0871b2e92584b2 100755
--- a/scripts/recordmcount.pl
+++ b/scripts/recordmcount.pl
@@ -352,7 +352,7 @@
     $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s_mcount\$";
 } elsif ($arch eq "riscv") {
     $function_regex = "^([0-9a-fA-F]+)\\s+<([^.0-9][0-9a-zA-Z_\\.]+)>:";
-    $mcount_regex = "^\\s*([0-9a-fA-F]+):\\sR_RISCV_CALL(_PLT)?\\s_?mcount\$";
+    $mcount_regex = "^\\s*([0-9a-fA-F]+):\\sR_RISCV_CALL(_PLT)?\\s_mcount\$";
     $type = ".quad";
     $alignment = 2;
 } elsif ($arch eq "csky") {

From e129633ac04f66c67b6c4edaa035bd0c811228bf Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan@kernel.org>
Date: Thu, 25 Jan 2024 15:55:14 -0700
Subject: [PATCH 611/707] riscv: Kconfig: remove version dependency from
 CONFIG_CLANG_SUPPORTS_DYNAMIC_FTRACE

Now that the minimum supported version of LLVM for building the kernel has
been bumped to 13.0.1, this condition is always true, as the build will
fail during the configuration stage for older LLVM versions.  Remove it.

Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-8-f5ff9bda41c5@kernel.org
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: "Aneesh Kumar K.V (IBM)" <aneesh.kumar@kernel.org>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Conor Dooley <conor@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Nicolas Schier <nicolas@fjasle.eu>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/riscv/Kconfig | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 69d24f51392206..00edc4ff589c99 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -174,8 +174,6 @@ config RISCV
 
 config CLANG_SUPPORTS_DYNAMIC_FTRACE
 	def_bool CC_IS_CLANG
-	# https://github.com/llvm/llvm-project/commit/6ab8927931851bb42b2c93a00801dc499d7d9b1e
-	depends on CLANG_VERSION >= 130000
 	# https://github.com/ClangBuiltLinux/linux/issues/1817
 	depends on AS_IS_GNU || (AS_IS_LLVM && (LD_IS_LLD || LD_VERSION >= 23600))
 

From 2bc5ecf4daa5d64c5246c4e1176d0814b7a07e1f Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan@kernel.org>
Date: Thu, 25 Jan 2024 15:55:15 -0700
Subject: [PATCH 612/707] fortify: drop Clang version check for 12.0.1 or newer

Now that the minimum supported version of LLVM for building the kernel has
been bumped to 13.0.1, this condition is always true, as the build will
fail during the configuration stage for older LLVM versions.  Remove it.

Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-9-f5ff9bda41c5@kernel.org
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: "Aneesh Kumar K.V (IBM)" <aneesh.kumar@kernel.org>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Conor Dooley <conor@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Nicolas Schier <nicolas@fjasle.eu>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 security/Kconfig | 2 --
 1 file changed, 2 deletions(-)

diff --git a/security/Kconfig b/security/Kconfig
index 606a87c29a0170..412e76f1575d0d 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -142,8 +142,6 @@ config HARDENED_USERCOPY
 config FORTIFY_SOURCE
 	bool "Harden common str/mem functions against buffer overflows"
 	depends on ARCH_HAS_FORTIFY_SOURCE
-	# https://llvm.org/pr41459
-	depends on !CC_IS_CLANG || CLANG_VERSION >= 120001
 	# https://github.com/llvm/llvm-project/issues/53645
 	depends on !CC_IS_CLANG || !X86_32
 	help

From 1717502a14afb53ac56294661a5426299c94107e Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan@kernel.org>
Date: Thu, 25 Jan 2024 15:55:16 -0700
Subject: [PATCH 613/707] lib/Kconfig.debug: update Clang version check in
 CONFIG_KCOV

Now that the minimum supported version of LLVM for building the kernel has
been bumped to 13.0.1, this condition can be changed to just
CONFIG_CC_IS_CLANG, as the build will fail during the configuration stage
for older LLVM versions.

Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-10-f5ff9bda41c5@kernel.org
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: "Aneesh Kumar K.V (IBM)" <aneesh.kumar@kernel.org>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Conor Dooley <conor@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Nicolas Schier <nicolas@fjasle.eu>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 lib/Kconfig.debug | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 8f502f15dc7fb5..1339fb893d712f 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2085,7 +2085,7 @@ config KCOV
 	depends on ARCH_HAS_KCOV
 	depends on CC_HAS_SANCOV_TRACE_PC || GCC_PLUGINS
 	depends on !ARCH_WANTS_NO_INSTR || HAVE_NOINSTR_HACK || \
-		   GCC_VERSION >= 120000 || CLANG_VERSION >= 130000
+		   GCC_VERSION >= 120000 || CC_IS_CLANG
 	select DEBUG_FS
 	select GCC_PLUGIN_SANCOV if !CC_HAS_SANCOV_TRACE_PC
 	select OBJTOOL if HAVE_NOINSTR_HACK

From 94b190dbda3d16850a002c372fc1040541ce124e Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan@kernel.org>
Date: Thu, 25 Jan 2024 15:55:17 -0700
Subject: [PATCH 614/707] compiler-clang.h: update __diag_clang() macros for
 minimum version bump

The minimum supported version of LLVM for building the kernel has been
bumped to 13.0.1.  Update the __diag_clang() macros for this bump.

Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-11-f5ff9bda41c5@kernel.org
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: "Aneesh Kumar K.V (IBM)" <aneesh.kumar@kernel.org>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Conor Dooley <conor@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Nicolas Schier <nicolas@fjasle.eu>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/compiler-clang.h | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index f0a47afef12581..49feac0162a526 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -114,11 +114,7 @@
 #define __diag_str(s)		__diag_str1(s)
 #define __diag(s)		_Pragma(__diag_str(clang diagnostic s))
 
-#if CONFIG_CLANG_VERSION >= 110000
-#define __diag_clang_11(s)	__diag(s)
-#else
-#define __diag_clang_11(s)
-#endif
+#define __diag_clang_13(s)	__diag(s)
 
 #define __diag_ignore_all(option, comment) \
-	__diag_clang(11, ignore, option)
+	__diag_clang(13, ignore, option)

From aa6a58c8fcaf82e75ab200958605139e6ff93299 Mon Sep 17 00:00:00 2001
From: Muhammad Usama Anjum <usama.anjum@collabora.com>
Date: Thu, 25 Jan 2024 20:46:04 +0500
Subject: [PATCH 615/707] selftests/mm: hugetlb_reparenting_test: do not
 unmount

Patch series "selftests/mm: Improve run_vmtests.sh", v3.

In this series, I'm trying to add 3 missing tests to vm_runtests.sh which
is used to run all the tests in mm suite.  These tests weren't running by
CIs.  While enabling them and through review feedback, I've fixed some
problems in tests as well.  I've found more flakiness in more tests which
I'll be fixing with future patches.

hugetlb-read-hwpoison test is being added where it can only run with newly
added "-d" (destructive) flag only.  Not sure why it is failing again.  So
once it become stable, we can think of moving it to default set of tests
if it doesn't have any side-effect to them.


This patch (of 5):

Do not unmount the cgroup if it wasn't mounted by the test.  The earlier
patch had fixed this for charge_reserved_hugetlb, but not for this test.
I'm adding fixes tag to that earlier patch.

Link: https://lkml.kernel.org/r/20240125154608.720072-1-usama.anjum@collabora.com
Link: https://lkml.kernel.org/r/20240125154608.720072-2-usama.anjum@collabora.com
Fixes: 209376ed2a84 ("selftests/vm: make charge_reserved_hugetlb.sh work with existing cgroup setting")
Signed-off-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/hugetlb_reparenting_test.sh | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh
index 14d26075c8635f..615c4d766c9093 100755
--- a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh
+++ b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh
@@ -248,5 +248,7 @@ cleanup
 
 echo ALL PASS
 
-umount $CGROUP_ROOT
-rm -rf $CGROUP_ROOT
+if [[ $do_umount ]]; then
+  umount $CGROUP_ROOT
+  rm -rf $CGROUP_ROOT
+fi

From 25ea45f9058026e644a2c9eaf494aef789576bdf Mon Sep 17 00:00:00 2001
From: Muhammad Usama Anjum <usama.anjum@collabora.com>
Date: Thu, 25 Jan 2024 20:46:05 +0500
Subject: [PATCH 616/707] selftests/mm: run_vmtests: remove sudo and conform to
 tap

Remove sudo as some test running environments may not have sudo available.
Instead skip the test if root privileges aren't available in the test.

Link: https://lkml.kernel.org/r/20240125154608.720072-3-usama.anjum@collabora.com
Signed-off-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/on-fault-limit.c | 36 ++++++++++-----------
 tools/testing/selftests/mm/run_vmtests.sh   |  2 +-
 2 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/tools/testing/selftests/mm/on-fault-limit.c b/tools/testing/selftests/mm/on-fault-limit.c
index b5888d613f34eb..0ea98ffab35892 100644
--- a/tools/testing/selftests/mm/on-fault-limit.c
+++ b/tools/testing/selftests/mm/on-fault-limit.c
@@ -5,40 +5,38 @@
 #include <string.h>
 #include <sys/time.h>
 #include <sys/resource.h>
+#include "../kselftest.h"
 
-static int test_limit(void)
+static void test_limit(void)
 {
-	int ret = 1;
 	struct rlimit lims;
 	void *map;
 
-	if (getrlimit(RLIMIT_MEMLOCK, &lims)) {
-		perror("getrlimit");
-		return ret;
-	}
+	if (getrlimit(RLIMIT_MEMLOCK, &lims))
+		ksft_exit_fail_msg("getrlimit: %s\n", strerror(errno));
 
-	if (mlockall(MCL_ONFAULT | MCL_FUTURE)) {
-		perror("mlockall");
-		return ret;
-	}
+	if (mlockall(MCL_ONFAULT | MCL_FUTURE))
+		ksft_exit_fail_msg("mlockall: %s\n", strerror(errno));
 
 	map = mmap(NULL, 2 * lims.rlim_max, PROT_READ | PROT_WRITE,
 		   MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
+
+	ksft_test_result(map == MAP_FAILED, "Failed mmap\n");
+
 	if (map != MAP_FAILED)
-		printf("mmap should have failed, but didn't\n");
-	else {
-		ret = 0;
 		munmap(map, 2 * lims.rlim_max);
-	}
-
 	munlockall();
-	return ret;
 }
 
 int main(int argc, char **argv)
 {
-	int ret = 0;
+	ksft_print_header();
+	ksft_set_plan(1);
+
+	if (getuid())
+		ksft_test_result_skip("Require root privileges to run\n");
+	else
+		test_limit();
 
-	ret += test_limit();
-	return ret;
+	ksft_finished();
 }
diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
index 246d53a5d7f287..e373d592dbf5cb 100755
--- a/tools/testing/selftests/mm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -291,7 +291,7 @@ echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages
 
 CATEGORY="compaction" run_test ./compaction_test
 
-CATEGORY="mlock" run_test sudo -u nobody ./on-fault-limit
+CATEGORY="mlock" run_test ./on-fault-limit
 
 CATEGORY="mmap" run_test ./map_populate
 

From 4fc1e7ffb61992bd1f2460214c766beb8c4a47c8 Mon Sep 17 00:00:00 2001
From: Muhammad Usama Anjum <usama.anjum@collabora.com>
Date: Thu, 25 Jan 2024 20:46:06 +0500
Subject: [PATCH 617/707] selftests/mm: save and restore nr_hugepages value

Save and restore nr_hugepages before changing it during the test.  A test
should not change system wide settings.

Link: https://lkml.kernel.org/r/20240125154608.720072-4-usama.anjum@collabora.com
Signed-off-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/charge_reserved_hugetlb.sh  | 4 ++++
 tools/testing/selftests/mm/hugetlb_reparenting_test.sh | 3 +++
 2 files changed, 7 insertions(+)

diff --git a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh
index 0899019a7fcb4b..16b3c2e428dbdb 100755
--- a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh
+++ b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh
@@ -11,6 +11,8 @@ if [[ $(id -u) -ne 0 ]]; then
   exit $ksft_skip
 fi
 
+nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages)
+
 fault_limit_file=limit_in_bytes
 reservation_limit_file=rsvd.limit_in_bytes
 fault_usage_file=usage_in_bytes
@@ -582,3 +584,5 @@ if [[ $do_umount ]]; then
   umount $cgroup_path
   rmdir $cgroup_path
 fi
+
+echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages
diff --git a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh
index 615c4d766c9093..11f9bbe7dc222b 100755
--- a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh
+++ b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh
@@ -11,6 +11,7 @@ if [[ $(id -u) -ne 0 ]]; then
   exit $ksft_skip
 fi
 
+nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages)
 usage_file=usage_in_bytes
 
 if [[ "$1" == "-cgroup-v2" ]]; then
@@ -252,3 +253,5 @@ if [[ $do_umount ]]; then
   umount $CGROUP_ROOT
   rm -rf $CGROUP_ROOT
 fi
+
+echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages

From 1fdeab609de0c57b9ef56078ddf647d36089147d Mon Sep 17 00:00:00 2001
From: Muhammad Usama Anjum <usama.anjum@collabora.com>
Date: Thu, 25 Jan 2024 20:46:07 +0500
Subject: [PATCH 618/707] selftests/mm: protection_keys: save/restore
 nr_hugepages settings

Save and restore nr_hugepages before changing it during the test.  A test
should not change system wide settings.

Link: https://lkml.kernel.org/r/20240125154608.720072-5-usama.anjum@collabora.com
Fixes: 5f23f6d082a9 ("x86/pkeys: Add self-tests")
Signed-off-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/protection_keys.c | 34 ++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/tools/testing/selftests/mm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c
index 48dc151f8fca8a..f822ae31af22e2 100644
--- a/tools/testing/selftests/mm/protection_keys.c
+++ b/tools/testing/selftests/mm/protection_keys.c
@@ -54,6 +54,7 @@ int test_nr;
 u64 shadow_pkey_reg;
 int dprint_in_signal;
 char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
+char buf[256];
 
 void cat_into_file(char *str, char *file)
 {
@@ -1744,6 +1745,38 @@ void pkey_setup_shadow(void)
 	shadow_pkey_reg = __read_pkey_reg();
 }
 
+void restore_settings_atexit(void)
+{
+	cat_into_file(buf, "/proc/sys/vm/nr_hugepages");
+}
+
+void save_settings(void)
+{
+	int fd;
+	int err;
+
+	if (geteuid())
+		return;
+
+	fd = open("/proc/sys/vm/nr_hugepages", O_RDONLY);
+	if (fd < 0) {
+		fprintf(stderr, "error opening\n");
+		perror("error: ");
+		exit(__LINE__);
+	}
+
+	/* -1 to guarantee leaving the trailing \0 */
+	err = read(fd, buf, sizeof(buf)-1);
+	if (err < 0) {
+		fprintf(stderr, "error reading\n");
+		perror("error: ");
+		exit(__LINE__);
+	}
+
+	atexit(restore_settings_atexit);
+	close(fd);
+}
+
 int main(void)
 {
 	int nr_iterations = 22;
@@ -1751,6 +1784,7 @@ int main(void)
 
 	srand((unsigned int)time(NULL));
 
+	save_settings();
 	setup_handlers();
 
 	printf("has pkeys: %d\n", pkeys_supported);

From f7ff0b277ff9fefe8f41b374baca8888c7dc54cd Mon Sep 17 00:00:00 2001
From: Muhammad Usama Anjum <usama.anjum@collabora.com>
Date: Thu, 25 Jan 2024 20:46:08 +0500
Subject: [PATCH 619/707] selftests/mm: run_vmtests.sh: add missing tests

Add missing tests to run_vmtests.sh.  The mm kselftests are run through
run_vmtests.sh.  If a test isn't present in this script, it'll not run
with run_tests or `make -C tools/testing/selftests/mm run_tests`.

Link: https://lkml.kernel.org/r/20240125154608.720072-6-usama.anjum@collabora.com
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/Makefile       | 5 +++++
 tools/testing/selftests/mm/run_vmtests.sh | 8 ++++++++
 2 files changed, 13 insertions(+)

diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index 2453add65d12f8..f3aec7be80730b 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -114,6 +114,11 @@ TEST_PROGS := run_vmtests.sh
 TEST_FILES := test_vmalloc.sh
 TEST_FILES += test_hmm.sh
 TEST_FILES += va_high_addr_switch.sh
+TEST_FILES += charge_reserved_hugetlb.sh
+TEST_FILES += hugetlb_reparenting_test.sh
+
+# required by charge_reserved_hugetlb.sh
+TEST_FILES += write_hugetlb_memory.sh
 
 include ../lib.mk
 
diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
index e373d592dbf5cb..a0f37e44389372 100755
--- a/tools/testing/selftests/mm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -19,6 +19,7 @@ usage: ${BASH_SOURCE[0]:-$0} [ options ]
   -t: specify specific categories to tests to run
   -h: display this message
   -n: disable TAP output
+  -d: run destructive tests
 
 The default behavior is to run required tests only.  If -a is specified,
 will run all tests.
@@ -79,6 +80,7 @@ EOF
 }
 
 RUN_ALL=false
+RUN_DESTRUCTIVE_TEST=false
 TAP_PREFIX="# "
 
 while getopts "aht:n" OPT; do
@@ -87,6 +89,7 @@ while getopts "aht:n" OPT; do
 		"h") usage ;;
 		"t") VM_SELFTEST_ITEMS=${OPTARG} ;;
 		"n") TAP_PREFIX= ;;
+		"a") RUN_DESTRUCTIVE_TEST=true ;;
 	esac
 done
 shift $((OPTIND -1))
@@ -304,6 +307,11 @@ CATEGORY="process_mrelease" run_test ./mrelease_test
 CATEGORY="mremap" run_test ./mremap_test
 
 CATEGORY="hugetlb" run_test ./thuge-gen
+CATEGORY="hugetlb" run_test ./charge_reserved_hugetlb.sh -cgroup-v2
+CATEGORY="hugetlb" run_test ./hugetlb_reparenting_test.sh -cgroup-v2
+if $RUN_DESTRUCTIVE_TEST; then
+CATEGORY="hugetlb" run_test ./hugetlb-read-hwpoison
+fi
 
 if [ $VADDR64 -ne 0 ]; then
 

From 6d5cbfaf058cecf52dea4839cb0681207b908c03 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Wed, 24 Jan 2024 15:27:35 +0100
Subject: [PATCH 620/707] init: remove obsolete arch_call_rest_init() wrapper

Since commit 3570ee046c46b5dc ("s390/smp: keep the original lowcore for
CPU 0"), there is no longer any architecture that needs to override
arch_call_rest_init().

Remove the weak wrapper around rest_init(), call rest_init() directly, and
make rest_init() static.

Link: https://lkml.kernel.org/r/aa10868bfb176eef4abb8bb4a710b85330792694.1706106183.git.geert@linux-m68k.org
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Ilya Leoshkevich <iii@linux.ibm.com>
Cc: Josh Poimboeuf <jpoimboe@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/start_kernel.h | 2 --
 init/main.c                  | 9 ++-------
 tools/objtool/noreturns.h    | 1 -
 3 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/include/linux/start_kernel.h b/include/linux/start_kernel.h
index a9806a44a605c7..09f994ac87df44 100644
--- a/include/linux/start_kernel.h
+++ b/include/linux/start_kernel.h
@@ -9,7 +9,5 @@
    up something else. */
 
 extern asmlinkage void __init __noreturn start_kernel(void);
-extern void __init __noreturn arch_call_rest_init(void);
-extern void __ref __noreturn rest_init(void);
 
 #endif /* _LINUX_START_KERNEL_H */
diff --git a/init/main.c b/init/main.c
index e24b0780fdff7a..521f40770e67dd 100644
--- a/init/main.c
+++ b/init/main.c
@@ -681,7 +681,7 @@ static void __init setup_command_line(char *command_line)
 
 static __initdata DECLARE_COMPLETION(kthreadd_done);
 
-noinline void __ref __noreturn rest_init(void)
+static noinline void __ref __noreturn rest_init(void)
 {
 	struct task_struct *tsk;
 	int pid;
@@ -822,11 +822,6 @@ static int __init early_randomize_kstack_offset(char *buf)
 early_param("randomize_kstack_offset", early_randomize_kstack_offset);
 #endif
 
-void __init __weak __noreturn arch_call_rest_init(void)
-{
-	rest_init();
-}
-
 static void __init print_unknown_bootoptions(void)
 {
 	char *unknown_options;
@@ -1069,7 +1064,7 @@ void start_kernel(void)
 	kcsan_init();
 
 	/* Do the rest non-__init'ed, we're now alive */
-	arch_call_rest_init();
+	rest_init();
 
 	/*
 	 * Avoid stack canaries in callers of boot_init_stack_canary for gcc-10
diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h
index 1685d7ea6a9f70..7cda577da897ca 100644
--- a/tools/objtool/noreturns.h
+++ b/tools/objtool/noreturns.h
@@ -12,7 +12,6 @@ NORETURN(__reiserfs_panic)
 NORETURN(__stack_chk_fail)
 NORETURN(__tdx_hypercall_failed)
 NORETURN(__ubsan_handle_builtin_unreachable)
-NORETURN(arch_call_rest_init)
 NORETURN(arch_cpu_idle_dead)
 NORETURN(bch2_trans_in_restart_error)
 NORETURN(bch2_trans_restart_error)

From 4d3e70d4f83aa98d4e50fd163d44872e25e31479 Mon Sep 17 00:00:00 2001
From: Muhammad Usama Anjum <usama.anjum@collabora.com>
Date: Fri, 26 Jan 2024 16:21:20 +0500
Subject: [PATCH 621/707] selftests/mm: hugepage-shm: conform test to TAP
 format output

Conform the layout, informational and status messages to TAP.  No
functional change is intended other than the layout of output messages.

The "." was being printed inside for loop to indicate the writes progress.
This was extraneous and hence removed in the patch.

Link: https://lkml.kernel.org/r/20240126112129.1480265-1-usama.anjum@collabora.com
Signed-off-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Jiaqi Yan <jiaqiyan@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/hugepage-shm.c | 47 +++++++++++------------
 1 file changed, 22 insertions(+), 25 deletions(-)

diff --git a/tools/testing/selftests/mm/hugepage-shm.c b/tools/testing/selftests/mm/hugepage-shm.c
index 478bb1e989e9f3..f949dbbc345409 100644
--- a/tools/testing/selftests/mm/hugepage-shm.c
+++ b/tools/testing/selftests/mm/hugepage-shm.c
@@ -34,11 +34,10 @@
 #include <sys/ipc.h>
 #include <sys/shm.h>
 #include <sys/mman.h>
+#include "../kselftest.h"
 
 #define LENGTH (256UL*1024*1024)
 
-#define dprintf(x)  printf(x)
-
 /* Only ia64 requires this */
 #ifdef __ia64__
 #define ADDR (void *)(0x8000000000000000UL)
@@ -54,44 +53,42 @@ int main(void)
 	unsigned long i;
 	char *shmaddr;
 
+	ksft_print_header();
+	ksft_set_plan(1);
+
 	shmid = shmget(2, LENGTH, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W);
-	if (shmid < 0) {
-		perror("shmget");
-		exit(1);
-	}
-	printf("shmid: 0x%x\n", shmid);
+	if (shmid < 0)
+		ksft_exit_fail_msg("shmget: %s\n", strerror(errno));
+
+	ksft_print_msg("shmid: 0x%x\n", shmid);
 
 	shmaddr = shmat(shmid, ADDR, SHMAT_FLAGS);
 	if (shmaddr == (char *)-1) {
-		perror("Shared memory attach failure");
 		shmctl(shmid, IPC_RMID, NULL);
-		exit(2);
+		ksft_exit_fail_msg("Shared memory attach failure: %s\n", strerror(errno));
 	}
-	printf("shmaddr: %p\n", shmaddr);
 
-	dprintf("Starting the writes:\n");
-	for (i = 0; i < LENGTH; i++) {
+	ksft_print_msg("shmaddr: %p\n", shmaddr);
+
+	ksft_print_msg("Starting the writes:");
+	for (i = 0; i < LENGTH; i++)
 		shmaddr[i] = (char)(i);
-		if (!(i % (1024 * 1024)))
-			dprintf(".");
-	}
-	dprintf("\n");
+	ksft_print_msg("Done.\n");
 
-	dprintf("Starting the Check...");
+	ksft_print_msg("Starting the Check...");
 	for (i = 0; i < LENGTH; i++)
-		if (shmaddr[i] != (char)i) {
-			printf("\nIndex %lu mismatched\n", i);
-			exit(3);
-		}
-	dprintf("Done.\n");
+		if (shmaddr[i] != (char)i)
+			ksft_exit_fail_msg("\nIndex %lu mismatched\n", i);
+	ksft_print_msg("Done.\n");
 
 	if (shmdt((const void *)shmaddr) != 0) {
-		perror("Detach failure");
 		shmctl(shmid, IPC_RMID, NULL);
-		exit(4);
+		ksft_exit_fail_msg("Detach failure: %s\n", strerror(errno));
 	}
 
 	shmctl(shmid, IPC_RMID, NULL);
 
-	return 0;
+	ksft_test_result_pass("Completed test\n");
+
+	ksft_finished();
 }

From 216bd78b2e3198c7a92e0ba0b00df658cc3b9220 Mon Sep 17 00:00:00 2001
From: Muhammad Usama Anjum <usama.anjum@collabora.com>
Date: Fri, 26 Jan 2024 16:21:21 +0500
Subject: [PATCH 622/707] selftests/mm: hugepage-vmemmap: conform test to TAP
 format output

Conform the layout, informational and status messages to TAP.  No
functional change is intended other than the layout of output messages.

Link: https://lkml.kernel.org/r/20240126112129.1480265-2-usama.anjum@collabora.com
Signed-off-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
Cc: Jiaqi Yan <jiaqiyan@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/hugepage-vmemmap.c | 39 ++++++++-----------
 1 file changed, 16 insertions(+), 23 deletions(-)

diff --git a/tools/testing/selftests/mm/hugepage-vmemmap.c b/tools/testing/selftests/mm/hugepage-vmemmap.c
index 894d28c3dd4785..9080e34bb297e5 100644
--- a/tools/testing/selftests/mm/hugepage-vmemmap.c
+++ b/tools/testing/selftests/mm/hugepage-vmemmap.c
@@ -11,6 +11,7 @@
 #include <sys/mman.h>
 #include <fcntl.h>
 #include "vm_util.h"
+#include "../kselftest.h"
 
 #define PAGE_COMPOUND_HEAD	(1UL << 15)
 #define PAGE_COMPOUND_TAIL	(1UL << 16)
@@ -77,7 +78,7 @@ static int check_page_flags(unsigned long pfn)
 	read(fd, &pageflags, sizeof(pageflags));
 	if ((pageflags & HEAD_PAGE_FLAGS) != HEAD_PAGE_FLAGS) {
 		close(fd);
-		printf("Head page flags (%lx) is invalid\n", pageflags);
+		ksft_print_msg("Head page flags (%lx) is invalid\n", pageflags);
 		return -1;
 	}
 
@@ -91,7 +92,7 @@ static int check_page_flags(unsigned long pfn)
 		if ((pageflags & TAIL_PAGE_FLAGS) != TAIL_PAGE_FLAGS ||
 		    (pageflags & HEAD_PAGE_FLAGS) == HEAD_PAGE_FLAGS) {
 			close(fd);
-			printf("Tail page flags (%lx) is invalid\n", pageflags);
+			ksft_print_msg("Tail page flags (%lx) is invalid\n", pageflags);
 			return -1;
 		}
 	}
@@ -106,18 +107,17 @@ int main(int argc, char **argv)
 	void *addr;
 	unsigned long pfn;
 
+	ksft_print_header();
+	ksft_set_plan(1);
+
 	pagesize  = psize();
 	maplength = default_huge_page_size();
-	if (!maplength) {
-		printf("Unable to determine huge page size\n");
-		exit(1);
-	}
+	if (!maplength)
+		ksft_exit_fail_msg("Unable to determine huge page size\n");
 
 	addr = mmap(MAP_ADDR, maplength, PROT_READ | PROT_WRITE, MAP_FLAGS, -1, 0);
-	if (addr == MAP_FAILED) {
-		perror("mmap");
-		exit(1);
-	}
+	if (addr == MAP_FAILED)
+		ksft_exit_fail_msg("mmap: %s\n", strerror(errno));
 
 	/* Trigger allocation of HugeTLB page. */
 	write_bytes(addr, maplength);
@@ -125,23 +125,16 @@ int main(int argc, char **argv)
 	pfn = virt_to_pfn(addr);
 	if (pfn == -1UL) {
 		munmap(addr, maplength);
-		perror("virt_to_pfn");
-		exit(1);
+		ksft_exit_fail_msg("virt_to_pfn: %s\n", strerror(errno));
 	}
 
-	printf("Returned address is %p whose pfn is %lx\n", addr, pfn);
+	ksft_print_msg("Returned address is %p whose pfn is %lx\n", addr, pfn);
 
-	if (check_page_flags(pfn) < 0) {
-		munmap(addr, maplength);
-		perror("check_page_flags");
-		exit(1);
-	}
+	ksft_test_result(!check_page_flags(pfn), "check_page_flags\n");
 
 	/* munmap() length of MAP_HUGETLB memory must be hugepage aligned */
-	if (munmap(addr, maplength)) {
-		perror("munmap");
-		exit(1);
-	}
+	if (munmap(addr, maplength))
+		ksft_exit_fail_msg("munmap: %s\n", strerror(errno));
 
-	return 0;
+	ksft_finished();
 }

From 7daa749b593b06623cd963569b0cbe91e6a346d3 Mon Sep 17 00:00:00 2001
From: Muhammad Usama Anjum <usama.anjum@collabora.com>
Date: Fri, 26 Jan 2024 16:21:22 +0500
Subject: [PATCH 623/707] selftests/mm: hugetlb-madvise: conform test to TAP
 format output

Conform the layout, informational and status messages to TAP.  No
functional change is intended other than the layout of output messages.

Link: https://lkml.kernel.org/r/20240126112129.1480265-3-usama.anjum@collabora.com
Signed-off-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
Cc: Jiaqi Yan <jiaqiyan@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/hugetlb-madvise.c | 207 +++++++------------
 1 file changed, 80 insertions(+), 127 deletions(-)

diff --git a/tools/testing/selftests/mm/hugetlb-madvise.c b/tools/testing/selftests/mm/hugetlb-madvise.c
index f32d99565c5eaa..2d9b27bc0d01d1 100644
--- a/tools/testing/selftests/mm/hugetlb-madvise.c
+++ b/tools/testing/selftests/mm/hugetlb-madvise.c
@@ -19,19 +19,14 @@
 #include <sys/mman.h>
 #include <fcntl.h>
 #include "vm_util.h"
+#include "../kselftest.h"
 
 #define MIN_FREE_PAGES	20
 #define NR_HUGE_PAGES	10	/* common number of pages to map/allocate */
 
 #define validate_free_pages(exp_free)					\
-	do {								\
-		int fhp = get_free_hugepages();				\
-		if (fhp != (exp_free)) {				\
-			printf("Unexpected number of free huge "	\
-				"pages line %d\n", __LINE__);		\
-			exit(1);					\
-		}							\
-	} while (0)
+	ksft_test_result(get_free_hugepages() == (exp_free),		\
+			 "Validation of free pages (%d)\n", __LINE__)
 
 unsigned long huge_page_size;
 unsigned long base_page_size;
@@ -64,28 +59,27 @@ int main(int argc, char **argv)
 	int fd;
 	int ret;
 
+	ksft_print_header();
+
 	huge_page_size = default_huge_page_size();
-	if (!huge_page_size) {
-		printf("Unable to determine huge page size, exiting!\n");
-		exit(1);
-	}
+	if (!huge_page_size)
+		ksft_exit_fail_msg("Unable to determine huge page size, exiting!\n");
+
 	base_page_size = sysconf(_SC_PAGE_SIZE);
-	if (!huge_page_size) {
-		printf("Unable to determine base page size, exiting!\n");
-		exit(1);
-	}
+	if (!huge_page_size)
+		ksft_exit_fail_msg("Unable to determine base page size, exiting!\n");
 
 	free_hugepages = get_free_hugepages();
 	if (free_hugepages < MIN_FREE_PAGES) {
-		printf("Not enough free huge pages to test, exiting!\n");
-		exit(1);
+		ksft_print_msg("Not enough free huge pages to test, exiting!\n");
+		ksft_finished();
 	}
 
 	fd = memfd_create(argv[0], MFD_HUGETLB);
-	if (fd < 0) {
-		perror("memfd_create() failed");
-		exit(1);
-	}
+	if (fd < 0)
+		ksft_exit_fail_msg("memfd_create() failed\n");
+
+	ksft_set_plan(37);
 
 	/*
 	 * Test validity of MADV_DONTNEED addr and length arguments.  mmap
@@ -97,16 +91,13 @@ int main(int argc, char **argv)
 			PROT_READ | PROT_WRITE,
 			MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
 			-1, 0);
-	if (addr == MAP_FAILED) {
-		perror("mmap");
-		exit(1);
-	}
+	if (addr == MAP_FAILED)
+		ksft_exit_fail_msg("mmap: %s\n", strerror(errno));
+
 	if (munmap(addr, huge_page_size) ||
-			munmap(addr + (NR_HUGE_PAGES + 1) * huge_page_size,
-				huge_page_size)) {
-		perror("munmap");
-		exit(1);
-	}
+	    munmap(addr + (NR_HUGE_PAGES + 1) * huge_page_size, huge_page_size))
+		ksft_exit_fail_msg("munmap: %s\n", strerror(errno));
+
 	addr = addr + huge_page_size;
 
 	write_fault_pages(addr, NR_HUGE_PAGES);
@@ -114,21 +105,13 @@ int main(int argc, char **argv)
 
 	/* addr before mapping should fail */
 	ret = madvise(addr - base_page_size, NR_HUGE_PAGES * huge_page_size,
-		MADV_DONTNEED);
-	if (!ret) {
-		printf("Unexpected success of madvise call with invalid addr line %d\n",
-				__LINE__);
-			exit(1);
-	}
+		      MADV_DONTNEED);
+	ksft_test_result(ret, "The madvise call with invalid address\n");
 
 	/* addr + length after mapping should fail */
 	ret = madvise(addr, (NR_HUGE_PAGES * huge_page_size) + base_page_size,
-		MADV_DONTNEED);
-	if (!ret) {
-		printf("Unexpected success of madvise call with invalid length line %d\n",
-				__LINE__);
-			exit(1);
-	}
+		      MADV_DONTNEED);
+	ksft_test_result(ret, "The madvise call with invalid address\n");
 
 	(void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
 
@@ -139,10 +122,9 @@ int main(int argc, char **argv)
 			PROT_READ | PROT_WRITE,
 			MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
 			-1, 0);
-	if (addr == MAP_FAILED) {
-		perror("mmap");
-		exit(1);
-	}
+	if (addr == MAP_FAILED)
+		ksft_exit_fail_msg("mmap: %s\n", strerror(errno));
+
 	write_fault_pages(addr, NR_HUGE_PAGES);
 	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
 
@@ -150,19 +132,12 @@ int main(int argc, char **argv)
 	ret = madvise(addr + base_page_size,
 			NR_HUGE_PAGES * huge_page_size - base_page_size,
 			MADV_DONTNEED);
-	if (!ret) {
-		printf("Unexpected success of madvise call with unaligned start address %d\n",
-				__LINE__);
-			exit(1);
-	}
+	ksft_test_result(ret, "The madvise call with unaligned start address\n");
 
 	/* addr + length should be aligned down to huge page size */
-	if (madvise(addr,
-			((NR_HUGE_PAGES - 1) * huge_page_size) + base_page_size,
-			MADV_DONTNEED)) {
-		perror("madvise");
-		exit(1);
-	}
+	ret = madvise(addr, ((NR_HUGE_PAGES - 1) * huge_page_size) + base_page_size,
+		      MADV_DONTNEED);
+	ksft_test_result(!ret, "The madvise call with aligned start address\n");
 
 	/* should free all but last page in mapping */
 	validate_free_pages(free_hugepages - 1);
@@ -177,17 +152,14 @@ int main(int argc, char **argv)
 			PROT_READ | PROT_WRITE,
 			MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
 			-1, 0);
-	if (addr == MAP_FAILED) {
-		perror("mmap");
-		exit(1);
-	}
+	if (addr == MAP_FAILED)
+		ksft_exit_fail_msg("mmap: %s\n", strerror(errno));
+
 	write_fault_pages(addr, NR_HUGE_PAGES);
 	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
 
-	if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
-		perror("madvise");
-		exit(1);
-	}
+	ret = madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED);
+	ksft_test_result(!ret, "The madvise MADV_DONTNEED on anonymous private mapping\n");
 
 	/* should free all pages in mapping */
 	validate_free_pages(free_hugepages);
@@ -197,29 +169,25 @@ int main(int argc, char **argv)
 	/*
 	 * Test MADV_DONTNEED on private mapping of hugetlb file
 	 */
-	if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) {
-		perror("fallocate");
-		exit(1);
-	}
+	if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size))
+		ksft_exit_fail_msg("fallocate: %s\n", strerror(errno));
+
 	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
 
 	addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
 			PROT_READ | PROT_WRITE,
 			MAP_PRIVATE, fd, 0);
-	if (addr == MAP_FAILED) {
-		perror("mmap");
-		exit(1);
-	}
+	if (addr == MAP_FAILED)
+		ksft_exit_fail_msg("mmap: %s\n", strerror(errno));
 
 	/* read should not consume any pages */
 	read_fault_pages(addr, NR_HUGE_PAGES);
 	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
 
 	/* madvise should not free any pages */
-	if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
-		perror("madvise");
-		exit(1);
-	}
+	ret = madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED);
+	ksft_test_result(!ret, "The madvise MADV_DONTNEED on private mapping of file\n");
+
 	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
 
 	/* writes should allocate private pages */
@@ -227,10 +195,9 @@ int main(int argc, char **argv)
 	validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES));
 
 	/* madvise should free private pages */
-	if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
-		perror("madvise");
-		exit(1);
-	}
+	ret = madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED);
+	ksft_test_result(!ret, "The madvise MADV_DONTNEED on private mapping of file\n");
+
 	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
 
 	/* writes should allocate private pages */
@@ -245,10 +212,9 @@ int main(int argc, char **argv)
 	 * implementation.
 	 */
 	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
-					0, NR_HUGE_PAGES * huge_page_size)) {
-		perror("fallocate");
-		exit(1);
-	}
+					0, NR_HUGE_PAGES * huge_page_size))
+		ksft_exit_fail_msg("fallocate: %s\n", strerror(errno));
+
 	validate_free_pages(free_hugepages);
 
 	(void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
@@ -256,29 +222,25 @@ int main(int argc, char **argv)
 	/*
 	 * Test MADV_DONTNEED on shared mapping of hugetlb file
 	 */
-	if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) {
-		perror("fallocate");
-		exit(1);
-	}
+	if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size))
+		ksft_exit_fail_msg("fallocate: %s\n", strerror(errno));
+
 	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
 
 	addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
 			PROT_READ | PROT_WRITE,
 			MAP_SHARED, fd, 0);
-	if (addr == MAP_FAILED) {
-		perror("mmap");
-		exit(1);
-	}
+	if (addr == MAP_FAILED)
+		ksft_exit_fail_msg("mmap: %s\n", strerror(errno));
 
 	/* write should not consume any pages */
 	write_fault_pages(addr, NR_HUGE_PAGES);
 	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
 
 	/* madvise should not free any pages */
-	if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
-		perror("madvise");
-		exit(1);
-	}
+	ret = madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED);
+	ksft_test_result(!ret, "The madvise MADV_DONTNEED on shared mapping of file\n");
+
 	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
 
 	/*
@@ -286,29 +248,25 @@ int main(int argc, char **argv)
 	 *
 	 * madvise is same as hole punch and should free all pages.
 	 */
-	if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) {
-		perror("madvise");
-		exit(1);
-	}
+	ret = madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE);
+	ksft_test_result(!ret, "The madvise MADV_REMOVE on shared mapping of file\n");
+
 	validate_free_pages(free_hugepages);
 	(void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
 
 	/*
 	 * Test MADV_REMOVE on shared and private mapping of hugetlb file
 	 */
-	if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) {
-		perror("fallocate");
-		exit(1);
-	}
+	if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size))
+		ksft_exit_fail_msg("fallocate: %s\n", strerror(errno));
+
 	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
 
 	addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
 			PROT_READ | PROT_WRITE,
 			MAP_SHARED, fd, 0);
-	if (addr == MAP_FAILED) {
-		perror("mmap");
-		exit(1);
-	}
+	if (addr == MAP_FAILED)
+		ksft_exit_fail_msg("mmap: %s\n", strerror(errno));
 
 	/* shared write should not consume any additional pages */
 	write_fault_pages(addr, NR_HUGE_PAGES);
@@ -317,10 +275,8 @@ int main(int argc, char **argv)
 	addr2 = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
 			PROT_READ | PROT_WRITE,
 			MAP_PRIVATE, fd, 0);
-	if (addr2 == MAP_FAILED) {
-		perror("mmap");
-		exit(1);
-	}
+	if (addr2 == MAP_FAILED)
+		ksft_exit_fail_msg("mmap: %s\n", strerror(errno));
 
 	/* private read should not consume any pages */
 	read_fault_pages(addr2, NR_HUGE_PAGES);
@@ -331,17 +287,15 @@ int main(int argc, char **argv)
 	validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES));
 
 	/* madvise of shared mapping should not free any pages */
-	if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
-		perror("madvise");
-		exit(1);
-	}
+	ret = madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED);
+	ksft_test_result(!ret, "The madvise MADV_REMOVE on shared mapping of file\n");
+
 	validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES));
 
 	/* madvise of private mapping should free private pages */
-	if (madvise(addr2, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
-		perror("madvise");
-		exit(1);
-	}
+	ret = madvise(addr2, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED);
+	ksft_test_result(!ret, "The madvise MADV_REMOVE on shared mapping of file\n");
+
 	validate_free_pages(free_hugepages - NR_HUGE_PAGES);
 
 	/* private write should consume additional pages again */
@@ -353,15 +307,14 @@ int main(int argc, char **argv)
 	 * not correct.  private pages should not be freed, but this is
 	 * expected.  See comment associated with FALLOC_FL_PUNCH_HOLE call.
 	 */
-	if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) {
-		perror("madvise");
-		exit(1);
-	}
+	ret = madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE);
+	ksft_test_result(!ret, "The madvise MADV_REMOVE on shared mapping of file\n");
+
 	validate_free_pages(free_hugepages);
 
 	(void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
 	(void)munmap(addr2, NR_HUGE_PAGES * huge_page_size);
 
 	close(fd);
-	return 0;
+	ksft_finished();
 }

From eb8bb77ecc5a0e104527b7e41791f2d94f044f93 Mon Sep 17 00:00:00 2001
From: Muhammad Usama Anjum <usama.anjum@collabora.com>
Date: Fri, 26 Jan 2024 16:21:23 +0500
Subject: [PATCH 624/707] selftests/mm: khugepaged: conform test to TAP format
 output

Conform the layout, informational and status messages to TAP.  No
functional change is intended other than the layout of output messages.

Skip test if root privileges aren't provided.

Link: https://lkml.kernel.org/r/20240126112129.1480265-4-usama.anjum@collabora.com
Signed-off-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
Cc: Jiaqi Yan <jiaqiyan@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/khugepaged.c | 385 ++++++++++--------------
 1 file changed, 163 insertions(+), 222 deletions(-)

diff --git a/tools/testing/selftests/mm/khugepaged.c b/tools/testing/selftests/mm/khugepaged.c
index 829320a519e723..d51fdaee7dc6ae 100644
--- a/tools/testing/selftests/mm/khugepaged.c
+++ b/tools/testing/selftests/mm/khugepaged.c
@@ -23,6 +23,7 @@
 
 #include "vm_util.h"
 #include "thp_settings.h"
+#include "../kselftest.h"
 
 #define BASE_ADDR ((void *)(1UL << 30))
 static unsigned long hpage_pmd_size;
@@ -73,22 +74,20 @@ struct file_info {
 
 static struct file_info finfo;
 static bool skip_settings_restore;
-static int exit_status;
 
 static void success(const char *msg)
 {
-	printf(" \e[32m%s\e[0m\n", msg);
+	ksft_test_result_pass("%s\n", msg);
 }
 
 static void fail(const char *msg)
 {
-	printf(" \e[31m%s\e[0m\n", msg);
-	exit_status++;
+	ksft_test_result_fail("%s\n", msg);
 }
 
 static void skip(const char *msg)
 {
-	printf(" \e[33m%s\e[0m\n", msg);
+	ksft_test_result_skip("\e%s\n", msg);
 }
 
 static void restore_settings_atexit(void)
@@ -96,9 +95,8 @@ static void restore_settings_atexit(void)
 	if (skip_settings_restore)
 		return;
 
-	printf("Restore THP and khugepaged settings...");
 	thp_restore_settings();
-	success("OK");
+	ksft_print_msg("Restored THP and khugepaged settings...\n");
 
 	skip_settings_restore = true;
 }
@@ -106,12 +104,12 @@ static void restore_settings_atexit(void)
 static void restore_settings(int sig)
 {
 	/* exit() will invoke the restore_settings_atexit handler. */
-	exit(sig ? EXIT_FAILURE : exit_status);
+	ksft_finished();
 }
 
 static void save_settings(void)
 {
-	printf("Save THP and khugepaged settings...");
+	ksft_print_msg("Save THP and khugepaged settings...\n");
 	if (file_ops && finfo.type == VMA_FILE)
 		thp_set_read_ahead_path(finfo.dev_queue_read_ahead_path);
 	thp_save_settings();
@@ -135,60 +133,50 @@ static void get_finfo(const char *dir)
 
 	finfo.dir = dir;
 	stat(finfo.dir, &path_stat);
-	if (!S_ISDIR(path_stat.st_mode)) {
-		printf("%s: Not a directory (%s)\n", __func__, finfo.dir);
-		exit(EXIT_FAILURE);
-	}
+	if (!S_ISDIR(path_stat.st_mode))
+		ksft_exit_fail_msg("%s: Not a directory (%s)\n", __func__, finfo.dir);
+
 	if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE,
-		     finfo.dir) >= sizeof(finfo.path)) {
-		printf("%s: Pathname is too long\n", __func__);
-		exit(EXIT_FAILURE);
-	}
-	if (statfs(finfo.dir, &fs)) {
-		perror("statfs()");
-		exit(EXIT_FAILURE);
-	}
+		     finfo.dir) >= sizeof(finfo.path))
+		ksft_exit_fail_msg("%s: Pathname is too long\n", __func__);
+
+	if (statfs(finfo.dir, &fs))
+		ksft_exit_fail_msg("statfs(): %s\n", strerror(errno));
+
 	finfo.type = fs.f_type == TMPFS_MAGIC ? VMA_SHMEM : VMA_FILE;
 	if (finfo.type == VMA_SHMEM)
 		return;
 
 	/* Find owning device's queue/read_ahead_kb control */
 	if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent",
-		     major(path_stat.st_dev), minor(path_stat.st_dev))
-	    >= sizeof(path)) {
-		printf("%s: Pathname is too long\n", __func__);
-		exit(EXIT_FAILURE);
-	}
-	if (read_file(path, buf, sizeof(buf)) < 0) {
-		perror("read_file(read_num)");
-		exit(EXIT_FAILURE);
-	}
+		     major(path_stat.st_dev), minor(path_stat.st_dev)) >= sizeof(path))
+		ksft_exit_fail_msg("%s: Pathname is too long\n", __func__);
+
+	if (read_file(path, buf, sizeof(buf)) < 0)
+		ksft_exit_fail_msg("read_file(read_num): %s\n", strerror(errno));
+
 	if (strstr(buf, "DEVTYPE=disk")) {
 		/* Found it */
 		if (snprintf(finfo.dev_queue_read_ahead_path,
 			     sizeof(finfo.dev_queue_read_ahead_path),
 			     "/sys/dev/block/%d:%d/queue/read_ahead_kb",
 			     major(path_stat.st_dev), minor(path_stat.st_dev))
-		    >= sizeof(finfo.dev_queue_read_ahead_path)) {
-			printf("%s: Pathname is too long\n", __func__);
-			exit(EXIT_FAILURE);
-		}
+		    >= sizeof(finfo.dev_queue_read_ahead_path))
+			ksft_exit_fail_msg("%s: Pathname is too long: %s\n", __func__,
+					   strerror(errno));
 		return;
 	}
-	if (!strstr(buf, "DEVTYPE=partition")) {
-		printf("%s: Unknown device type: %s\n", __func__, path);
-		exit(EXIT_FAILURE);
-	}
+	if (!strstr(buf, "DEVTYPE=partition"))
+		ksft_exit_fail_msg("%s: Unknown device type: %s\n", __func__, path);
 	/*
 	 * Partition of block device - need to find actual device.
 	 * Using naming convention that devnameN is partition of
 	 * device devname.
 	 */
 	str = strstr(buf, "DEVNAME=");
-	if (!str) {
-		printf("%s: Could not read: %s", __func__, path);
-		exit(EXIT_FAILURE);
-	}
+	if (!str)
+		ksft_exit_fail_msg("%s: Could not read: %s", __func__, path);
+
 	str += 8;
 	end = str;
 	while (*end) {
@@ -197,16 +185,14 @@ static void get_finfo(const char *dir)
 			if (snprintf(finfo.dev_queue_read_ahead_path,
 				     sizeof(finfo.dev_queue_read_ahead_path),
 				     "/sys/block/%s/queue/read_ahead_kb",
-				     str) >= sizeof(finfo.dev_queue_read_ahead_path)) {
-				printf("%s: Pathname is too long\n", __func__);
-				exit(EXIT_FAILURE);
-			}
+				     str) >= sizeof(finfo.dev_queue_read_ahead_path))
+				ksft_exit_fail_msg("%s: Pathname is too long\n", __func__);
+
 			return;
 		}
 		++end;
 	}
-	printf("%s: Could not read: %s\n", __func__, path);
-	exit(EXIT_FAILURE);
+	ksft_exit_fail_msg("%s: Could not read: %s\n", __func__, path);
 }
 
 static bool check_swap(void *addr, unsigned long size)
@@ -219,26 +205,21 @@ static bool check_swap(void *addr, unsigned long size)
 
 	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
 		       (unsigned long) addr);
-	if (ret >= MAX_LINE_LENGTH) {
-		printf("%s: Pattern is too long\n", __func__);
-		exit(EXIT_FAILURE);
-	}
-
+	if (ret >= MAX_LINE_LENGTH)
+		ksft_exit_fail_msg("%s: Pattern is too long\n", __func__);
 
 	fp = fopen(PID_SMAPS, "r");
-	if (!fp) {
-		printf("%s: Failed to open file %s\n", __func__, PID_SMAPS);
-		exit(EXIT_FAILURE);
-	}
+	if (!fp)
+		ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, PID_SMAPS);
+
 	if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer)))
 		goto err_out;
 
 	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB",
 		       size >> 10);
-	if (ret >= MAX_LINE_LENGTH) {
-		printf("%s: Pattern is too long\n", __func__);
-		exit(EXIT_FAILURE);
-	}
+	if (ret >= MAX_LINE_LENGTH)
+		ksft_exit_fail_msg("%s: Pattern is too long\n", __func__);
+
 	/*
 	 * Fetch the Swap: in the same block and check whether it got
 	 * the expected number of hugeepages next.
@@ -261,10 +242,8 @@ static void *alloc_mapping(int nr)
 
 	p = mmap(BASE_ADDR, nr * hpage_pmd_size, PROT_READ | PROT_WRITE,
 		 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-	if (p != BASE_ADDR) {
-		printf("Failed to allocate VMA at %p\n", BASE_ADDR);
-		exit(EXIT_FAILURE);
-	}
+	if (p != BASE_ADDR)
+		ksft_exit_fail_msg("Failed to allocate VMA at %p\n", BASE_ADDR);
 
 	return p;
 }
@@ -314,19 +293,16 @@ static void *alloc_hpage(struct mem_ops *ops)
 	 * khugepaged on low-load system (like a test machine), which
 	 * would cause MADV_COLLAPSE to fail with EAGAIN.
 	 */
-	printf("Allocate huge page...");
-	if (madvise_collapse_retry(p, hpage_pmd_size)) {
-		perror("madvise(MADV_COLLAPSE)");
-		exit(EXIT_FAILURE);
-	}
-	if (!ops->check_huge(p, 1)) {
-		perror("madvise(MADV_COLLAPSE)");
-		exit(EXIT_FAILURE);
-	}
-	if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) {
-		perror("madvise(MADV_HUGEPAGE)");
-		exit(EXIT_FAILURE);
-	}
+	ksft_print_msg("Allocate huge page...\n");
+	if (madvise_collapse_retry(p, hpage_pmd_size))
+		ksft_exit_fail_msg("madvise(MADV_COLLAPSE): %s\n", strerror(errno));
+
+	if (!ops->check_huge(p, 1))
+		ksft_exit_fail_msg("madvise(MADV_COLLAPSE): %s\n", strerror(errno));
+
+	if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE))
+		ksft_exit_fail_msg("madvise(MADV_HUGEPAGE): %s\n", strerror(errno));
+
 	success("OK");
 	return p;
 }
@@ -335,13 +311,12 @@ static void validate_memory(int *p, unsigned long start, unsigned long end)
 {
 	int i;
 
-	for (i = start / page_size; i < end / page_size; i++) {
-		if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) {
-			printf("Page %d is corrupted: %#x\n",
-					i, p[i * page_size / sizeof(*p)]);
-			exit(EXIT_FAILURE);
-		}
-	}
+	for (i = start / page_size; i < end / page_size; i++)
+		if (p[i * page_size / sizeof(*p)] != i + 0xdead0000)
+			ksft_print_msg("Page %d is corrupted: %#x\n",
+				       i, p[i * page_size / sizeof(*p)]);
+
+	ksft_test_result(i == end/page_size, "Validated memory\n");
 }
 
 static void *anon_setup_area(int nr_hpages)
@@ -371,14 +346,12 @@ static void *file_setup_area(int nr_hpages)
 	unsigned long size;
 
 	unlink(finfo.path);  /* Cleanup from previous failed tests */
-	printf("Creating %s for collapse%s...", finfo.path,
-	       finfo.type == VMA_SHMEM ? " (tmpfs)" : "");
+	ksft_print_msg("Creating %s for collapse%s...\n", finfo.path,
+		       finfo.type == VMA_SHMEM ? " (tmpfs)" : "");
 	fd = open(finfo.path, O_DSYNC | O_CREAT | O_RDWR | O_TRUNC | O_EXCL,
 		  777);
-	if (fd < 0) {
-		perror("open()");
-		exit(EXIT_FAILURE);
-	}
+	if (fd < 0)
+		ksft_exit_fail_msg("open(): %s\n", strerror(errno));
 
 	size = nr_hpages * hpage_pmd_size;
 	p = alloc_mapping(nr_hpages);
@@ -388,18 +361,15 @@ static void *file_setup_area(int nr_hpages)
 	munmap(p, size);
 	success("OK");
 
-	printf("Opening %s read only for collapse...", finfo.path);
+	ksft_print_msg("Opening %s read only for collapse...\n", finfo.path);
 	finfo.fd = open(finfo.path, O_RDONLY, 777);
-	if (finfo.fd < 0) {
-		perror("open()");
-		exit(EXIT_FAILURE);
-	}
+	if (finfo.fd < 0)
+		ksft_exit_fail_msg("open(): %s\n", strerror(errno));
+
 	p = mmap(BASE_ADDR, size, PROT_READ | PROT_EXEC,
 		 MAP_PRIVATE, finfo.fd, 0);
-	if (p == MAP_FAILED || p != BASE_ADDR) {
-		perror("mmap()");
-		exit(EXIT_FAILURE);
-	}
+	if (p == MAP_FAILED || p != BASE_ADDR)
+		ksft_exit_fail_msg("mmap(): %s\n", strerror(errno));
 
 	/* Drop page cache */
 	write_file("/proc/sys/vm/drop_caches", "3", 2);
@@ -416,10 +386,8 @@ static void file_cleanup_area(void *p, unsigned long size)
 
 static void file_fault(void *p, unsigned long start, unsigned long end)
 {
-	if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) {
-		perror("madvise(MADV_POPULATE_READ");
-		exit(EXIT_FAILURE);
-	}
+	if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ))
+		ksft_exit_fail_msg("madvise(MADV_POPULATE_READ: %s\n", strerror(errno));
 }
 
 static bool file_check_huge(void *addr, int nr_hpages)
@@ -430,7 +398,7 @@ static bool file_check_huge(void *addr, int nr_hpages)
 	case VMA_SHMEM:
 		return check_huge_shmem(addr, nr_hpages, hpage_pmd_size);
 	default:
-		exit(EXIT_FAILURE);
+		ksft_exit_fail_msg("Wrong type\n");
 		return false;
 	}
 }
@@ -441,20 +409,16 @@ static void *shmem_setup_area(int nr_hpages)
 	unsigned long size = nr_hpages * hpage_pmd_size;
 
 	finfo.fd = memfd_create("khugepaged-selftest-collapse-shmem", 0);
-	if (finfo.fd < 0)  {
-		perror("memfd_create()");
-		exit(EXIT_FAILURE);
-	}
-	if (ftruncate(finfo.fd, size)) {
-		perror("ftruncate()");
-		exit(EXIT_FAILURE);
-	}
-	p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, MAP_SHARED, finfo.fd,
-		 0);
-	if (p != BASE_ADDR) {
-		perror("mmap()");
-		exit(EXIT_FAILURE);
-	}
+	if (finfo.fd < 0)
+		ksft_exit_fail_msg("memfd_create(): %s\n", strerror(errno));
+
+	if (ftruncate(finfo.fd, size))
+		ksft_exit_fail_msg("ftruncate(): %s\n", strerror(errno));
+
+	p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, MAP_SHARED, finfo.fd, 0);
+	if (p != BASE_ADDR)
+		ksft_exit_fail_msg("mmap(): %s\n", strerror(errno));
+
 	return p;
 }
 
@@ -499,7 +463,7 @@ static void __madvise_collapse(const char *msg, char *p, int nr_hpages,
 	int ret;
 	struct thp_settings settings = *thp_current_settings();
 
-	printf("%s...", msg);
+	ksft_print_msg("%s...\n", msg);
 
 	/*
 	 * Prevent khugepaged interference and tests that MADV_COLLAPSE
@@ -526,10 +490,9 @@ static void madvise_collapse(const char *msg, char *p, int nr_hpages,
 			     struct mem_ops *ops, bool expect)
 {
 	/* Sanity check */
-	if (!ops->check_huge(p, 0)) {
-		printf("Unexpected huge page\n");
-		exit(EXIT_FAILURE);
-	}
+	if (!ops->check_huge(p, 0))
+		ksft_exit_fail_msg("Unexpected huge page\n");
+
 	__madvise_collapse(msg, p, nr_hpages, ops, expect);
 }
 
@@ -541,23 +504,20 @@ static bool wait_for_scan(const char *msg, char *p, int nr_hpages,
 	int timeout = 6; /* 3 seconds */
 
 	/* Sanity check */
-	if (!ops->check_huge(p, 0)) {
-		printf("Unexpected huge page\n");
-		exit(EXIT_FAILURE);
-	}
+	if (!ops->check_huge(p, 0))
+		ksft_exit_fail_msg("Unexpected huge page\n");
 
 	madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);
 
 	/* Wait until the second full_scan completed */
 	full_scans = thp_read_num("khugepaged/full_scans") + 2;
 
-	printf("%s...", msg);
+	ksft_print_msg("%s...\n", msg);
 	while (timeout--) {
 		if (ops->check_huge(p, nr_hpages))
 			break;
 		if (thp_read_num("khugepaged/full_scans") >= full_scans)
 			break;
-		printf(".");
 		usleep(TICK);
 	}
 
@@ -623,7 +583,7 @@ static void alloc_at_fault(void)
 
 	p = alloc_mapping(1);
 	*p = 1;
-	printf("Allocate huge page on fault...");
+	ksft_print_msg("Allocate huge page on fault...\n");
 	if (check_huge_anon(p, 1, hpage_pmd_size))
 		success("OK");
 	else
@@ -632,7 +592,7 @@ static void alloc_at_fault(void)
 	thp_pop_settings();
 
 	madvise(p, page_size, MADV_DONTNEED);
-	printf("Split huge PMD on MADV_DONTNEED...");
+	ksft_print_msg("Split huge PMD on MADV_DONTNEED...\n");
 	if (check_huge_anon(p, 0, hpage_pmd_size))
 		success("OK");
 	else
@@ -688,7 +648,7 @@ static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *o
 
 	if (is_tmpfs(ops)) {
 		/* shmem pages always in the page cache */
-		printf("tmpfs...");
+		ksft_print_msg("tmpfs...\n");
 		skip("Skip");
 		goto skip;
 	}
@@ -717,11 +677,10 @@ static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_op
 	p = ops->setup_area(1);
 	ops->fault(p, 0, hpage_pmd_size);
 
-	printf("Swapout one page...");
-	if (madvise(p, page_size, MADV_PAGEOUT)) {
-		perror("madvise(MADV_PAGEOUT)");
-		exit(EXIT_FAILURE);
-	}
+	ksft_print_msg("Swapout one page...\n");
+	if (madvise(p, page_size, MADV_PAGEOUT))
+		ksft_exit_fail_msg("madvise(MADV_PAGEOUT): %s\n", strerror(errno));
+
 	if (check_swap(p, page_size)) {
 		success("OK");
 	} else {
@@ -744,11 +703,10 @@ static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *o
 	p = ops->setup_area(1);
 	ops->fault(p, 0, hpage_pmd_size);
 
-	printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr);
-	if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) {
-		perror("madvise(MADV_PAGEOUT)");
-		exit(EXIT_FAILURE);
-	}
+	ksft_print_msg("Swapout %d of %d pages...\n", max_ptes_swap + 1, hpage_pmd_nr);
+	if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT))
+		ksft_exit_fail_msg("madvise(MADV_PAGEOUT): %s\n", strerror(errno));
+
 	if (check_swap(p, (max_ptes_swap + 1) * page_size)) {
 		success("OK");
 	} else {
@@ -762,12 +720,11 @@ static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *o
 
 	if (c->enforce_pte_scan_limits) {
 		ops->fault(p, 0, hpage_pmd_size);
-		printf("Swapout %d of %d pages...", max_ptes_swap,
+		ksft_print_msg("Swapout %d of %d pages...\n", max_ptes_swap,
 		       hpage_pmd_nr);
-		if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) {
-			perror("madvise(MADV_PAGEOUT)");
-			exit(EXIT_FAILURE);
-		}
+		if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT))
+			ksft_exit_fail_msg("madvise(MADV_PAGEOUT): %s\n", strerror(errno));
+
 		if (check_swap(p, max_ptes_swap * page_size)) {
 			success("OK");
 		} else {
@@ -791,13 +748,13 @@ static void collapse_single_pte_entry_compound(struct collapse_context *c, struc
 
 	if (is_tmpfs(ops)) {
 		/* MADV_DONTNEED won't evict tmpfs pages */
-		printf("tmpfs...");
+		ksft_print_msg("tmpfs...\n");
 		skip("Skip");
 		goto skip;
 	}
 
 	madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
-	printf("Split huge page leaving single PTE mapping compound page...");
+	ksft_print_msg("Split huge page leaving single PTE mapping compound page...\n");
 	madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED);
 	if (ops->check_huge(p, 0))
 		success("OK");
@@ -816,7 +773,7 @@ static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops
 	void *p;
 
 	p = alloc_hpage(ops);
-	printf("Split huge page leaving single PTE page table full of compound pages...");
+	ksft_print_msg("Split huge page leaving single PTE page table full of compound pages...\n");
 	madvise(p, page_size, MADV_NOHUGEPAGE);
 	madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
 	if (ops->check_huge(p, 0))
@@ -837,15 +794,14 @@ static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops
 
 	p = ops->setup_area(1);
 	for (i = 0; i < hpage_pmd_nr; i++) {
-		printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...",
-				i + 1, hpage_pmd_nr);
+		ksft_print_msg("\rConstruct PTE page table full of different PTE-mapped "
+			       "compound pages %3d/%d...", i + 1, hpage_pmd_nr);
 
 		madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE);
 		ops->fault(BASE_ADDR, 0, hpage_pmd_size);
-		if (!ops->check_huge(BASE_ADDR, 1)) {
-			printf("Failed to allocate huge page\n");
-			exit(EXIT_FAILURE);
-		}
+		if (!ops->check_huge(BASE_ADDR, 1))
+			ksft_exit_fail_msg("Failed to allocate huge page\n");
+
 		madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE);
 
 		p = mremap(BASE_ADDR - i * page_size,
@@ -853,22 +809,20 @@ static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops
 				(i + 1) * page_size,
 				MREMAP_MAYMOVE | MREMAP_FIXED,
 				BASE_ADDR + 2 * hpage_pmd_size);
-		if (p == MAP_FAILED) {
-			perror("mremap+unmap");
-			exit(EXIT_FAILURE);
-		}
+		if (p == MAP_FAILED)
+			ksft_exit_fail_msg("mremap+unmap: %s\n", strerror(errno));
 
 		p = mremap(BASE_ADDR + 2 * hpage_pmd_size,
 				(i + 1) * page_size,
 				(i + 1) * page_size + hpage_pmd_size,
 				MREMAP_MAYMOVE | MREMAP_FIXED,
 				BASE_ADDR - (i + 1) * page_size);
-		if (p == MAP_FAILED) {
-			perror("mremap+alloc");
-			exit(EXIT_FAILURE);
-		}
+		if (p == MAP_FAILED)
+			ksft_exit_fail_msg("mremap+alloc: %s\n", strerror(errno));
 	}
 
+	ksft_print_msg("\n");
+
 	ops->cleanup_area(BASE_ADDR, hpage_pmd_size);
 	ops->fault(p, 0, hpage_pmd_size);
 	if (!ops->check_huge(p, 1))
@@ -890,23 +844,19 @@ static void collapse_fork(struct collapse_context *c, struct mem_ops *ops)
 
 	p = ops->setup_area(1);
 
-	printf("Allocate small page...");
+	ksft_print_msg("Allocate small page...\n");
 	ops->fault(p, 0, page_size);
 	if (ops->check_huge(p, 0))
 		success("OK");
 	else
 		fail("Fail");
 
-	printf("Share small page over fork()...");
+	ksft_print_msg("Share small page over fork()...\n");
 	if (!fork()) {
 		/* Do not touch settings on child exit */
 		skip_settings_restore = true;
-		exit_status = 0;
 
-		if (ops->check_huge(p, 0))
-			success("OK");
-		else
-			fail("Fail");
+		ksft_test_result(ops->check_huge(p, 0), "%s: child\n", __func__);
 
 		ops->fault(p, page_size, 2 * page_size);
 		c->collapse("Collapse PTE table with single page shared with parent process",
@@ -914,13 +864,12 @@ static void collapse_fork(struct collapse_context *c, struct mem_ops *ops)
 
 		validate_memory(p, 0, page_size);
 		ops->cleanup_area(p, hpage_pmd_size);
-		exit(exit_status);
+		exit(0);
 	}
 
 	wait(&wstatus);
-	exit_status += WEXITSTATUS(wstatus);
 
-	printf("Check if parent still has small page...");
+	ksft_print_msg("Check if parent still has small page...\n");
 	if (ops->check_huge(p, 0))
 		success("OK");
 	else
@@ -931,22 +880,17 @@ static void collapse_fork(struct collapse_context *c, struct mem_ops *ops)
 
 static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *ops)
 {
-	int wstatus;
 	void *p;
 
 	p = alloc_hpage(ops);
-	printf("Share huge page over fork()...");
+	ksft_print_msg("Share huge page over fork()...\n");
 	if (!fork()) {
 		/* Do not touch settings on child exit */
 		skip_settings_restore = true;
-		exit_status = 0;
 
-		if (ops->check_huge(p, 1))
-			success("OK");
-		else
-			fail("Fail");
+		ksft_test_result(ops->check_huge(p, 1), "%s: child\n", __func__);
 
-		printf("Split huge page PMD in child process...");
+		ksft_print_msg("Split huge page PMD in child process...\n");
 		madvise(p, page_size, MADV_NOHUGEPAGE);
 		madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
 		if (ops->check_huge(p, 0))
@@ -963,13 +907,12 @@ static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *o
 
 		validate_memory(p, 0, hpage_pmd_size);
 		ops->cleanup_area(p, hpage_pmd_size);
-		exit(exit_status);
+		exit(0);
 	}
 
-	wait(&wstatus);
-	exit_status += WEXITSTATUS(wstatus);
+	wait(NULL);
 
-	printf("Check if parent still has huge page...");
+	ksft_print_msg("Check if parent still has huge page...\n");
 	if (ops->check_huge(p, 1))
 		success("OK");
 	else
@@ -981,23 +924,18 @@ static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *o
 static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops)
 {
 	int max_ptes_shared = thp_read_num("khugepaged/max_ptes_shared");
-	int wstatus;
 	void *p;
 
 	p = alloc_hpage(ops);
-	printf("Share huge page over fork()...");
+	ksft_print_msg("Share huge page over fork()...\n");
 	if (!fork()) {
 		/* Do not touch settings on child exit */
 		skip_settings_restore = true;
-		exit_status = 0;
 
-		if (ops->check_huge(p, 1))
-			success("OK");
-		else
-			fail("Fail");
+		ksft_test_result(ops->check_huge(p, 1), "%s: child\n", __func__);
 
-		printf("Trigger CoW on page %d of %d...",
-				hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr);
+		ksft_print_msg("Trigger CoW on page %d of %d...\n",
+			       hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr);
 		ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size);
 		if (ops->check_huge(p, 0))
 			success("OK");
@@ -1008,8 +946,8 @@ static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops
 			    1, ops, !c->enforce_pte_scan_limits);
 
 		if (c->enforce_pte_scan_limits) {
-			printf("Trigger CoW on page %d of %d...",
-			       hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr);
+			ksft_print_msg("Trigger CoW on page %d of %d...\n",
+				       hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr);
 			ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared) *
 				    page_size);
 			if (ops->check_huge(p, 0))
@@ -1023,13 +961,12 @@ static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops
 
 		validate_memory(p, 0, hpage_pmd_size);
 		ops->cleanup_area(p, hpage_pmd_size);
-		exit(exit_status);
+		exit(0);
 	}
 
-	wait(&wstatus);
-	exit_status += WEXITSTATUS(wstatus);
+	wait(NULL);
 
-	printf("Check if parent still has huge page...");
+	ksft_print_msg("Check if parent still has huge page...\n");
 	if (ops->check_huge(p, 1))
 		success("OK");
 	else
@@ -1083,20 +1020,19 @@ static void madvise_retracted_page_tables(struct collapse_context *c,
 
 static void usage(void)
 {
-	fprintf(stderr, "\nUsage: ./khugepaged [OPTIONS] <test type> [dir]\n\n");
-	fprintf(stderr, "\t<test type>\t: <context>:<mem_type>\n");
-	fprintf(stderr, "\t<context>\t: [all|khugepaged|madvise]\n");
-	fprintf(stderr, "\t<mem_type>\t: [all|anon|file|shmem]\n");
-	fprintf(stderr, "\n\t\"file,all\" mem_type requires [dir] argument\n");
-	fprintf(stderr, "\n\t\"file,all\" mem_type requires kernel built with\n");
-	fprintf(stderr,	"\tCONFIG_READ_ONLY_THP_FOR_FS=y\n");
-	fprintf(stderr, "\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n");
-	fprintf(stderr,	"\tmounted with huge=madvise option for khugepaged tests to work\n");
-	fprintf(stderr,	"\n\tSupported Options:\n");
-	fprintf(stderr,	"\t\t-h: This help message.\n");
-	fprintf(stderr,	"\t\t-s: mTHP size, expressed as page order.\n");
-	fprintf(stderr,	"\t\t    Defaults to 0. Use this size for anon allocations.\n");
-	exit(1);
+	ksft_print_msg("\nUsage: ./khugepaged [OPTIONS] <test type> [dir]\n\n");
+	ksft_print_msg("\t<test type>\t: <context>:<mem_type>\n");
+	ksft_print_msg("\t<context>\t: [all|khugepaged|madvise]\n");
+	ksft_print_msg("\t<mem_type>\t: [all|anon|file|shmem]\n");
+	ksft_print_msg("\n\t\"file,all\" mem_type requires [dir] argument\n");
+	ksft_print_msg("\n\t\"file,all\" mem_type requires kernel built with\n");
+	ksft_print_msg("\tCONFIG_READ_ONLY_THP_FOR_FS=y\n");
+	ksft_print_msg("\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n");
+	ksft_print_msg("\tmounted with huge=madvise option for khugepaged tests to work\n");
+	ksft_print_msg("\n\tSupported Options:\n");
+	ksft_print_msg("\t\t-h: This help message.\n");
+	ksft_print_msg("\t\t-s: mTHP size, expressed as page order.\n");
+	ksft_exit_fail_msg("\t\t    Defaults to 0. Use this size for anon allocations.\n");
 }
 
 static void parse_test_type(int argc, char **argv)
@@ -1190,16 +1126,21 @@ int main(int argc, char **argv)
 		.read_ahead_kb = 0,
 	};
 
+	ksft_print_header();
+
+	if (getuid())
+		ksft_finished();
+
+	ksft_set_plan(65);
+
 	parse_test_type(argc, argv);
 
 	setbuf(stdout, NULL);
 
 	page_size = getpagesize();
 	hpage_pmd_size = read_pmd_pagesize();
-	if (!hpage_pmd_size) {
-		printf("Reading PMD pagesize failed");
-		exit(EXIT_FAILURE);
-	}
+	if (!hpage_pmd_size)
+		ksft_exit_fail_msg("Reading PMD pagesize failed\n");
 	hpage_pmd_nr = hpage_pmd_size / page_size;
 	hpage_pmd_order = __builtin_ctz(hpage_pmd_nr);
 
@@ -1217,7 +1158,7 @@ int main(int argc, char **argv)
 
 #define TEST(t, c, o) do { \
 	if (c && o) { \
-		printf("\nRun test: " #t " (%s:%s)\n", c->name, o->name); \
+		ksft_print_msg("Run test: " #t " (%s:%s)\n", c->name, o->name); \
 		t(c, o); \
 	} \
 	} while (0)
@@ -1281,5 +1222,5 @@ int main(int argc, char **argv)
 	TEST(madvise_retracted_page_tables, madvise_context, file_ops);
 	TEST(madvise_retracted_page_tables, madvise_context, shmem_ops);
 
-	restore_settings(0);
+	ksft_finished();
 }

From 0bd0fe075ee70089850fc819a3b01cc581bcae22 Mon Sep 17 00:00:00 2001
From: Muhammad Usama Anjum <usama.anjum@collabora.com>
Date: Fri, 26 Jan 2024 16:21:24 +0500
Subject: [PATCH 625/707] selftests/mm: hugetlb-read-hwpoison: conform test to
 TAP format output

Conform the layout, informational and status messages to TAP.  No
functional change is intended other than the layout of output messages.

Link: https://lkml.kernel.org/r/20240126112129.1480265-5-usama.anjum@collabora.com
Reviewed-by: Jiaqi Yan <jiaqiyan@google.com>
Signed-off-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 .../selftests/mm/hugetlb-read-hwpoison.c      | 116 ++++++++----------
 1 file changed, 54 insertions(+), 62 deletions(-)

diff --git a/tools/testing/selftests/mm/hugetlb-read-hwpoison.c b/tools/testing/selftests/mm/hugetlb-read-hwpoison.c
index ba6cc6f9cabcdf..23b41b88c6aff0 100644
--- a/tools/testing/selftests/mm/hugetlb-read-hwpoison.c
+++ b/tools/testing/selftests/mm/hugetlb-read-hwpoison.c
@@ -58,8 +58,8 @@ static bool verify_chunk(char *buf, size_t len, char val)
 
 	for (i = 0; i < len; ++i) {
 		if (buf[i] != val) {
-			printf(PREFIX ERROR_PREFIX "check fail: buf[%lu] = %u != %u\n",
-				i, buf[i], val);
+			ksft_print_msg(PREFIX ERROR_PREFIX "check fail: buf[%lu] = %u != %u\n",
+				       i, buf[i], val);
 			return false;
 		}
 	}
@@ -75,21 +75,21 @@ static bool seek_read_hugepage_filemap(int fd, size_t len, size_t wr_chunk_size,
 	ssize_t total_ret_count = 0;
 	char val = offset / wr_chunk_size + offset % wr_chunk_size;
 
-	printf(PREFIX PREFIX "init val=%u with offset=0x%lx\n", val, offset);
-	printf(PREFIX PREFIX "expect to read 0x%lx bytes of data in total\n",
-	       expected);
+	ksft_print_msg(PREFIX PREFIX "init val=%u with offset=0x%lx\n", val, offset);
+	ksft_print_msg(PREFIX PREFIX "expect to read 0x%lx bytes of data in total\n",
+		       expected);
 	if (lseek(fd, offset, SEEK_SET) < 0) {
-		perror(PREFIX ERROR_PREFIX "seek failed");
+		ksft_perror(PREFIX ERROR_PREFIX "seek failed");
 		return false;
 	}
 
 	while (offset + total_ret_count < len) {
 		ret_count = read(fd, buf, wr_chunk_size);
 		if (ret_count == 0) {
-			printf(PREFIX PREFIX "read reach end of the file\n");
+			ksft_print_msg(PREFIX PREFIX "read reach end of the file\n");
 			break;
 		} else if (ret_count < 0) {
-			perror(PREFIX ERROR_PREFIX "read failed");
+			ksft_perror(PREFIX ERROR_PREFIX "read failed");
 			break;
 		}
 		++val;
@@ -98,8 +98,8 @@ static bool seek_read_hugepage_filemap(int fd, size_t len, size_t wr_chunk_size,
 
 		total_ret_count += ret_count;
 	}
-	printf(PREFIX PREFIX "actually read 0x%lx bytes of data in total\n",
-	       total_ret_count);
+	ksft_print_msg(PREFIX PREFIX "actually read 0x%lx bytes of data in total\n",
+		       total_ret_count);
 
 	return total_ret_count == expected;
 }
@@ -112,15 +112,15 @@ static bool read_hugepage_filemap(int fd, size_t len,
 	ssize_t total_ret_count = 0;
 	char val = 0;
 
-	printf(PREFIX PREFIX "expect to read 0x%lx bytes of data in total\n",
-	       expected);
+	ksft_print_msg(PREFIX PREFIX "expect to read 0x%lx bytes of data in total\n",
+		       expected);
 	while (total_ret_count < len) {
 		ret_count = read(fd, buf, wr_chunk_size);
 		if (ret_count == 0) {
-			printf(PREFIX PREFIX "read reach end of the file\n");
+			ksft_print_msg(PREFIX PREFIX "read reach end of the file\n");
 			break;
 		} else if (ret_count < 0) {
-			perror(PREFIX ERROR_PREFIX "read failed");
+			ksft_perror(PREFIX ERROR_PREFIX "read failed");
 			break;
 		}
 		++val;
@@ -129,8 +129,8 @@ static bool read_hugepage_filemap(int fd, size_t len,
 
 		total_ret_count += ret_count;
 	}
-	printf(PREFIX PREFIX "actually read 0x%lx bytes of data in total\n",
-	       total_ret_count);
+	ksft_print_msg(PREFIX PREFIX "actually read 0x%lx bytes of data in total\n",
+		       total_ret_count);
 
 	return total_ret_count == expected;
 }
@@ -142,14 +142,14 @@ test_hugetlb_read(int fd, size_t len, size_t wr_chunk_size)
 	char *filemap = NULL;
 
 	if (ftruncate(fd, len) < 0) {
-		perror(PREFIX ERROR_PREFIX "ftruncate failed");
+		ksft_perror(PREFIX ERROR_PREFIX "ftruncate failed");
 		return status;
 	}
 
 	filemap = mmap(NULL, len, PROT_READ | PROT_WRITE,
 		       MAP_SHARED | MAP_POPULATE, fd, 0);
 	if (filemap == MAP_FAILED) {
-		perror(PREFIX ERROR_PREFIX "mmap for primary mapping failed");
+		ksft_perror(PREFIX ERROR_PREFIX "mmap for primary mapping failed");
 		goto done;
 	}
 
@@ -162,7 +162,7 @@ test_hugetlb_read(int fd, size_t len, size_t wr_chunk_size)
 	munmap(filemap, len);
 done:
 	if (ftruncate(fd, 0) < 0) {
-		perror(PREFIX ERROR_PREFIX "ftruncate back to 0 failed");
+		ksft_perror(PREFIX ERROR_PREFIX "ftruncate back to 0 failed");
 		status = TEST_FAILED;
 	}
 
@@ -179,14 +179,14 @@ test_hugetlb_read_hwpoison(int fd, size_t len, size_t wr_chunk_size,
 	const unsigned long pagesize = getpagesize();
 
 	if (ftruncate(fd, len) < 0) {
-		perror(PREFIX ERROR_PREFIX "ftruncate failed");
+		ksft_perror(PREFIX ERROR_PREFIX "ftruncate failed");
 		return status;
 	}
 
 	filemap = mmap(NULL, len, PROT_READ | PROT_WRITE,
 		       MAP_SHARED | MAP_POPULATE, fd, 0);
 	if (filemap == MAP_FAILED) {
-		perror(PREFIX ERROR_PREFIX "mmap for primary mapping failed");
+		ksft_perror(PREFIX ERROR_PREFIX "mmap for primary mapping failed");
 		goto done;
 	}
 
@@ -201,7 +201,7 @@ test_hugetlb_read_hwpoison(int fd, size_t len, size_t wr_chunk_size,
 	 */
 	hwp_addr = filemap + len / 2 + pagesize;
 	if (madvise(hwp_addr, pagesize, MADV_HWPOISON) < 0) {
-		perror(PREFIX ERROR_PREFIX "MADV_HWPOISON failed");
+		ksft_perror(PREFIX ERROR_PREFIX "MADV_HWPOISON failed");
 		goto unmap;
 	}
 
@@ -228,7 +228,7 @@ test_hugetlb_read_hwpoison(int fd, size_t len, size_t wr_chunk_size,
 	munmap(filemap, len);
 done:
 	if (ftruncate(fd, 0) < 0) {
-		perror(PREFIX ERROR_PREFIX "ftruncate back to 0 failed");
+		ksft_perror(PREFIX ERROR_PREFIX "ftruncate back to 0 failed");
 		status = TEST_FAILED;
 	}
 
@@ -240,27 +240,32 @@ static int create_hugetlbfs_file(struct statfs *file_stat)
 	int fd;
 
 	fd = memfd_create("hugetlb_tmp", MFD_HUGETLB);
-	if (fd < 0) {
-		perror(PREFIX ERROR_PREFIX "could not open hugetlbfs file");
-		return -1;
-	}
+	if (fd < 0)
+		ksft_exit_fail_msg(PREFIX ERROR_PREFIX "could not open hugetlbfs file: %s\n",
+				   strerror(errno));
 
 	memset(file_stat, 0, sizeof(*file_stat));
+
 	if (fstatfs(fd, file_stat)) {
-		perror(PREFIX ERROR_PREFIX "fstatfs failed");
-		goto close;
+		close(fd);
+		ksft_exit_fail_msg(PREFIX ERROR_PREFIX "fstatfs failed: %s\n", strerror(errno));
 	}
 	if (file_stat->f_type != HUGETLBFS_MAGIC) {
-		printf(PREFIX ERROR_PREFIX "not hugetlbfs file\n");
-		goto close;
+		close(fd);
+		ksft_exit_fail_msg(PREFIX ERROR_PREFIX "not hugetlbfs file\n");
 	}
 
 	return fd;
-close:
-	close(fd);
-	return -1;
 }
 
+#define KSFT_PRINT_MSG(status, fmt, ...)					\
+	do {									\
+		if (status == TEST_SKIPPED)					\
+			ksft_test_result_skip(fmt, __VA_ARGS__);		\
+		else								\
+			ksft_test_result(status == TEST_PASSED, fmt, __VA_ARGS__); \
+	} while (0)
+
 int main(void)
 {
 	int fd;
@@ -273,50 +278,37 @@ int main(void)
 	};
 	size_t i;
 
+	ksft_print_header();
+	ksft_set_plan(12);
+
 	for (i = 0; i < ARRAY_SIZE(wr_chunk_sizes); ++i) {
-		printf("Write/read chunk size=0x%lx\n",
-		       wr_chunk_sizes[i]);
+		ksft_print_msg("Write/read chunk size=0x%lx\n",
+			       wr_chunk_sizes[i]);
 
 		fd = create_hugetlbfs_file(&file_stat);
-		if (fd < 0)
-			goto create_failure;
-		printf(PREFIX "HugeTLB read regression test...\n");
+		ksft_print_msg(PREFIX "HugeTLB read regression test...\n");
 		status = test_hugetlb_read(fd, file_stat.f_bsize,
 					   wr_chunk_sizes[i]);
-		printf(PREFIX "HugeTLB read regression test...%s\n",
-		       status_to_str(status));
+		KSFT_PRINT_MSG(status, PREFIX "HugeTLB read regression test...%s\n",
+			       status_to_str(status));
 		close(fd);
-		if (status == TEST_FAILED)
-			return -1;
 
 		fd = create_hugetlbfs_file(&file_stat);
-		if (fd < 0)
-			goto create_failure;
-		printf(PREFIX "HugeTLB read HWPOISON test...\n");
+		ksft_print_msg(PREFIX "HugeTLB read HWPOISON test...\n");
 		status = test_hugetlb_read_hwpoison(fd, file_stat.f_bsize,
 						    wr_chunk_sizes[i], false);
-		printf(PREFIX "HugeTLB read HWPOISON test...%s\n",
-		       status_to_str(status));
+		KSFT_PRINT_MSG(status, PREFIX "HugeTLB read HWPOISON test...%s\n",
+			       status_to_str(status));
 		close(fd);
-		if (status == TEST_FAILED)
-			return -1;
 
 		fd = create_hugetlbfs_file(&file_stat);
-		if (fd < 0)
-			goto create_failure;
-		printf(PREFIX "HugeTLB seek then read HWPOISON test...\n");
+		ksft_print_msg(PREFIX "HugeTLB seek then read HWPOISON test...\n");
 		status = test_hugetlb_read_hwpoison(fd, file_stat.f_bsize,
 						    wr_chunk_sizes[i], true);
-		printf(PREFIX "HugeTLB seek then read HWPOISON test...%s\n",
-		       status_to_str(status));
+		KSFT_PRINT_MSG(status, PREFIX "HugeTLB seek then read HWPOISON test...%s\n",
+			       status_to_str(status));
 		close(fd);
-		if (status == TEST_FAILED)
-			return -1;
 	}
 
-	return 0;
-
-create_failure:
-	printf(ERROR_PREFIX "Abort test: failed to create hugetlbfs file\n");
-	return -1;
+	ksft_finished();
 }

From c9850eacbd63e0cc246c29b1efd6664733575fb5 Mon Sep 17 00:00:00 2001
From: Muhammad Usama Anjum <usama.anjum@collabora.com>
Date: Fri, 26 Jan 2024 16:21:26 +0500
Subject: [PATCH 626/707] selftests/mm: config: add missing configs

Add configurations which are needed for
- hugetlb-read-hwpoison
- ksm_functional_test and ksm_test

Link: https://lkml.kernel.org/r/20240126112129.1480265-7-usama.anjum@collabora.com
Signed-off-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
Cc: Jiaqi Yan <jiaqiyan@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/config | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/testing/selftests/mm/config b/tools/testing/selftests/mm/config
index 4309916f629e36..d16a72036eb7f1 100644
--- a/tools/testing/selftests/mm/config
+++ b/tools/testing/selftests/mm/config
@@ -7,3 +7,6 @@ CONFIG_TEST_HMM=m
 CONFIG_GUP_TEST=y
 CONFIG_TRANSPARENT_HUGEPAGE=y
 CONFIG_MEM_SOFT_DIRTY=y
+CONFIG_MEMORY_FAILURE=y
+CONFIG_HWPOISON_INJECT=y
+CONFIG_KSM=y

From b4d47a53076e5b359545c1d7c011d0fa61d629f0 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 29 Jan 2024 10:37:29 -0800
Subject: [PATCH 627/707] iov_iter: avoid wrap-around instrumentation in
 copy_compat_iovec_from_user()

The loop counter "i" in copy_compat_iovec_from_user() is an int, but
because the nr_segs argument is unsigned long, the signed overflow
sanitizer got worried "i" could wrap around.  Instead of making "i" an
unsigned long (which may enlarge the type size), switch both nr_segs and i
to u32.  There is no truncation with nr_segs since it is never larger than
UIO_MAXIOV anyway.  This keeps sanitizer instrumentation[1] out of a
UACCESS path:

vmlinux.o: warning: objtool: copy_compat_iovec_from_user+0xa9: call to __ubsan_handle_add_overflow() with UACCESS enabled

Link: https://github.com/KSPP/linux/issues/26 [1]
Link: https://lkml.kernel.org/r/20240129183729.work.991-kees@kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 lib/iov_iter.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index e0aa6b440ca5f4..d797a43dca91ac 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1166,11 +1166,12 @@ const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
 EXPORT_SYMBOL(dup_iter);
 
 static __noclone int copy_compat_iovec_from_user(struct iovec *iov,
-		const struct iovec __user *uvec, unsigned long nr_segs)
+		const struct iovec __user *uvec, u32 nr_segs)
 {
 	const struct compat_iovec __user *uiov =
 		(const struct compat_iovec __user *)uvec;
-	int ret = -EFAULT, i;
+	int ret = -EFAULT;
+	u32 i;
 
 	if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
 		return -EFAULT;

From 1001c3c3a79fd285f0b2a1793206dde92a9bda54 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=BCnther=20Noack?= <gnoack@google.com>
Date: Fri, 8 Dec 2023 16:51:17 +0100
Subject: [PATCH 628/707] selftests/landlock: Test IOCTL support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Exercises Landlock's IOCTL feature in different combinations of
handling and permitting the rights LANDLOCK_ACCESS_FS_IOCTL,
LANDLOCK_ACCESS_FS_READ_FILE, LANDLOCK_ACCESS_FS_WRITE_FILE and
LANDLOCK_ACCESS_FS_READ_DIR, and in different combinations of using
files and directories.

Signed-off-by: Günther Noack <gnoack@google.com>
Link: https://lore.kernel.org/r/20231208155121.1943775-6-gnoack@google.com
[mic: Move down linux/fs.h include to fix build issue with old libc]
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---
 tools/testing/selftests/landlock/fs_test.c | 432 ++++++++++++++++++++-
 1 file changed, 429 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c
index 3203f4a5bc8595..8c1f08ad8c9114 100644
--- a/tools/testing/selftests/landlock/fs_test.c
+++ b/tools/testing/selftests/landlock/fs_test.c
@@ -23,6 +23,8 @@
 #include <sys/vfs.h>
 #include <unistd.h>
 
+#include <linux/fs.h>
+
 #include "common.h"
 
 #ifndef renameat2
@@ -735,6 +737,9 @@ static int create_ruleset(struct __test_metadata *const _metadata,
 	}
 
 	for (i = 0; rules[i].path; i++) {
+		if (!rules[i].access)
+			continue;
+
 		add_path_beneath(_metadata, ruleset_fd, rules[i].access,
 				 rules[i].path);
 	}
@@ -3443,7 +3448,7 @@ TEST_F_FORK(layout1, truncate_unhandled)
 			      LANDLOCK_ACCESS_FS_WRITE_FILE;
 	int ruleset_fd;
 
-	/* Enable Landlock. */
+	/* Enables Landlock. */
 	ruleset_fd = create_ruleset(_metadata, handled, rules);
 
 	ASSERT_LE(0, ruleset_fd);
@@ -3526,7 +3531,7 @@ TEST_F_FORK(layout1, truncate)
 			      LANDLOCK_ACCESS_FS_TRUNCATE;
 	int ruleset_fd;
 
-	/* Enable Landlock. */
+	/* Enables Landlock. */
 	ruleset_fd = create_ruleset(_metadata, handled, rules);
 
 	ASSERT_LE(0, ruleset_fd);
@@ -3752,7 +3757,7 @@ TEST_F_FORK(ftruncate, open_and_ftruncate)
 	};
 	int fd, ruleset_fd;
 
-	/* Enable Landlock. */
+	/* Enables Landlock. */
 	ruleset_fd = create_ruleset(_metadata, variant->handled, rules);
 	ASSERT_LE(0, ruleset_fd);
 	enforce_ruleset(_metadata, ruleset_fd);
@@ -3829,6 +3834,16 @@ TEST_F_FORK(ftruncate, open_and_ftruncate_in_different_processes)
 	ASSERT_EQ(0, close(socket_fds[1]));
 }
 
+/* Invokes the FS_IOC_GETFLAGS IOCTL and returns its errno or 0. */
+static int test_fs_ioc_getflags_ioctl(int fd)
+{
+	uint32_t flags;
+
+	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0)
+		return errno;
+	return 0;
+}
+
 TEST(memfd_ftruncate)
 {
 	int fd;
@@ -3845,6 +3860,417 @@ TEST(memfd_ftruncate)
 	ASSERT_EQ(0, close(fd));
 }
 
+/* clang-format off */
+FIXTURE(ioctl) {};
+/* clang-format on */
+
+FIXTURE_SETUP(ioctl)
+{
+	prepare_layout(_metadata);
+	create_file(_metadata, file1_s1d1);
+}
+
+FIXTURE_TEARDOWN(ioctl)
+{
+	EXPECT_EQ(0, remove_path(file1_s1d1));
+	cleanup_layout(_metadata);
+}
+
+FIXTURE_VARIANT(ioctl)
+{
+	const __u64 handled;
+	const __u64 allowed;
+	const mode_t open_mode;
+	/*
+	 * These are the expected IOCTL results for a representative IOCTL from
+	 * each of the IOCTL groups.  We only distinguish the 0 and EACCES
+	 * results here, and treat other errors as 0.
+	 */
+	const int expected_fioqsize_result; /* G1 */
+	const int expected_fibmap_result; /* G2 */
+	const int expected_fionread_result; /* G3 */
+	const int expected_fs_ioc_zero_range_result; /* G4 */
+	const int expected_fs_ioc_getflags_result; /* other */
+};
+
+/* clang-format off */
+FIXTURE_VARIANT_ADD(ioctl, handled_i_allowed_none) {
+	/* clang-format on */
+	.handled = LANDLOCK_ACCESS_FS_IOCTL,
+	.allowed = 0,
+	.open_mode = O_RDWR,
+	.expected_fioqsize_result = EACCES,
+	.expected_fibmap_result = EACCES,
+	.expected_fionread_result = EACCES,
+	.expected_fs_ioc_zero_range_result = EACCES,
+	.expected_fs_ioc_getflags_result = EACCES,
+};
+
+/* clang-format off */
+FIXTURE_VARIANT_ADD(ioctl, handled_i_allowed_i) {
+	/* clang-format on */
+	.handled = LANDLOCK_ACCESS_FS_IOCTL,
+	.allowed = LANDLOCK_ACCESS_FS_IOCTL,
+	.open_mode = O_RDWR,
+	.expected_fioqsize_result = 0,
+	.expected_fibmap_result = 0,
+	.expected_fionread_result = 0,
+	.expected_fs_ioc_zero_range_result = 0,
+	.expected_fs_ioc_getflags_result = 0,
+};
+
+/* clang-format off */
+FIXTURE_VARIANT_ADD(ioctl, unhandled) {
+	/* clang-format on */
+	.handled = LANDLOCK_ACCESS_FS_EXECUTE,
+	.allowed = LANDLOCK_ACCESS_FS_EXECUTE,
+	.open_mode = O_RDWR,
+	.expected_fioqsize_result = 0,
+	.expected_fibmap_result = 0,
+	.expected_fionread_result = 0,
+	.expected_fs_ioc_zero_range_result = 0,
+	.expected_fs_ioc_getflags_result = 0,
+};
+
+/* clang-format off */
+FIXTURE_VARIANT_ADD(ioctl, handled_rwd_allowed_r) {
+	/* clang-format on */
+	.handled = LANDLOCK_ACCESS_FS_READ_FILE |
+		   LANDLOCK_ACCESS_FS_WRITE_FILE | LANDLOCK_ACCESS_FS_READ_DIR,
+	.allowed = LANDLOCK_ACCESS_FS_READ_FILE,
+	.open_mode = O_RDONLY,
+	/* If LANDLOCK_ACCESS_FS_IOCTL is not handled, all IOCTLs work. */
+	.expected_fioqsize_result = 0,
+	.expected_fibmap_result = 0,
+	.expected_fionread_result = 0,
+	.expected_fs_ioc_zero_range_result = 0,
+	.expected_fs_ioc_getflags_result = 0,
+};
+
+/* clang-format off */
+FIXTURE_VARIANT_ADD(ioctl, handled_rwd_allowed_w) {
+	/* clang-format on */
+	.handled = LANDLOCK_ACCESS_FS_READ_FILE |
+		   LANDLOCK_ACCESS_FS_WRITE_FILE | LANDLOCK_ACCESS_FS_READ_DIR,
+	.allowed = LANDLOCK_ACCESS_FS_WRITE_FILE,
+	.open_mode = O_WRONLY,
+	/* If LANDLOCK_ACCESS_FS_IOCTL is not handled, all IOCTLs work. */
+	.expected_fioqsize_result = 0,
+	.expected_fibmap_result = 0,
+	.expected_fionread_result = 0,
+	.expected_fs_ioc_zero_range_result = 0,
+	.expected_fs_ioc_getflags_result = 0,
+};
+
+/* clang-format off */
+FIXTURE_VARIANT_ADD(ioctl, handled_ri_allowed_r) {
+	/* clang-format on */
+	.handled = LANDLOCK_ACCESS_FS_READ_FILE | LANDLOCK_ACCESS_FS_IOCTL,
+	.allowed = LANDLOCK_ACCESS_FS_READ_FILE,
+	.open_mode = O_RDONLY,
+	.expected_fioqsize_result = 0,
+	.expected_fibmap_result = 0,
+	.expected_fionread_result = 0,
+	.expected_fs_ioc_zero_range_result = EACCES,
+	.expected_fs_ioc_getflags_result = EACCES,
+};
+
+/* clang-format off */
+FIXTURE_VARIANT_ADD(ioctl, handled_wi_allowed_w) {
+	/* clang-format on */
+	.handled = LANDLOCK_ACCESS_FS_WRITE_FILE | LANDLOCK_ACCESS_FS_IOCTL,
+	.allowed = LANDLOCK_ACCESS_FS_WRITE_FILE,
+	.open_mode = O_WRONLY,
+	.expected_fioqsize_result = 0,
+	.expected_fibmap_result = 0,
+	.expected_fionread_result = EACCES,
+	.expected_fs_ioc_zero_range_result = 0,
+	.expected_fs_ioc_getflags_result = EACCES,
+};
+
+/* clang-format off */
+FIXTURE_VARIANT_ADD(ioctl, handled_di_allowed_d) {
+	/* clang-format on */
+	.handled = LANDLOCK_ACCESS_FS_READ_DIR | LANDLOCK_ACCESS_FS_IOCTL,
+	.allowed = LANDLOCK_ACCESS_FS_READ_DIR,
+	.open_mode = O_RDWR,
+	.expected_fioqsize_result = 0,
+	.expected_fibmap_result = EACCES,
+	.expected_fionread_result = EACCES,
+	.expected_fs_ioc_zero_range_result = EACCES,
+	.expected_fs_ioc_getflags_result = EACCES,
+};
+
+/* clang-format off */
+FIXTURE_VARIANT_ADD(ioctl, handled_rwi_allowed_rw) {
+	/* clang-format on */
+	.handled = LANDLOCK_ACCESS_FS_READ_FILE |
+		   LANDLOCK_ACCESS_FS_WRITE_FILE | LANDLOCK_ACCESS_FS_IOCTL,
+	.allowed = LANDLOCK_ACCESS_FS_READ_FILE | LANDLOCK_ACCESS_FS_WRITE_FILE,
+	.open_mode = O_RDWR,
+	.expected_fioqsize_result = 0,
+	.expected_fibmap_result = 0,
+	.expected_fionread_result = 0,
+	.expected_fs_ioc_zero_range_result = 0,
+	.expected_fs_ioc_getflags_result = EACCES,
+};
+
+/* clang-format off */
+FIXTURE_VARIANT_ADD(ioctl, handled_rwi_allowed_r) {
+	/* clang-format on */
+	.handled = LANDLOCK_ACCESS_FS_READ_FILE |
+		   LANDLOCK_ACCESS_FS_WRITE_FILE | LANDLOCK_ACCESS_FS_IOCTL,
+	.allowed = LANDLOCK_ACCESS_FS_READ_FILE,
+	.open_mode = O_RDONLY,
+	.expected_fioqsize_result = 0,
+	.expected_fibmap_result = 0,
+	.expected_fionread_result = 0,
+	.expected_fs_ioc_zero_range_result = EACCES,
+	.expected_fs_ioc_getflags_result = EACCES,
+};
+
+/* clang-format off */
+FIXTURE_VARIANT_ADD(ioctl, handled_rwi_allowed_ri) {
+	/* clang-format on */
+	.handled = LANDLOCK_ACCESS_FS_READ_FILE |
+		   LANDLOCK_ACCESS_FS_WRITE_FILE | LANDLOCK_ACCESS_FS_IOCTL,
+	.allowed = LANDLOCK_ACCESS_FS_READ_FILE | LANDLOCK_ACCESS_FS_IOCTL,
+	.open_mode = O_RDONLY,
+	.expected_fioqsize_result = 0,
+	.expected_fibmap_result = 0,
+	.expected_fionread_result = 0,
+	.expected_fs_ioc_zero_range_result = EACCES,
+	.expected_fs_ioc_getflags_result = 0,
+};
+
+/* clang-format off */
+FIXTURE_VARIANT_ADD(ioctl, handled_rwi_allowed_w) {
+	/* clang-format on */
+	.handled = LANDLOCK_ACCESS_FS_READ_FILE |
+		   LANDLOCK_ACCESS_FS_WRITE_FILE | LANDLOCK_ACCESS_FS_IOCTL,
+	.allowed = LANDLOCK_ACCESS_FS_WRITE_FILE,
+	.open_mode = O_WRONLY,
+	.expected_fioqsize_result = 0,
+	.expected_fibmap_result = 0,
+	.expected_fionread_result = EACCES,
+	.expected_fs_ioc_zero_range_result = 0,
+	.expected_fs_ioc_getflags_result = EACCES,
+};
+
+/* clang-format off */
+FIXTURE_VARIANT_ADD(ioctl, handled_rwi_allowed_wi) {
+	/* clang-format on */
+	.handled = LANDLOCK_ACCESS_FS_READ_FILE |
+		   LANDLOCK_ACCESS_FS_WRITE_FILE | LANDLOCK_ACCESS_FS_IOCTL,
+	.allowed = LANDLOCK_ACCESS_FS_WRITE_FILE | LANDLOCK_ACCESS_FS_IOCTL,
+	.open_mode = O_WRONLY,
+	.expected_fioqsize_result = 0,
+	.expected_fibmap_result = 0,
+	.expected_fionread_result = EACCES,
+	.expected_fs_ioc_zero_range_result = 0,
+	.expected_fs_ioc_getflags_result = 0,
+};
+
+static int test_fioqsize_ioctl(int fd)
+{
+	size_t sz;
+
+	if (ioctl(fd, FIOQSIZE, &sz) < 0)
+		return errno;
+	return 0;
+}
+
+static int test_fibmap_ioctl(int fd)
+{
+	int blk = 0;
+
+	/*
+	 * We only want to distinguish here whether Landlock already caught it,
+	 * so we treat anything but EACCESS as success.  (It commonly returns
+	 * EPERM when missing CAP_SYS_RAWIO.)
+	 */
+	if (ioctl(fd, FIBMAP, &blk) < 0 && errno == EACCES)
+		return errno;
+	return 0;
+}
+
+static int test_fionread_ioctl(int fd)
+{
+	size_t sz = 0;
+
+	if (ioctl(fd, FIONREAD, &sz) < 0 && errno == EACCES)
+		return errno;
+	return 0;
+}
+
+#define FS_IOC_ZERO_RANGE _IOW('X', 57, struct space_resv)
+
+static int test_fs_ioc_zero_range_ioctl(int fd)
+{
+	struct space_resv {
+		__s16 l_type;
+		__s16 l_whence;
+		__s64 l_start;
+		__s64 l_len; /* len == 0 means until end of file */
+		__s32 l_sysid;
+		__u32 l_pid;
+		__s32 l_pad[4]; /* reserved area */
+	} reservation = {};
+	/*
+	 * This can fail for various reasons, but we only want to distinguish
+	 * here whether Landlock already caught it, so we treat anything but
+	 * EACCES as success.
+	 */
+	if (ioctl(fd, FS_IOC_ZERO_RANGE, &reservation) < 0 && errno == EACCES)
+		return errno;
+	return 0;
+}
+
+TEST_F_FORK(ioctl, handle_dir_access_file)
+{
+	const int flag = 0;
+	const struct rule rules[] = {
+		{
+			.path = dir_s1d1,
+			.access = variant->allowed,
+		},
+		{},
+	};
+	int file_fd, ruleset_fd;
+
+	/* Enables Landlock. */
+	ruleset_fd = create_ruleset(_metadata, variant->handled, rules);
+	ASSERT_LE(0, ruleset_fd);
+	enforce_ruleset(_metadata, ruleset_fd);
+	ASSERT_EQ(0, close(ruleset_fd));
+
+	file_fd = open(file1_s1d1, variant->open_mode);
+	ASSERT_LE(0, file_fd);
+
+	/*
+	 * Checks that IOCTL commands in each IOCTL group return the expected
+	 * errors.
+	 */
+	EXPECT_EQ(variant->expected_fioqsize_result,
+		  test_fioqsize_ioctl(file_fd));
+	EXPECT_EQ(variant->expected_fibmap_result, test_fibmap_ioctl(file_fd));
+	EXPECT_EQ(variant->expected_fionread_result,
+		  test_fionread_ioctl(file_fd));
+	EXPECT_EQ(variant->expected_fs_ioc_zero_range_result,
+		  test_fs_ioc_zero_range_ioctl(file_fd));
+	EXPECT_EQ(variant->expected_fs_ioc_getflags_result,
+		  test_fs_ioc_getflags_ioctl(file_fd));
+
+	/* Checks that unrestrictable commands are unrestricted. */
+	EXPECT_EQ(0, ioctl(file_fd, FIOCLEX));
+	EXPECT_EQ(0, ioctl(file_fd, FIONCLEX));
+	EXPECT_EQ(0, ioctl(file_fd, FIONBIO, &flag));
+	EXPECT_EQ(0, ioctl(file_fd, FIOASYNC, &flag));
+
+	ASSERT_EQ(0, close(file_fd));
+}
+
+TEST_F_FORK(ioctl, handle_dir_access_dir)
+{
+	const char *const path = dir_s1d1;
+	const int flag = 0;
+	const struct rule rules[] = {
+		{
+			.path = path,
+			.access = variant->allowed,
+		},
+		{},
+	};
+	int dir_fd, ruleset_fd;
+
+	/* Enables Landlock. */
+	ruleset_fd = create_ruleset(_metadata, variant->handled, rules);
+	ASSERT_LE(0, ruleset_fd);
+	enforce_ruleset(_metadata, ruleset_fd);
+	ASSERT_EQ(0, close(ruleset_fd));
+
+	/*
+	 * Ignore variant->open_mode for this test, as we intend to open a
+	 * directory.  If the directory can not be opened, the variant is
+	 * infeasible to test with an opened directory.
+	 */
+	dir_fd = open(path, O_RDONLY);
+	if (dir_fd < 0)
+		return;
+
+	/*
+	 * Checks that IOCTL commands in each IOCTL group return the expected
+	 * errors.
+	 */
+	EXPECT_EQ(variant->expected_fioqsize_result,
+		  test_fioqsize_ioctl(dir_fd));
+	EXPECT_EQ(variant->expected_fibmap_result, test_fibmap_ioctl(dir_fd));
+	EXPECT_EQ(variant->expected_fionread_result,
+		  test_fionread_ioctl(dir_fd));
+	EXPECT_EQ(variant->expected_fs_ioc_zero_range_result,
+		  test_fs_ioc_zero_range_ioctl(dir_fd));
+	EXPECT_EQ(variant->expected_fs_ioc_getflags_result,
+		  test_fs_ioc_getflags_ioctl(dir_fd));
+
+	/* Checks that unrestrictable commands are unrestricted. */
+	EXPECT_EQ(0, ioctl(dir_fd, FIOCLEX));
+	EXPECT_EQ(0, ioctl(dir_fd, FIONCLEX));
+	EXPECT_EQ(0, ioctl(dir_fd, FIONBIO, &flag));
+	EXPECT_EQ(0, ioctl(dir_fd, FIOASYNC, &flag));
+
+	ASSERT_EQ(0, close(dir_fd));
+}
+
+TEST_F_FORK(ioctl, handle_file_access_file)
+{
+	const char *const path = file1_s1d1;
+	const int flag = 0;
+	const struct rule rules[] = {
+		{
+			.path = path,
+			.access = variant->allowed,
+		},
+		{},
+	};
+	int file_fd, ruleset_fd;
+
+	if (variant->allowed & LANDLOCK_ACCESS_FS_READ_DIR) {
+		SKIP(return, "LANDLOCK_ACCESS_FS_READ_DIR "
+			     "can not be granted on files");
+	}
+
+	/* Enables Landlock. */
+	ruleset_fd = create_ruleset(_metadata, variant->handled, rules);
+	ASSERT_LE(0, ruleset_fd);
+	enforce_ruleset(_metadata, ruleset_fd);
+	ASSERT_EQ(0, close(ruleset_fd));
+
+	file_fd = open(path, variant->open_mode);
+	ASSERT_LE(0, file_fd);
+
+	/*
+	 * Checks that IOCTL commands in each IOCTL group return the expected
+	 * errors.
+	 */
+	EXPECT_EQ(variant->expected_fioqsize_result,
+		  test_fioqsize_ioctl(file_fd));
+	EXPECT_EQ(variant->expected_fibmap_result, test_fibmap_ioctl(file_fd));
+	EXPECT_EQ(variant->expected_fionread_result,
+		  test_fionread_ioctl(file_fd));
+	EXPECT_EQ(variant->expected_fs_ioc_zero_range_result,
+		  test_fs_ioc_zero_range_ioctl(file_fd));
+	EXPECT_EQ(variant->expected_fs_ioc_getflags_result,
+		  test_fs_ioc_getflags_ioctl(file_fd));
+
+	/* Checks that unrestrictable commands are unrestricted. */
+	EXPECT_EQ(0, ioctl(file_fd, FIOCLEX));
+	EXPECT_EQ(0, ioctl(file_fd, FIONCLEX));
+	EXPECT_EQ(0, ioctl(file_fd, FIONBIO, &flag));
+	EXPECT_EQ(0, ioctl(file_fd, FIOASYNC, &flag));
+
+	ASSERT_EQ(0, close(file_fd));
+}
+
 /* clang-format off */
 FIXTURE(layout1_bind) {};
 /* clang-format on */

From 4c3146bc56d500f3a281d0f50f0888c91db5df40 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=BCnther=20Noack?= <gnoack@google.com>
Date: Fri, 8 Dec 2023 16:51:18 +0100
Subject: [PATCH 629/707] selftests/landlock: Test IOCTL with memfds
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Because the LANDLOCK_ACCESS_FS_IOCTL right is associated with the
opened file during open(2), IOCTLs are supposed to work with files
which are opened by means other than open(2).

Signed-off-by: Günther Noack <gnoack@google.com>
Link: https://lore.kernel.org/r/20231208155121.1943775-7-gnoack@google.com
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---
 tools/testing/selftests/landlock/fs_test.c | 36 ++++++++++++++++------
 1 file changed, 27 insertions(+), 9 deletions(-)

diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c
index 8c1f08ad8c9114..9afe37700f4741 100644
--- a/tools/testing/selftests/landlock/fs_test.c
+++ b/tools/testing/selftests/landlock/fs_test.c
@@ -3844,20 +3844,38 @@ static int test_fs_ioc_getflags_ioctl(int fd)
 	return 0;
 }
 
-TEST(memfd_ftruncate)
+TEST(memfd_ftruncate_and_ioctl)
 {
-	int fd;
-
-	fd = memfd_create("name", MFD_CLOEXEC);
-	ASSERT_LE(0, fd);
+	const struct landlock_ruleset_attr attr = {
+		.handled_access_fs = ACCESS_ALL,
+	};
+	int ruleset_fd, fd, i;
 
 	/*
-	 * Checks that ftruncate is permitted on file descriptors that are
-	 * created in ways other than open(2).
+	 * We exercise the same test both with and without Landlock enabled, to
+	 * ensure that it behaves the same in both cases.
 	 */
-	EXPECT_EQ(0, test_ftruncate(fd));
+	for (i = 0; i < 2; i++) {
+		/* Creates a new memfd. */
+		fd = memfd_create("name", MFD_CLOEXEC);
+		ASSERT_LE(0, fd);
 
-	ASSERT_EQ(0, close(fd));
+		/*
+		 * Checks that operations associated with the opened file
+		 * (ftruncate, ioctl) are permitted on file descriptors that are
+		 * created in ways other than open(2).
+		 */
+		EXPECT_EQ(0, test_ftruncate(fd));
+		EXPECT_EQ(0, test_fs_ioc_getflags_ioctl(fd));
+
+		ASSERT_EQ(0, close(fd));
+
+		/* Enables Landlock. */
+		ruleset_fd = landlock_create_ruleset(&attr, sizeof(attr), 0);
+		ASSERT_LE(0, ruleset_fd);
+		enforce_ruleset(_metadata, ruleset_fd);
+		ASSERT_EQ(0, close(ruleset_fd));
+	}
 }
 
 /* clang-format off */

From 289982f3e9cb6247e1cea35cc62a220ccb28e39f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=BCnther=20Noack?= <gnoack@google.com>
Date: Fri, 8 Dec 2023 16:51:19 +0100
Subject: [PATCH 630/707] selftests/landlock: Test ioctl(2) and ftruncate(2)
 with open(O_PATH)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ioctl(2) and ftruncate(2) operations on files opened with O_PATH
should always return EBADF, independent of the
LANDLOCK_ACCESS_FS_TRUNCATE and LANDLOCK_ACCESS_FS_IOCTL access rights
in that file hierarchy.

Suggested-by: Mickaël Salaün <mic@digikod.net>
Signed-off-by: Günther Noack <gnoack@google.com>
Link: https://lore.kernel.org/r/20231208155121.1943775-8-gnoack@google.com
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---
 tools/testing/selftests/landlock/fs_test.c | 40 ++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c
index 9afe37700f4741..1c2d158b1f0773 100644
--- a/tools/testing/selftests/landlock/fs_test.c
+++ b/tools/testing/selftests/landlock/fs_test.c
@@ -3878,6 +3878,46 @@ TEST(memfd_ftruncate_and_ioctl)
 	}
 }
 
+TEST_F_FORK(layout1, o_path_ftruncate_and_ioctl)
+{
+	const struct landlock_ruleset_attr attr = {
+		.handled_access_fs = ACCESS_ALL,
+	};
+	int ruleset_fd, fd;
+
+	/*
+	 * Checks that for files opened with O_PATH, both ioctl(2) and
+	 * ftruncate(2) yield EBADF, as it is documented in open(2) for the
+	 * O_PATH flag.
+	 */
+	fd = open(dir_s1d1, O_PATH | O_CLOEXEC);
+	ASSERT_LE(0, fd);
+
+	EXPECT_EQ(EBADF, test_ftruncate(fd));
+	EXPECT_EQ(EBADF, test_fs_ioc_getflags_ioctl(fd));
+
+	ASSERT_EQ(0, close(fd));
+
+	/* Enables Landlock. */
+	ruleset_fd = landlock_create_ruleset(&attr, sizeof(attr), 0);
+	ASSERT_LE(0, ruleset_fd);
+	enforce_ruleset(_metadata, ruleset_fd);
+	ASSERT_EQ(0, close(ruleset_fd));
+
+	/*
+	 * Checks that after enabling Landlock,
+	 * - the file can still be opened with O_PATH
+	 * - both ioctl and truncate still yield EBADF (not EACCES).
+	 */
+	fd = open(dir_s1d1, O_PATH | O_CLOEXEC);
+	ASSERT_LE(0, fd);
+
+	EXPECT_EQ(EBADF, test_ftruncate(fd));
+	EXPECT_EQ(EBADF, test_fs_ioc_getflags_ioctl(fd));
+
+	ASSERT_EQ(0, close(fd));
+}
+
 /* clang-format off */
 FIXTURE(ioctl) {};
 /* clang-format on */

From 19b18b218c2c9aca57a73df45c7db0beab03b2e6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=BCnther=20Noack?= <gnoack@google.com>
Date: Fri, 8 Dec 2023 16:51:20 +0100
Subject: [PATCH 631/707] samples/landlock: Add support for
 LANDLOCK_ACCESS_FS_IOCTL
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add ioctl support to the Landlock sample tool.

The ioctl right is grouped with the read-write rights in the sample
tool, as some ioctl requests provide features that mutate state.

Signed-off-by: Günther Noack <gnoack@google.com>
Link: https://lore.kernel.org/r/20231208155121.1943775-9-gnoack@google.com
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---
 samples/landlock/sandboxer.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/samples/landlock/sandboxer.c b/samples/landlock/sandboxer.c
index 08596c0ef0707c..d7323e5526be29 100644
--- a/samples/landlock/sandboxer.c
+++ b/samples/landlock/sandboxer.c
@@ -81,7 +81,8 @@ static int parse_path(char *env_path, const char ***const path_list)
 	LANDLOCK_ACCESS_FS_EXECUTE | \
 	LANDLOCK_ACCESS_FS_WRITE_FILE | \
 	LANDLOCK_ACCESS_FS_READ_FILE | \
-	LANDLOCK_ACCESS_FS_TRUNCATE)
+	LANDLOCK_ACCESS_FS_TRUNCATE | \
+	LANDLOCK_ACCESS_FS_IOCTL)
 
 /* clang-format on */
 
@@ -199,11 +200,12 @@ static int populate_ruleset_net(const char *const env_var, const int ruleset_fd,
 	LANDLOCK_ACCESS_FS_MAKE_BLOCK | \
 	LANDLOCK_ACCESS_FS_MAKE_SYM | \
 	LANDLOCK_ACCESS_FS_REFER | \
-	LANDLOCK_ACCESS_FS_TRUNCATE)
+	LANDLOCK_ACCESS_FS_TRUNCATE | \
+	LANDLOCK_ACCESS_FS_IOCTL)
 
 /* clang-format on */
 
-#define LANDLOCK_ABI_LAST 4
+#define LANDLOCK_ABI_LAST 5
 
 int main(const int argc, char *const argv[], char *const *const envp)
 {
@@ -317,6 +319,11 @@ int main(const int argc, char *const argv[], char *const *const envp)
 		ruleset_attr.handled_access_net &=
 			~(LANDLOCK_ACCESS_NET_BIND_TCP |
 			  LANDLOCK_ACCESS_NET_CONNECT_TCP);
+		__attribute__((fallthrough));
+	case 4:
+		/* Removes LANDLOCK_ACCESS_FS_IOCTL for ABI < 5 */
+		ruleset_attr.handled_access_fs &= ~LANDLOCK_ACCESS_FS_IOCTL;
+
 		fprintf(stderr,
 			"Hint: You should update the running kernel "
 			"to leverage Landlock features "

From 2f8bb71d737c25ca97a83e47b445837aa96cec77 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=BCnther=20Noack?= <gnoack@google.com>
Date: Fri, 8 Dec 2023 16:51:21 +0100
Subject: [PATCH 632/707] landlock: Document IOCTL support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In the paragraph above the fallback logic, use the shorter phrasing
from the landlock(7) man page.

Signed-off-by: Günther Noack <gnoack@google.com>
Link: https://lore.kernel.org/r/20231208155121.1943775-10-gnoack@google.com
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---
 Documentation/userspace-api/landlock.rst | 119 ++++++++++++++++++++---
 1 file changed, 104 insertions(+), 15 deletions(-)

diff --git a/Documentation/userspace-api/landlock.rst b/Documentation/userspace-api/landlock.rst
index 2e38226770615d..8398851964e61c 100644
--- a/Documentation/userspace-api/landlock.rst
+++ b/Documentation/userspace-api/landlock.rst
@@ -75,7 +75,8 @@ to be explicit about the denied-by-default access rights.
             LANDLOCK_ACCESS_FS_MAKE_BLOCK |
             LANDLOCK_ACCESS_FS_MAKE_SYM |
             LANDLOCK_ACCESS_FS_REFER |
-            LANDLOCK_ACCESS_FS_TRUNCATE,
+            LANDLOCK_ACCESS_FS_TRUNCATE |
+            LANDLOCK_ACCESS_FS_IOCTL,
         .handled_access_net =
             LANDLOCK_ACCESS_NET_BIND_TCP |
             LANDLOCK_ACCESS_NET_CONNECT_TCP,
@@ -84,10 +85,10 @@ to be explicit about the denied-by-default access rights.
 Because we may not know on which kernel version an application will be
 executed, it is safer to follow a best-effort security approach.  Indeed, we
 should try to protect users as much as possible whatever the kernel they are
-using.  To avoid binary enforcement (i.e. either all security features or
-none), we can leverage a dedicated Landlock command to get the current version
-of the Landlock ABI and adapt the handled accesses.  Let's check if we should
-remove access rights which are only supported in higher versions of the ABI.
+using.
+
+To be compatible with older Linux versions, we detect the available Landlock ABI
+version, and only use the available subset of access rights:
 
 .. code-block:: c
 
@@ -113,6 +114,10 @@ remove access rights which are only supported in higher versions of the ABI.
         ruleset_attr.handled_access_net &=
             ~(LANDLOCK_ACCESS_NET_BIND_TCP |
               LANDLOCK_ACCESS_NET_CONNECT_TCP);
+        __attribute__((fallthrough));
+    case 4:
+        /* Removes LANDLOCK_ACCESS_FS_IOCTL for ABI < 5 */
+        ruleset_attr.handled_access_fs &= ~LANDLOCK_ACCESS_FS_IOCTL;
     }
 
 This enables to create an inclusive ruleset that will contain our rules.
@@ -224,6 +229,7 @@ access rights per directory enables to change the location of such directory
 without relying on the destination directory access rights (except those that
 are required for this operation, see ``LANDLOCK_ACCESS_FS_REFER``
 documentation).
+
 Having self-sufficient hierarchies also helps to tighten the required access
 rights to the minimal set of data.  This also helps avoid sinkhole directories,
 i.e.  directories where data can be linked to but not linked from.  However,
@@ -317,18 +323,69 @@ It should also be noted that truncating files does not require the
 system call, this can also be done through :manpage:`open(2)` with the flags
 ``O_RDONLY | O_TRUNC``.
 
-When opening a file, the availability of the ``LANDLOCK_ACCESS_FS_TRUNCATE``
-right is associated with the newly created file descriptor and will be used for
-subsequent truncation attempts using :manpage:`ftruncate(2)`.  The behavior is
-similar to opening a file for reading or writing, where permissions are checked
-during :manpage:`open(2)`, but not during the subsequent :manpage:`read(2)` and
+The truncate right is associated with the opened file (see below).
+
+Rights associated with file descriptors
+---------------------------------------
+
+When opening a file, the availability of the ``LANDLOCK_ACCESS_FS_TRUNCATE`` and
+``LANDLOCK_ACCESS_FS_IOCTL`` rights is associated with the newly created file
+descriptor and will be used for subsequent truncation and ioctl attempts using
+:manpage:`ftruncate(2)` and :manpage:`ioctl(2)`.  The behavior is similar to
+opening a file for reading or writing, where permissions are checked during
+:manpage:`open(2)`, but not during the subsequent :manpage:`read(2)` and
 :manpage:`write(2)` calls.
 
-As a consequence, it is possible to have multiple open file descriptors for the
-same file, where one grants the right to truncate the file and the other does
-not.  It is also possible to pass such file descriptors between processes,
-keeping their Landlock properties, even when these processes do not have an
-enforced Landlock ruleset.
+As a consequence, it is possible that a process has multiple open file
+descriptors referring to the same file, but Landlock enforces different things
+when operating with these file descriptors.  This can happen when a Landlock
+ruleset gets enforced and the process keeps file descriptors which were opened
+both before and after the enforcement.  It is also possible to pass such file
+descriptors between processes, keeping their Landlock properties, even when some
+of the involved processes do not have an enforced Landlock ruleset.
+
+Restricting IOCTL commands
+--------------------------
+
+When the ``LANDLOCK_ACCESS_FS_IOCTL`` access right is handled, Landlock will
+restrict the invocation of IOCTL commands.  However, to *permit* these IOCTL
+commands again, some of these IOCTL commands are then granted through other,
+preexisting access rights.
+
+For example, consider a program which handles ``LANDLOCK_ACCESS_FS_IOCTL`` and
+``LANDLOCK_ACCESS_FS_READ_FILE``.  The program *permits*
+``LANDLOCK_ACCESS_FS_READ_FILE`` on a file ``foo.log``.
+
+By virtue of granting this access on the ``foo.log`` file, it is now possible to
+use common and harmless IOCTL commands which are useful when reading files, such
+as ``FIONREAD``.
+
+On the other hand, if the program permits ``LANDLOCK_ACCESS_FS_IOCTL`` on
+another file, ``FIONREAD`` will not work on that file when it is opened.  As
+soon as ``LANDLOCK_ACCESS_FS_READ_FILE`` is *handled* in the ruleset, the IOCTL
+commands affected by it can not be reenabled though ``LANDLOCK_ACCESS_FS_IOCTL``
+any more, but are then governed by ``LANDLOCK_ACCESS_FS_READ_FILE``.
+
+The following table illustrates how IOCTL attempts for ``FIONREAD`` are
+filtered, depending on how a Landlock ruleset handles and permits the
+``LANDLOCK_ACCESS_FS_IOCTL`` and ``LANDLOCK_ACCESS_FS_READ_FILE`` access rights:
+
++------------------------+-------------+-------------------+-------------------+
+|                        | ``IOCTL``   | ``IOCTL`` handled | ``IOCTL`` handled |
+|                        | not handled | and permitted     | and not permitted |
++------------------------+-------------+-------------------+-------------------+
+| ``READ_FILE`` not      | allow       | allow             | deny              |
+| handled                |             |                   |                   |
++------------------------+             +-------------------+-------------------+
+| ``READ_FILE`` handled  |             | allow                                 |
+| and permitted          |             |                                       |
++------------------------+             +-------------------+-------------------+
+| ``READ_FILE`` handled  |             | deny                                  |
+| and not permitted      |             |                                       |
++------------------------+-------------+-------------------+-------------------+
+
+The full list of IOCTL commands and the access rights which affect them is
+documented below.
 
 Compatibility
 =============
@@ -457,6 +514,28 @@ Memory usage
 Kernel memory allocated to create rulesets is accounted and can be restricted
 by the Documentation/admin-guide/cgroup-v1/memory.rst.
 
+IOCTL support
+-------------
+
+The ``LANDLOCK_ACCESS_FS_IOCTL`` access right restricts the use of
+:manpage:`ioctl(2)`, but it only applies to newly opened files.  This means
+specifically that pre-existing file descriptors like stdin, stdout and stderr
+are unaffected.
+
+Users should be aware that TTY devices have traditionally permitted to control
+other processes on the same TTY through the ``TIOCSTI`` and ``TIOCLINUX`` IOCTL
+commands.  It is therefore recommended to close inherited TTY file descriptors,
+or to reopen them from ``/proc/self/fd/*`` without the
+``LANDLOCK_ACCESS_FS_IOCTL`` right, if possible.  The :manpage:`isatty(3)`
+function checks whether a given file descriptor is a TTY.
+
+Landlock's IOCTL support is coarse-grained at the moment, but may become more
+fine-grained in the future.  Until then, users are advised to establish the
+guarantees that they need through the file hierarchy, by only permitting the
+``LANDLOCK_ACCESS_FS_IOCTL`` right on files where it is really harmless.  In
+cases where you can control the mounts, the ``nodev`` mount option can help to
+rule out that device files can be accessed.
+
 Previous limitations
 ====================
 
@@ -494,6 +573,16 @@ bind and connect actions to only a set of allowed ports thanks to the new
 ``LANDLOCK_ACCESS_NET_BIND_TCP`` and ``LANDLOCK_ACCESS_NET_CONNECT_TCP``
 access rights.
 
+IOCTL (ABI < 5)
+---------------
+
+IOCTL operations could not be denied before the fifth Landlock ABI, so
+:manpage:`ioctl(2)` is always allowed when using a kernel that only supports an
+earlier ABI.
+
+Starting with the Landlock ABI version 5, it is possible to restrict the use of
+:manpage:`ioctl(2)` using the new ``LANDLOCK_ACCESS_FS_IOCTL`` access right.
+
 .. _kernel_support:
 
 Kernel support

From 6f351af0c85fdbc9fe184bd9744b1b3caf4e9431 Mon Sep 17 00:00:00 2001
From: Kunwu Chan <chentao@kylinos.cn>
Date: Wed, 31 Jan 2024 15:09:41 +0800
Subject: [PATCH 633/707] fs: Use KMEM_CACHE instead of kmem_cache_create

commit 0a31bd5f2bbb ("KMEM_CACHE(): simplify slab cache creation")
introduces a new macro.
Use the new KMEM_CACHE() macro instead of direct kmem_cache_create
to simplify the creation of SLAB caches.

Signed-off-by: Kunwu Chan <chentao@kylinos.cn>
Link: https://lore.kernel.org/r/20240131070941.135178-1-chentao@kylinos.cn
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/backing-file.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/fs/backing-file.c b/fs/backing-file.c
index a681f38d84d8e1..740185198db347 100644
--- a/fs/backing-file.c
+++ b/fs/backing-file.c
@@ -325,9 +325,7 @@ EXPORT_SYMBOL_GPL(backing_file_mmap);
 
 static int __init backing_aio_init(void)
 {
-	backing_aio_cachep = kmem_cache_create("backing_aio",
-					       sizeof(struct backing_aio),
-					       0, SLAB_HWCACHE_ALIGN, NULL);
+	backing_aio_cachep = KMEM_CACHE(backing_aio, SLAB_HWCACHE_ALIGN);
 	if (!backing_aio_cachep)
 		return -ENOMEM;
 

From b473491b6cf8063c040e6863b74f6d31cabf52f8 Mon Sep 17 00:00:00 2001
From: Wang Jinchao <wangjinchao@xfusion.com>
Date: Wed, 31 Jan 2024 10:54:41 +0800
Subject: [PATCH 634/707] fork: Using clone_flags for legacy clone check

In the current implementation of clone(), there is a line that
initializes `u64 clone_flags = args->flags` at the top.
This means that there is no longer a need to use args->flags
for the legacy clone check.

Signed-off-by: Wang Jinchao <wangjinchao@xfusion.com>
Link: https://lore.kernel.org/r/202401311054+0800-wangjinchao@xfusion.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 kernel/fork.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 34704d277b68a5..726a92043531ff 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2872,8 +2872,8 @@ pid_t kernel_clone(struct kernel_clone_args *args)
 	 * here has the advantage that we don't need to have a separate helper
 	 * to check for legacy clone().
 	 */
-	if ((args->flags & CLONE_PIDFD) &&
-	    (args->flags & CLONE_PARENT_SETTID) &&
+	if ((clone_flags & CLONE_PIDFD) &&
+	    (clone_flags & CLONE_PARENT_SETTID) &&
 	    (args->pidfd == args->parent_tid))
 		return -EINVAL;
 

From 25639e92cc75059375ea7cb8bcbf1b678feaef9b Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 30 Jan 2024 12:08:21 +1100
Subject: [PATCH 635/707] nfsd: remove stale comment in nfs4_show_deleg()

As we do now support write delegations, this comment is unhelpful and
misleading.

Reported-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4state.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index ae9b5a3a585f96..5e640e9945cd6b 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2711,7 +2711,6 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st)
 	nfs4_show_stateid(s, &st->sc_stateid);
 	seq_printf(s, ": { type: deleg, ");
 
-	/* Kinda dead code as long as we only support read delegs: */
 	seq_printf(s, "access: %s, ",
 		ds->dl_type == NFS4_OPEN_DELEGATE_READ ? "r" : "w");
 

From 2f9ed7d34aa56e75e449b4a9e1079eaf77a501e3 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 30 Jan 2024 12:08:22 +1100
Subject: [PATCH 636/707] nfsd: hold ->cl_lock for hash_delegation_locked()

The protocol for creating a new state in nfsd is to allocate the state
leaving it largely uninitialised, add that state to the ->cl_stateids
idr so as to reserve a state-id, then complete initialisation of the
state and only set ->sc_type to non-zero once the state is fully
initialised.

If a state is found in the idr with ->sc_type == 0, it is ignored.
The ->cl_lock lock is used to avoid races - it is held while checking
sc_type during lookup, and held when a non-zero value is stored in
->sc_type.

... except... hash_delegation_locked() finalises the initialisation of a
delegation state, but does NOT hold ->cl_lock.

So this patch takes ->cl_lock at the appropriate time w.r.t other locks,
and so ensures there are no races (which are extremely unlikely in any
case).
As ->fi_lock is often taken when ->cl_lock is held, we need to take
->cl_lock first of those two.
Currently ->cl_lock and state_lock are never both taken at the same time.
We need both for this patch so an arbitrary choice is needed concerning
which to take first.  As state_lock is more global, it might be more
contended, so take it first.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4state.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 5e640e9945cd6b..ae00f9327245e5 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1312,6 +1312,7 @@ hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp)
 
 	lockdep_assert_held(&state_lock);
 	lockdep_assert_held(&fp->fi_lock);
+	lockdep_assert_held(&clp->cl_lock);
 
 	if (nfs4_delegation_exists(clp, fp))
 		return -EAGAIN;
@@ -5558,12 +5559,14 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
 		goto out_unlock;
 
 	spin_lock(&state_lock);
+	spin_lock(&clp->cl_lock);
 	spin_lock(&fp->fi_lock);
 	if (fp->fi_had_conflict)
 		status = -EAGAIN;
 	else
 		status = hash_delegation_locked(dp, fp);
 	spin_unlock(&fp->fi_lock);
+	spin_unlock(&clp->cl_lock);
 	spin_unlock(&state_lock);
 
 	if (status)

From 05d89966ab3eabc4394068ebb3bf5720b01a315f Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 30 Jan 2024 12:08:23 +1100
Subject: [PATCH 637/707] nfsd: don't call functions with side-effecting inside
 WARN_ON()

Code like:

    WARN_ON(foo())

looks like an assertion and might not be expected to have any side
effects.
When testing if a function with side-effects fails a construct like

    if (foo())
       WARN_ON(1);

makes the intent more obvious.

nfsd has several WARN_ON calls where the test has side effects, so it
would be good to change them.  These cases don't really need the
WARN_ON.  They have never failed in 8 years of usage so let's just
remove the WARN_ON wrapper.

Suggested-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4state.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index ae00f9327245e5..59982fa5d4fafe 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1600,7 +1600,7 @@ static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp,
 	while (!list_empty(&open_stp->st_locks)) {
 		stp = list_entry(open_stp->st_locks.next,
 				struct nfs4_ol_stateid, st_locks);
-		WARN_ON(!unhash_lock_stateid(stp));
+		unhash_lock_stateid(stp);
 		put_ol_stateid_locked(stp, reaplist);
 	}
 }
@@ -2229,7 +2229,7 @@ __destroy_client(struct nfs4_client *clp)
 	spin_lock(&state_lock);
 	while (!list_empty(&clp->cl_delegations)) {
 		dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt);
-		WARN_ON(!unhash_delegation_locked(dp));
+		unhash_delegation_locked(dp);
 		list_add(&dp->dl_recall_lru, &reaplist);
 	}
 	spin_unlock(&state_lock);
@@ -6170,7 +6170,7 @@ nfs4_laundromat(struct nfsd_net *nn)
 		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
 		if (!state_expired(&lt, dp->dl_time))
 			break;
-		WARN_ON(!unhash_delegation_locked(dp));
+		unhash_delegation_locked(dp);
 		list_add(&dp->dl_recall_lru, &reaplist);
 	}
 	spin_unlock(&state_lock);
@@ -8010,7 +8010,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
 		stp = list_first_entry(&lo->lo_owner.so_stateids,
 				       struct nfs4_ol_stateid,
 				       st_perstateowner);
-		WARN_ON(!unhash_lock_stateid(stp));
+		unhash_lock_stateid(stp);
 		put_ol_stateid_locked(stp, &reaplist);
 	}
 	spin_unlock(&clp->cl_lock);
@@ -8303,7 +8303,7 @@ nfs4_state_shutdown_net(struct net *net)
 	spin_lock(&state_lock);
 	list_for_each_safe(pos, next, &nn->del_recall_lru) {
 		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
-		WARN_ON(!unhash_delegation_locked(dp));
+		unhash_delegation_locked(dp);
 		list_add(&dp->dl_recall_lru, &reaplist);
 	}
 	spin_unlock(&state_lock);

From 176e7ce14e1a2c908dde39b9f38e7e78ae5c594e Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 30 Jan 2024 12:08:24 +1100
Subject: [PATCH 638/707] nfsd: avoid race after unhash_delegation_locked()

NFS4_CLOSED_DELEG_STID and NFS4_REVOKED_DELEG_STID are similar in
purpose.
REVOKED is used for NFSv4.1 states which have been revoked because the
lease has expired.  CLOSED is used in other cases.
The difference has two practical effects.
1/ REVOKED states are on the ->cl_revoked list
2/ REVOKED states result in nfserr_deleg_revoked from
   nfsd4_verify_open_stid() and nfsd4_validate_stateid while
   CLOSED states result in nfserr_bad_stid.

Currently a state that is being revoked is first set to "CLOSED" in
unhash_delegation_locked(), then possibly to "REVOKED" in
revoke_delegation(), at which point it is added to the cl_revoked list.

It is possible that a stateid test could see the CLOSED state
which really should be REVOKED, and so return the wrong error code.  So
it is safest to remove this window of inconsistency.

With this patch, unhash_delegation_locked() always sets the state
correctly, and revoke_delegation() no longer changes the state.

Also remove a redundant test on minorversion when
NFS4_REVOKED_DELEG_STID is seen - it can only be seen when minorversion
is non-zero.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4state.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 59982fa5d4fafe..3527b9388174bb 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1329,7 +1329,7 @@ static bool delegation_hashed(struct nfs4_delegation *dp)
 }
 
 static bool
-unhash_delegation_locked(struct nfs4_delegation *dp)
+unhash_delegation_locked(struct nfs4_delegation *dp, unsigned char type)
 {
 	struct nfs4_file *fp = dp->dl_stid.sc_file;
 
@@ -1338,7 +1338,9 @@ unhash_delegation_locked(struct nfs4_delegation *dp)
 	if (!delegation_hashed(dp))
 		return false;
 
-	dp->dl_stid.sc_type = NFS4_CLOSED_DELEG_STID;
+	if (dp->dl_stid.sc_client->cl_minorversion == 0)
+		type = NFS4_CLOSED_DELEG_STID;
+	dp->dl_stid.sc_type = type;
 	/* Ensure that deleg break won't try to requeue it */
 	++dp->dl_time;
 	spin_lock(&fp->fi_lock);
@@ -1354,7 +1356,7 @@ static void destroy_delegation(struct nfs4_delegation *dp)
 	bool unhashed;
 
 	spin_lock(&state_lock);
-	unhashed = unhash_delegation_locked(dp);
+	unhashed = unhash_delegation_locked(dp, NFS4_CLOSED_DELEG_STID);
 	spin_unlock(&state_lock);
 	if (unhashed)
 		destroy_unhashed_deleg(dp);
@@ -1368,9 +1370,8 @@ static void revoke_delegation(struct nfs4_delegation *dp)
 
 	trace_nfsd_stid_revoke(&dp->dl_stid);
 
-	if (clp->cl_minorversion) {
+	if (dp->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID) {
 		spin_lock(&clp->cl_lock);
-		dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID;
 		refcount_inc(&dp->dl_stid.sc_count);
 		list_add(&dp->dl_recall_lru, &clp->cl_revoked);
 		spin_unlock(&clp->cl_lock);
@@ -2229,7 +2230,7 @@ __destroy_client(struct nfs4_client *clp)
 	spin_lock(&state_lock);
 	while (!list_empty(&clp->cl_delegations)) {
 		dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt);
-		unhash_delegation_locked(dp);
+		unhash_delegation_locked(dp, NFS4_CLOSED_DELEG_STID);
 		list_add(&dp->dl_recall_lru, &reaplist);
 	}
 	spin_unlock(&state_lock);
@@ -5146,8 +5147,7 @@ nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open,
 		goto out;
 	if (deleg->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID) {
 		nfs4_put_stid(&deleg->dl_stid);
-		if (cl->cl_minorversion)
-			status = nfserr_deleg_revoked;
+		status = nfserr_deleg_revoked;
 		goto out;
 	}
 	flags = share_access_to_flags(open->op_share_access);
@@ -6170,7 +6170,7 @@ nfs4_laundromat(struct nfsd_net *nn)
 		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
 		if (!state_expired(&lt, dp->dl_time))
 			break;
-		unhash_delegation_locked(dp);
+		unhash_delegation_locked(dp, NFS4_REVOKED_DELEG_STID);
 		list_add(&dp->dl_recall_lru, &reaplist);
 	}
 	spin_unlock(&state_lock);
@@ -8303,7 +8303,7 @@ nfs4_state_shutdown_net(struct net *net)
 	spin_lock(&state_lock);
 	list_for_each_safe(pos, next, &nn->del_recall_lru) {
 		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
-		unhash_delegation_locked(dp);
+		unhash_delegation_locked(dp, NFS4_CLOSED_DELEG_STID);
 		list_add(&dp->dl_recall_lru, &reaplist);
 	}
 	spin_unlock(&state_lock);

From 7aa3cc8a10c7cefe58251c7a1febec8f1406b36c Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 30 Jan 2024 12:08:25 +1100
Subject: [PATCH 639/707] nfsd: split sc_status out of sc_type

sc_type identifies the type of a state - open, lock, deleg, layout - and
also the status of a state - closed or revoked.

This is a bit untidy and could get worse when "admin-revoked" states are
added.  So clean it up.

With this patch, the type is now all that is stored in sc_type.  This is
zero when the state is first added to ->cl_stateids (causing it to be
ignored), and is then set appropriately once it is fully initialised.
It is set under ->cl_lock to ensure atomicity w.r.t lookup.  It is now
never cleared.

sc_type is still a bit-set even though at most one bit is set.  This allows
lookup functions to be given a bitmap of acceptable types.

sc_type is now an unsigned short rather than char.  There is no value in
restricting to just 8 bits.

All the constants now start SC_TYPE_ matching the field in which they
are stored.  Keeping the existing names and ensuring clear separation
from non-type flags would have required something like
NFS4_STID_TYPE_CLOSED which is cumbersome.  The "NFS4" prefix is
redundant was they only appear in NFS4 code, so remove that and change
STID to SC to match the field.

The status is stored in a separate unsigned short named "sc_status".  It
has two flags: SC_STATUS_CLOSED and SC_STATUS_REVOKED.
CLOSED combines NFS4_CLOSED_STID, NFS4_CLOSED_DELEG_STID, and is used
for SC_TYPE_LOCK and SC_TYPE_LAYOUT instead of setting the sc_type to zero.
These flags are only ever set, never cleared.
For deleg stateids they are set under the global state_lock.
For open and lock stateids they are set under ->cl_lock.
For layout stateids they are set under ->ls_lock

nfs4_unhash_stid() has been removed, and we never set sc_type = 0.  This
was only used for LOCK and LAYOUT stids and they now use
SC_STATUS_CLOSED.

Also TRACE_DEFINE_NUM() calls for the various STID #define have been
removed because these things are not enums, and so that call is
incorrect.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4layouts.c |  14 +--
 fs/nfsd/nfs4state.c   | 207 +++++++++++++++++++++---------------------
 fs/nfsd/state.h       |  40 +++++---
 fs/nfsd/trace.h       |  31 +++----
 4 files changed, 151 insertions(+), 141 deletions(-)

diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 5e8096bc5eaa45..857b822450b4fe 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -236,7 +236,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
 	nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops,
 			NFSPROC4_CLNT_CB_LAYOUT);
 
-	if (parent->sc_type == NFS4_DELEG_STID)
+	if (parent->sc_type == SC_TYPE_DELEG)
 		ls->ls_file = nfsd_file_get(fp->fi_deleg_file);
 	else
 		ls->ls_file = find_any_file(fp);
@@ -250,7 +250,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
 	}
 
 	spin_lock(&clp->cl_lock);
-	stp->sc_type = NFS4_LAYOUT_STID;
+	stp->sc_type = SC_TYPE_LAYOUT;
 	list_add(&ls->ls_perclnt, &clp->cl_lo_states);
 	spin_unlock(&clp->cl_lock);
 
@@ -269,13 +269,13 @@ nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
 {
 	struct nfs4_layout_stateid *ls;
 	struct nfs4_stid *stid;
-	unsigned char typemask = NFS4_LAYOUT_STID;
+	unsigned short typemask = SC_TYPE_LAYOUT;
 	__be32 status;
 
 	if (create)
-		typemask |= (NFS4_OPEN_STID | NFS4_LOCK_STID | NFS4_DELEG_STID);
+		typemask |= (SC_TYPE_OPEN | SC_TYPE_LOCK | SC_TYPE_DELEG);
 
-	status = nfsd4_lookup_stateid(cstate, stateid, typemask, &stid,
+	status = nfsd4_lookup_stateid(cstate, stateid, typemask, 0, &stid,
 			net_generic(SVC_NET(rqstp), nfsd_net_id));
 	if (status)
 		goto out;
@@ -286,7 +286,7 @@ nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
 		goto out_put_stid;
 	}
 
-	if (stid->sc_type != NFS4_LAYOUT_STID) {
+	if (stid->sc_type != SC_TYPE_LAYOUT) {
 		ls = nfsd4_alloc_layout_stateid(cstate, stid, layout_type);
 		nfs4_put_stid(stid);
 
@@ -518,7 +518,7 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp,
 		lrp->lrs_present = true;
 	} else {
 		trace_nfsd_layoutstate_unhash(&ls->ls_stid.sc_stateid);
-		nfs4_unhash_stid(&ls->ls_stid);
+		ls->ls_stid.sc_status |= SC_STATUS_CLOSED;
 		lrp->lrs_present = false;
 	}
 	spin_unlock(&ls->ls_lock);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 3527b9388174bb..58096ec81fb979 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1260,11 +1260,6 @@ static void destroy_unhashed_deleg(struct nfs4_delegation *dp)
 	nfs4_put_stid(&dp->dl_stid);
 }
 
-void nfs4_unhash_stid(struct nfs4_stid *s)
-{
-	s->sc_type = 0;
-}
-
 /**
  * nfs4_delegation_exists - Discover if this delegation already exists
  * @clp:     a pointer to the nfs4_client we're granting a delegation to
@@ -1317,7 +1312,7 @@ hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp)
 	if (nfs4_delegation_exists(clp, fp))
 		return -EAGAIN;
 	refcount_inc(&dp->dl_stid.sc_count);
-	dp->dl_stid.sc_type = NFS4_DELEG_STID;
+	dp->dl_stid.sc_type = SC_TYPE_DELEG;
 	list_add(&dp->dl_perfile, &fp->fi_delegations);
 	list_add(&dp->dl_perclnt, &clp->cl_delegations);
 	return 0;
@@ -1329,7 +1324,7 @@ static bool delegation_hashed(struct nfs4_delegation *dp)
 }
 
 static bool
-unhash_delegation_locked(struct nfs4_delegation *dp, unsigned char type)
+unhash_delegation_locked(struct nfs4_delegation *dp, unsigned short statusmask)
 {
 	struct nfs4_file *fp = dp->dl_stid.sc_file;
 
@@ -1339,8 +1334,9 @@ unhash_delegation_locked(struct nfs4_delegation *dp, unsigned char type)
 		return false;
 
 	if (dp->dl_stid.sc_client->cl_minorversion == 0)
-		type = NFS4_CLOSED_DELEG_STID;
-	dp->dl_stid.sc_type = type;
+		statusmask = SC_STATUS_CLOSED;
+	dp->dl_stid.sc_status |= statusmask;
+
 	/* Ensure that deleg break won't try to requeue it */
 	++dp->dl_time;
 	spin_lock(&fp->fi_lock);
@@ -1356,7 +1352,7 @@ static void destroy_delegation(struct nfs4_delegation *dp)
 	bool unhashed;
 
 	spin_lock(&state_lock);
-	unhashed = unhash_delegation_locked(dp, NFS4_CLOSED_DELEG_STID);
+	unhashed = unhash_delegation_locked(dp, SC_STATUS_CLOSED);
 	spin_unlock(&state_lock);
 	if (unhashed)
 		destroy_unhashed_deleg(dp);
@@ -1370,7 +1366,7 @@ static void revoke_delegation(struct nfs4_delegation *dp)
 
 	trace_nfsd_stid_revoke(&dp->dl_stid);
 
-	if (dp->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID) {
+	if (dp->dl_stid.sc_status & SC_STATUS_REVOKED) {
 		spin_lock(&clp->cl_lock);
 		refcount_inc(&dp->dl_stid.sc_count);
 		list_add(&dp->dl_recall_lru, &clp->cl_revoked);
@@ -1379,8 +1375,8 @@ static void revoke_delegation(struct nfs4_delegation *dp)
 	destroy_unhashed_deleg(dp);
 }
 
-/* 
- * SETCLIENTID state 
+/*
+ * SETCLIENTID state
  */
 
 static unsigned int clientid_hashval(u32 id)
@@ -1543,7 +1539,7 @@ static bool unhash_lock_stateid(struct nfs4_ol_stateid *stp)
 	if (!unhash_ol_stateid(stp))
 		return false;
 	list_del_init(&stp->st_locks);
-	nfs4_unhash_stid(&stp->st_stid);
+	stp->st_stid.sc_status |= SC_STATUS_CLOSED;
 	return true;
 }
 
@@ -1622,6 +1618,7 @@ static void release_open_stateid(struct nfs4_ol_stateid *stp)
 	LIST_HEAD(reaplist);
 
 	spin_lock(&stp->st_stid.sc_client->cl_lock);
+	stp->st_stid.sc_status |= SC_STATUS_CLOSED;
 	if (unhash_open_stateid(stp, &reaplist))
 		put_ol_stateid_locked(stp, &reaplist);
 	spin_unlock(&stp->st_stid.sc_client->cl_lock);
@@ -2230,7 +2227,7 @@ __destroy_client(struct nfs4_client *clp)
 	spin_lock(&state_lock);
 	while (!list_empty(&clp->cl_delegations)) {
 		dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt);
-		unhash_delegation_locked(dp, NFS4_CLOSED_DELEG_STID);
+		unhash_delegation_locked(dp, SC_STATUS_CLOSED);
 		list_add(&dp->dl_recall_lru, &reaplist);
 	}
 	spin_unlock(&state_lock);
@@ -2462,14 +2459,16 @@ find_stateid_locked(struct nfs4_client *cl, stateid_t *t)
 }
 
 static struct nfs4_stid *
-find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask)
+find_stateid_by_type(struct nfs4_client *cl, stateid_t *t,
+		     unsigned short typemask, unsigned short ok_states)
 {
 	struct nfs4_stid *s;
 
 	spin_lock(&cl->cl_lock);
 	s = find_stateid_locked(cl, t);
 	if (s != NULL) {
-		if (typemask & s->sc_type)
+		if ((s->sc_status & ~ok_states) == 0 &&
+		    (typemask & s->sc_type))
 			refcount_inc(&s->sc_count);
 		else
 			s = NULL;
@@ -2622,7 +2621,7 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st)
 	struct nfs4_stateowner *oo;
 	unsigned int access, deny;
 
-	if (st->sc_type != NFS4_OPEN_STID && st->sc_type != NFS4_LOCK_STID)
+	if (st->sc_type != SC_TYPE_OPEN && st->sc_type != SC_TYPE_LOCK)
 		return 0; /* XXX: or SEQ_SKIP? */
 	ols = openlockstateid(st);
 	oo = ols->st_stateowner;
@@ -2754,13 +2753,13 @@ static int states_show(struct seq_file *s, void *v)
 	struct nfs4_stid *st = v;
 
 	switch (st->sc_type) {
-	case NFS4_OPEN_STID:
+	case SC_TYPE_OPEN:
 		return nfs4_show_open(s, st);
-	case NFS4_LOCK_STID:
+	case SC_TYPE_LOCK:
 		return nfs4_show_lock(s, st);
-	case NFS4_DELEG_STID:
+	case SC_TYPE_DELEG:
 		return nfs4_show_deleg(s, st);
-	case NFS4_LAYOUT_STID:
+	case SC_TYPE_LAYOUT:
 		return nfs4_show_layout(s, st);
 	default:
 		return 0; /* XXX: or SEQ_SKIP? */
@@ -4533,7 +4532,8 @@ nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open)
 			continue;
 		if (local->st_stateowner != &oo->oo_owner)
 			continue;
-		if (local->st_stid.sc_type == NFS4_OPEN_STID) {
+		if (local->st_stid.sc_type == SC_TYPE_OPEN &&
+		    !local->st_stid.sc_status) {
 			ret = local;
 			refcount_inc(&ret->st_stid.sc_count);
 			break;
@@ -4547,17 +4547,10 @@ nfsd4_verify_open_stid(struct nfs4_stid *s)
 {
 	__be32 ret = nfs_ok;
 
-	switch (s->sc_type) {
-	default:
-		break;
-	case 0:
-	case NFS4_CLOSED_STID:
-	case NFS4_CLOSED_DELEG_STID:
-		ret = nfserr_bad_stateid;
-		break;
-	case NFS4_REVOKED_DELEG_STID:
+	if (s->sc_status & SC_STATUS_REVOKED)
 		ret = nfserr_deleg_revoked;
-	}
+	else if (s->sc_status & SC_STATUS_CLOSED)
+		ret = nfserr_bad_stateid;
 	return ret;
 }
 
@@ -4643,7 +4636,7 @@ init_open_stateid(struct nfs4_file *fp, struct nfsd4_open *open)
 
 	open->op_stp = NULL;
 	refcount_inc(&stp->st_stid.sc_count);
-	stp->st_stid.sc_type = NFS4_OPEN_STID;
+	stp->st_stid.sc_type = SC_TYPE_OPEN;
 	INIT_LIST_HEAD(&stp->st_locks);
 	stp->st_stateowner = nfs4_get_stateowner(&oo->oo_owner);
 	get_nfs4_file(fp);
@@ -4870,9 +4863,9 @@ static int nfsd4_cb_recall_done(struct nfsd4_callback *cb,
 
 	trace_nfsd_cb_recall_done(&dp->dl_stid.sc_stateid, task);
 
-	if (dp->dl_stid.sc_type == NFS4_CLOSED_DELEG_STID ||
-	    dp->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID)
-	        return 1;
+	if (dp->dl_stid.sc_status)
+		/* CLOSED or REVOKED */
+		return 1;
 
 	switch (task->tk_status) {
 	case 0:
@@ -5117,12 +5110,12 @@ static int share_access_to_flags(u32 share_access)
 	return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE;
 }
 
-static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, stateid_t *s)
+static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl,
+						  stateid_t *s)
 {
 	struct nfs4_stid *ret;
 
-	ret = find_stateid_by_type(cl, s,
-				NFS4_DELEG_STID|NFS4_REVOKED_DELEG_STID);
+	ret = find_stateid_by_type(cl, s, SC_TYPE_DELEG, SC_STATUS_REVOKED);
 	if (!ret)
 		return NULL;
 	return delegstateid(ret);
@@ -5145,7 +5138,7 @@ nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open,
 	deleg = find_deleg_stateid(cl, &open->op_delegate_stateid);
 	if (deleg == NULL)
 		goto out;
-	if (deleg->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID) {
+	if (deleg->dl_stid.sc_status & SC_STATUS_REVOKED) {
 		nfs4_put_stid(&deleg->dl_stid);
 		status = nfserr_deleg_revoked;
 		goto out;
@@ -5778,7 +5771,6 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	} else {
 		status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open, true);
 		if (status) {
-			stp->st_stid.sc_type = NFS4_CLOSED_STID;
 			release_open_stateid(stp);
 			mutex_unlock(&stp->st_mutex);
 			goto out;
@@ -6170,7 +6162,7 @@ nfs4_laundromat(struct nfsd_net *nn)
 		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
 		if (!state_expired(&lt, dp->dl_time))
 			break;
-		unhash_delegation_locked(dp, NFS4_REVOKED_DELEG_STID);
+		unhash_delegation_locked(dp, SC_STATUS_REVOKED);
 		list_add(&dp->dl_recall_lru, &reaplist);
 	}
 	spin_unlock(&state_lock);
@@ -6409,22 +6401,20 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
 	status = nfsd4_stid_check_stateid_generation(stateid, s, 1);
 	if (status)
 		goto out_unlock;
+	status = nfsd4_verify_open_stid(s);
+	if (status)
+		goto out_unlock;
+
 	switch (s->sc_type) {
-	case NFS4_DELEG_STID:
+	case SC_TYPE_DELEG:
 		status = nfs_ok;
 		break;
-	case NFS4_REVOKED_DELEG_STID:
-		status = nfserr_deleg_revoked;
-		break;
-	case NFS4_OPEN_STID:
-	case NFS4_LOCK_STID:
+	case SC_TYPE_OPEN:
+	case SC_TYPE_LOCK:
 		status = nfsd4_check_openowner_confirmed(openlockstateid(s));
 		break;
 	default:
 		printk("unknown stateid type %x\n", s->sc_type);
-		fallthrough;
-	case NFS4_CLOSED_STID:
-	case NFS4_CLOSED_DELEG_STID:
 		status = nfserr_bad_stateid;
 	}
 out_unlock:
@@ -6434,7 +6424,8 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
 
 __be32
 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
-		     stateid_t *stateid, unsigned char typemask,
+		     stateid_t *stateid,
+		     unsigned short typemask, unsigned short statusmask,
 		     struct nfs4_stid **s, struct nfsd_net *nn)
 {
 	__be32 status;
@@ -6445,10 +6436,13 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
 	 *  only return revoked delegations if explicitly asked.
 	 *  otherwise we report revoked or bad_stateid status.
 	 */
-	if (typemask & NFS4_REVOKED_DELEG_STID)
+	if (statusmask & SC_STATUS_REVOKED)
 		return_revoked = true;
-	else if (typemask & NFS4_DELEG_STID)
-		typemask |= NFS4_REVOKED_DELEG_STID;
+	if (typemask & SC_TYPE_DELEG)
+		/* Always allow REVOKED for DELEG so we can
+		 * retturn the appropriate error.
+		 */
+		statusmask |= SC_STATUS_REVOKED;
 
 	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) ||
 		CLOSE_STATEID(stateid))
@@ -6461,14 +6455,12 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
 	}
 	if (status)
 		return status;
-	stid = find_stateid_by_type(cstate->clp, stateid, typemask);
+	stid = find_stateid_by_type(cstate->clp, stateid, typemask, statusmask);
 	if (!stid)
 		return nfserr_bad_stateid;
-	if ((stid->sc_type == NFS4_REVOKED_DELEG_STID) && !return_revoked) {
+	if ((stid->sc_status & SC_STATUS_REVOKED) && !return_revoked) {
 		nfs4_put_stid(stid);
-		if (cstate->minorversion)
-			return nfserr_deleg_revoked;
-		return nfserr_bad_stateid;
+		return nfserr_deleg_revoked;
 	}
 	*s = stid;
 	return nfs_ok;
@@ -6479,17 +6471,17 @@ nfs4_find_file(struct nfs4_stid *s, int flags)
 {
 	struct nfsd_file *ret = NULL;
 
-	if (!s)
+	if (!s || s->sc_status)
 		return NULL;
 
 	switch (s->sc_type) {
-	case NFS4_DELEG_STID:
+	case SC_TYPE_DELEG:
 		spin_lock(&s->sc_file->fi_lock);
 		ret = nfsd_file_get(s->sc_file->fi_deleg_file);
 		spin_unlock(&s->sc_file->fi_lock);
 		break;
-	case NFS4_OPEN_STID:
-	case NFS4_LOCK_STID:
+	case SC_TYPE_OPEN:
+	case SC_TYPE_LOCK:
 		if (flags & RD_STATE)
 			ret = find_readable_file(s->sc_file);
 		else
@@ -6602,7 +6594,8 @@ static __be32 find_cpntf_state(struct nfsd_net *nn, stateid_t *st,
 		goto out;
 
 	*stid = find_stateid_by_type(found, &cps->cp_p_stateid,
-			NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID);
+				     SC_TYPE_DELEG|SC_TYPE_OPEN|SC_TYPE_LOCK,
+				     0);
 	if (*stid)
 		status = nfs_ok;
 	else
@@ -6659,8 +6652,8 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
 	}
 
 	status = nfsd4_lookup_stateid(cstate, stateid,
-				NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID,
-				&s, nn);
+				SC_TYPE_DELEG|SC_TYPE_OPEN|SC_TYPE_LOCK,
+				0, &s, nn);
 	if (status == nfserr_bad_stateid)
 		status = find_cpntf_state(nn, stateid, &s);
 	if (status)
@@ -6671,16 +6664,13 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
 		goto out;
 
 	switch (s->sc_type) {
-	case NFS4_DELEG_STID:
+	case SC_TYPE_DELEG:
 		status = nfs4_check_delegmode(delegstateid(s), flags);
 		break;
-	case NFS4_OPEN_STID:
-	case NFS4_LOCK_STID:
+	case SC_TYPE_OPEN:
+	case SC_TYPE_LOCK:
 		status = nfs4_check_olstateid(openlockstateid(s), flags);
 		break;
-	default:
-		status = nfserr_bad_stateid;
-		break;
 	}
 	if (status)
 		goto out;
@@ -6759,33 +6749,34 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	spin_lock(&cl->cl_lock);
 	s = find_stateid_locked(cl, stateid);
-	if (!s)
+	if (!s || s->sc_status & SC_STATUS_CLOSED)
 		goto out_unlock;
 	spin_lock(&s->sc_lock);
 	switch (s->sc_type) {
-	case NFS4_DELEG_STID:
+	case SC_TYPE_DELEG:
+		if (s->sc_status & SC_STATUS_REVOKED) {
+			spin_unlock(&s->sc_lock);
+			dp = delegstateid(s);
+			list_del_init(&dp->dl_recall_lru);
+			spin_unlock(&cl->cl_lock);
+			nfs4_put_stid(s);
+			ret = nfs_ok;
+			goto out;
+		}
 		ret = nfserr_locks_held;
 		break;
-	case NFS4_OPEN_STID:
+	case SC_TYPE_OPEN:
 		ret = check_stateid_generation(stateid, &s->sc_stateid, 1);
 		if (ret)
 			break;
 		ret = nfserr_locks_held;
 		break;
-	case NFS4_LOCK_STID:
+	case SC_TYPE_LOCK:
 		spin_unlock(&s->sc_lock);
 		refcount_inc(&s->sc_count);
 		spin_unlock(&cl->cl_lock);
 		ret = nfsd4_free_lock_stateid(stateid, s);
 		goto out;
-	case NFS4_REVOKED_DELEG_STID:
-		spin_unlock(&s->sc_lock);
-		dp = delegstateid(s);
-		list_del_init(&dp->dl_recall_lru);
-		spin_unlock(&cl->cl_lock);
-		nfs4_put_stid(s);
-		ret = nfs_ok;
-		goto out;
 	/* Default falls through and returns nfserr_bad_stateid */
 	}
 	spin_unlock(&s->sc_lock);
@@ -6828,6 +6819,7 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_
  * @seqid: seqid (provided by client)
  * @stateid: stateid (provided by client)
  * @typemask: mask of allowable types for this operation
+ * @statusmask: mask of allowed states: 0 or STID_CLOSED
  * @stpp: return pointer for the stateid found
  * @nn: net namespace for request
  *
@@ -6837,7 +6829,8 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_
  */
 static __be32
 nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
-			 stateid_t *stateid, char typemask,
+			 stateid_t *stateid,
+			 unsigned short typemask, unsigned short statusmask,
 			 struct nfs4_ol_stateid **stpp,
 			 struct nfsd_net *nn)
 {
@@ -6848,7 +6841,8 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
 	trace_nfsd_preprocess(seqid, stateid);
 
 	*stpp = NULL;
-	status = nfsd4_lookup_stateid(cstate, stateid, typemask, &s, nn);
+	status = nfsd4_lookup_stateid(cstate, stateid,
+				      typemask, statusmask, &s, nn);
 	if (status)
 		return status;
 	stp = openlockstateid(s);
@@ -6870,7 +6864,7 @@ static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cs
 	struct nfs4_ol_stateid *stp;
 
 	status = nfs4_preprocess_seqid_op(cstate, seqid, stateid,
-						NFS4_OPEN_STID, &stp, nn);
+					  SC_TYPE_OPEN, 0, &stp, nn);
 	if (status)
 		return status;
 	oo = openowner(stp->st_stateowner);
@@ -6901,8 +6895,8 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		return status;
 
 	status = nfs4_preprocess_seqid_op(cstate,
-					oc->oc_seqid, &oc->oc_req_stateid,
-					NFS4_OPEN_STID, &stp, nn);
+					  oc->oc_seqid, &oc->oc_req_stateid,
+					  SC_TYPE_OPEN, 0, &stp, nn);
 	if (status)
 		goto out;
 	oo = openowner(stp->st_stateowner);
@@ -7033,18 +7027,20 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 	bool need_move_to_close_list;
 
-	dprintk("NFSD: nfsd4_close on file %pd\n", 
+	dprintk("NFSD: nfsd4_close on file %pd\n",
 			cstate->current_fh.fh_dentry);
 
 	status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid,
-					&close->cl_stateid,
-					NFS4_OPEN_STID|NFS4_CLOSED_STID,
-					&stp, nn);
+					  &close->cl_stateid,
+					  SC_TYPE_OPEN, SC_STATUS_CLOSED,
+					  &stp, nn);
 	nfsd4_bump_seqid(cstate, status);
 	if (status)
-		goto out; 
+		goto out;
 
-	stp->st_stid.sc_type = NFS4_CLOSED_STID;
+	spin_lock(&stp->st_stid.sc_client->cl_lock);
+	stp->st_stid.sc_status |= SC_STATUS_CLOSED;
+	spin_unlock(&stp->st_stid.sc_client->cl_lock);
 
 	/*
 	 * Technically we don't _really_ have to increment or copy it, since
@@ -7095,7 +7091,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
 		return status;
 
-	status = nfsd4_lookup_stateid(cstate, stateid, NFS4_DELEG_STID, &s, nn);
+	status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG, 0, &s, nn);
 	if (status)
 		goto out;
 	dp = delegstateid(s);
@@ -7362,7 +7358,7 @@ init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo,
 	if (retstp)
 		goto out_found;
 	refcount_inc(&stp->st_stid.sc_count);
-	stp->st_stid.sc_type = NFS4_LOCK_STID;
+	stp->st_stid.sc_type = SC_TYPE_LOCK;
 	stp->st_stateowner = nfs4_get_stateowner(&lo->lo_owner);
 	get_nfs4_file(fp);
 	stp->st_stid.sc_file = fp;
@@ -7549,9 +7545,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 							&lock_stp, &new);
 	} else {
 		status = nfs4_preprocess_seqid_op(cstate,
-				       lock->lk_old_lock_seqid,
-				       &lock->lk_old_lock_stateid,
-				       NFS4_LOCK_STID, &lock_stp, nn);
+						  lock->lk_old_lock_seqid,
+						  &lock->lk_old_lock_stateid,
+						  SC_TYPE_LOCK, 0, &lock_stp,
+						  nn);
 	}
 	if (status)
 		goto out;
@@ -7864,8 +7861,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		 return nfserr_inval;
 
 	status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid,
-					&locku->lu_stateid, NFS4_LOCK_STID,
-					&stp, nn);
+					  &locku->lu_stateid, SC_TYPE_LOCK, 0,
+					  &stp, nn);
 	if (status)
 		goto out;
 	nf = find_any_file(stp->st_stid.sc_file);
@@ -8303,7 +8300,7 @@ nfs4_state_shutdown_net(struct net *net)
 	spin_lock(&state_lock);
 	list_for_each_safe(pos, next, &nn->del_recall_lru) {
 		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
-		unhash_delegation_locked(dp, NFS4_CLOSED_DELEG_STID);
+		unhash_delegation_locked(dp, SC_STATUS_CLOSED);
 		list_add(&dp->dl_recall_lru, &reaplist);
 	}
 	spin_unlock(&state_lock);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 87c4372ba36a8d..1d4bf1a7d229c5 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -88,17 +88,33 @@ struct nfsd4_callback_ops {
  */
 struct nfs4_stid {
 	refcount_t		sc_count;
-#define NFS4_OPEN_STID 1
-#define NFS4_LOCK_STID 2
-#define NFS4_DELEG_STID 4
-/* For an open stateid kept around *only* to process close replays: */
-#define NFS4_CLOSED_STID 8
+
+	/* A new stateid is added to the cl_stateids idr early before it
+	 * is fully initialised.  Its sc_type is then zero.  After
+	 * initialisation the sc_type it set under cl_lock, and then
+	 * never changes.
+	 */
+#define SC_TYPE_OPEN		BIT(0)
+#define SC_TYPE_LOCK		BIT(1)
+#define SC_TYPE_DELEG		BIT(2)
+#define SC_TYPE_LAYOUT		BIT(3)
+	unsigned short		sc_type;
+
+/* state_lock protects sc_status for delegation stateids.
+ * ->cl_lock protects sc_status for open and lock stateids.
+ * ->st_mutex also protect sc_status for open stateids.
+ * ->ls_lock protects sc_status for layout stateids.
+ */
+/*
+ * For an open stateid kept around *only* to process close replays.
+ * For deleg stateid, kept in idr until last reference is dropped.
+ */
+#define SC_STATUS_CLOSED	BIT(0)
 /* For a deleg stateid kept around only to process free_stateid's: */
-#define NFS4_REVOKED_DELEG_STID 16
-#define NFS4_CLOSED_DELEG_STID 32
-#define NFS4_LAYOUT_STID 64
+#define SC_STATUS_REVOKED	BIT(1)
+	unsigned short		sc_status;
+
 	struct list_head	sc_cp_list;
-	unsigned char		sc_type;
 	stateid_t		sc_stateid;
 	spinlock_t		sc_lock;
 	struct nfs4_client	*sc_client;
@@ -672,15 +688,15 @@ extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
 		stateid_t *stateid, int flags, struct nfsd_file **filp,
 		struct nfs4_stid **cstid);
 __be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
-		     stateid_t *stateid, unsigned char typemask,
-		     struct nfs4_stid **s, struct nfsd_net *nn);
+			    stateid_t *stateid, unsigned short typemask,
+			    unsigned short statusmask,
+			    struct nfs4_stid **s, struct nfsd_net *nn);
 struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab,
 				  void (*sc_free)(struct nfs4_stid *));
 int nfs4_init_copy_state(struct nfsd_net *nn, struct nfsd4_copy *copy);
 void nfs4_free_copy_state(struct nfsd4_copy *copy);
 struct nfs4_cpntf_state *nfs4_alloc_init_cpntf_state(struct nfsd_net *nn,
 			struct nfs4_stid *p_stid);
-void nfs4_unhash_stid(struct nfs4_stid *s);
 void nfs4_put_stid(struct nfs4_stid *s);
 void nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid);
 void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *);
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 9f9e58debc2611..f87dad1fa1d66d 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -643,23 +643,17 @@ DEFINE_EVENT(nfsd_stateseqid_class, nfsd_##name, \
 DEFINE_STATESEQID_EVENT(preprocess);
 DEFINE_STATESEQID_EVENT(open_confirm);
 
-TRACE_DEFINE_ENUM(NFS4_OPEN_STID);
-TRACE_DEFINE_ENUM(NFS4_LOCK_STID);
-TRACE_DEFINE_ENUM(NFS4_DELEG_STID);
-TRACE_DEFINE_ENUM(NFS4_CLOSED_STID);
-TRACE_DEFINE_ENUM(NFS4_REVOKED_DELEG_STID);
-TRACE_DEFINE_ENUM(NFS4_CLOSED_DELEG_STID);
-TRACE_DEFINE_ENUM(NFS4_LAYOUT_STID);
-
 #define show_stid_type(x)						\
 	__print_flags(x, "|",						\
-		{ NFS4_OPEN_STID,		"OPEN" },		\
-		{ NFS4_LOCK_STID,		"LOCK" },		\
-		{ NFS4_DELEG_STID,		"DELEG" },		\
-		{ NFS4_CLOSED_STID,		"CLOSED" },		\
-		{ NFS4_REVOKED_DELEG_STID,	"REVOKED" },		\
-		{ NFS4_CLOSED_DELEG_STID,	"CLOSED_DELEG" },	\
-		{ NFS4_LAYOUT_STID,		"LAYOUT" })
+		{ SC_TYPE_OPEN,		"OPEN" },		\
+		{ SC_TYPE_LOCK,		"LOCK" },		\
+		{ SC_TYPE_DELEG,		"DELEG" },		\
+		{ SC_TYPE_LAYOUT,		"LAYOUT" })
+
+#define show_stid_status(x)						\
+	__print_flags(x, "|",						\
+		{ SC_STATUS_CLOSED,		"CLOSED" },		\
+		{ SC_STATUS_REVOKED,		"REVOKED" })		\
 
 DECLARE_EVENT_CLASS(nfsd_stid_class,
 	TP_PROTO(
@@ -668,6 +662,7 @@ DECLARE_EVENT_CLASS(nfsd_stid_class,
 	TP_ARGS(stid),
 	TP_STRUCT__entry(
 		__field(unsigned long, sc_type)
+		__field(unsigned long, sc_status)
 		__field(int, sc_count)
 		__field(u32, cl_boot)
 		__field(u32, cl_id)
@@ -678,16 +673,18 @@ DECLARE_EVENT_CLASS(nfsd_stid_class,
 		const stateid_t *stp = &stid->sc_stateid;
 
 		__entry->sc_type = stid->sc_type;
+		__entry->sc_status = stid->sc_status;
 		__entry->sc_count = refcount_read(&stid->sc_count);
 		__entry->cl_boot = stp->si_opaque.so_clid.cl_boot;
 		__entry->cl_id = stp->si_opaque.so_clid.cl_id;
 		__entry->si_id = stp->si_opaque.so_id;
 		__entry->si_generation = stp->si_generation;
 	),
-	TP_printk("client %08x:%08x stateid %08x:%08x ref=%d type=%s",
+	TP_printk("client %08x:%08x stateid %08x:%08x ref=%d type=%s state=%s",
 		__entry->cl_boot, __entry->cl_id,
 		__entry->si_id, __entry->si_generation,
-		__entry->sc_count, show_stid_type(__entry->sc_type)
+		__entry->sc_count, show_stid_type(__entry->sc_type),
+		show_stid_status(__entry->sc_status)
 	)
 );
 

From a020ae81970773ca59d5693f39e51fe8dcf8a389 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 30 Jan 2024 12:08:26 +1100
Subject: [PATCH 640/707] nfsd: prepare for supporting admin-revocation of
 state

The NFSv4 protocol allows state to be revoked by the admin and has error
codes which allow this to be communicated to the client.

This patch
 - introduces a new state-id status SC_STATUS_ADMIN_REVOKED
   which can be set on open, lock, or delegation state.
 - reports NFS4ERR_ADMIN_REVOKED when these are accessed
 - introduces a per-client counter of these states and returns
   SEQ4_STATUS_ADMIN_STATE_REVOKED when the counter is not zero.
   Decrements this when freeing any admin-revoked state.
 - introduces stub code to find all interesting states for a given
   superblock so they can be revoked via the 'unlock_filesystem'
   file in /proc/fs/nfsd/
   No actual states are handled yet.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4state.c | 85 ++++++++++++++++++++++++++++++++++++++++++++-
 fs/nfsd/nfsctl.c    |  1 +
 fs/nfsd/nfsd.h      |  1 +
 fs/nfsd/state.h     | 10 ++++++
 fs/nfsd/trace.h     |  3 +-
 5 files changed, 98 insertions(+), 2 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 58096ec81fb979..a19575571c059d 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1210,6 +1210,8 @@ nfs4_put_stid(struct nfs4_stid *s)
 		return;
 	}
 	idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id);
+	if (s->sc_status & SC_STATUS_ADMIN_REVOKED)
+		atomic_dec(&s->sc_client->cl_admin_revoked);
 	nfs4_free_cpntf_statelist(clp->net, s);
 	spin_unlock(&clp->cl_lock);
 	s->sc_free(s);
@@ -1529,6 +1531,8 @@ static void put_ol_stateid_locked(struct nfs4_ol_stateid *stp,
 	}
 
 	idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id);
+	if (s->sc_status & SC_STATUS_ADMIN_REVOKED)
+		atomic_dec(&s->sc_client->cl_admin_revoked);
 	list_add(&stp->st_locks, reaplist);
 }
 
@@ -1674,6 +1678,68 @@ static void release_openowner(struct nfs4_openowner *oo)
 	nfs4_put_stateowner(&oo->oo_owner);
 }
 
+static struct nfs4_stid *find_one_sb_stid(struct nfs4_client *clp,
+					  struct super_block *sb,
+					  unsigned int sc_types)
+{
+	unsigned long id, tmp;
+	struct nfs4_stid *stid;
+
+	spin_lock(&clp->cl_lock);
+	idr_for_each_entry_ul(&clp->cl_stateids, stid, tmp, id)
+		if ((stid->sc_type & sc_types) &&
+		    stid->sc_status == 0 &&
+		    stid->sc_file->fi_inode->i_sb == sb) {
+			refcount_inc(&stid->sc_count);
+			break;
+		}
+	spin_unlock(&clp->cl_lock);
+	return stid;
+}
+
+/**
+ * nfsd4_revoke_states - revoke all nfsv4 states associated with given filesystem
+ * @net:  used to identify instance of nfsd (there is one per net namespace)
+ * @sb:   super_block used to identify target filesystem
+ *
+ * All nfs4 states (open, lock, delegation, layout) held by the server instance
+ * and associated with a file on the given filesystem will be revoked resulting
+ * in any files being closed and so all references from nfsd to the filesystem
+ * being released.  Thus nfsd will no longer prevent the filesystem from being
+ * unmounted.
+ *
+ * The clients which own the states will subsequently being notified that the
+ * states have been "admin-revoked".
+ */
+void nfsd4_revoke_states(struct net *net, struct super_block *sb)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	unsigned int idhashval;
+	unsigned int sc_types;
+
+	sc_types = 0;
+
+	spin_lock(&nn->client_lock);
+	for (idhashval = 0; idhashval < CLIENT_HASH_MASK; idhashval++) {
+		struct list_head *head = &nn->conf_id_hashtbl[idhashval];
+		struct nfs4_client *clp;
+	retry:
+		list_for_each_entry(clp, head, cl_idhash) {
+			struct nfs4_stid *stid = find_one_sb_stid(clp, sb,
+								  sc_types);
+			if (stid) {
+				spin_unlock(&nn->client_lock);
+				switch (stid->sc_type) {
+				}
+				nfs4_put_stid(stid);
+				spin_lock(&nn->client_lock);
+				goto retry;
+			}
+		}
+	}
+	spin_unlock(&nn->client_lock);
+}
+
 static inline int
 hash_sessionid(struct nfs4_sessionid *sessionid)
 {
@@ -2545,6 +2611,8 @@ static int client_info_show(struct seq_file *m, void *v)
 	}
 	seq_printf(m, "callback state: %s\n", cb_state2str(clp->cl_cb_state));
 	seq_printf(m, "callback address: %pISpc\n", &clp->cl_cb_conn.cb_addr);
+	seq_printf(m, "admin-revoked states: %d\n",
+		   atomic_read(&clp->cl_admin_revoked));
 	drop_client(clp);
 
 	return 0;
@@ -4058,6 +4126,8 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	}
 	if (!list_empty(&clp->cl_revoked))
 		seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED;
+	if (atomic_read(&clp->cl_admin_revoked))
+		seq->status_flags |= SEQ4_STATUS_ADMIN_STATE_REVOKED;
 	trace_nfsd_seq4_status(rqstp, seq);
 out_no_session:
 	if (conn)
@@ -4547,7 +4617,9 @@ nfsd4_verify_open_stid(struct nfs4_stid *s)
 {
 	__be32 ret = nfs_ok;
 
-	if (s->sc_status & SC_STATUS_REVOKED)
+	if (s->sc_status & SC_STATUS_ADMIN_REVOKED)
+		ret = nfserr_admin_revoked;
+	else if (s->sc_status & SC_STATUS_REVOKED)
 		ret = nfserr_deleg_revoked;
 	else if (s->sc_status & SC_STATUS_CLOSED)
 		ret = nfserr_bad_stateid;
@@ -5138,6 +5210,11 @@ nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open,
 	deleg = find_deleg_stateid(cl, &open->op_delegate_stateid);
 	if (deleg == NULL)
 		goto out;
+	if (deleg->dl_stid.sc_status & SC_STATUS_ADMIN_REVOKED) {
+		nfs4_put_stid(&deleg->dl_stid);
+		status = nfserr_admin_revoked;
+		goto out;
+	}
 	if (deleg->dl_stid.sc_status & SC_STATUS_REVOKED) {
 		nfs4_put_stid(&deleg->dl_stid);
 		status = nfserr_deleg_revoked;
@@ -6444,6 +6521,8 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
 		 */
 		statusmask |= SC_STATUS_REVOKED;
 
+	statusmask |= SC_STATUS_ADMIN_REVOKED;
+
 	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) ||
 		CLOSE_STATEID(stateid))
 		return nfserr_bad_stateid;
@@ -6462,6 +6541,10 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
 		nfs4_put_stid(stid);
 		return nfserr_deleg_revoked;
 	}
+	if (stid->sc_status & SC_STATUS_ADMIN_REVOKED) {
+		nfs4_put_stid(stid);
+		return nfserr_admin_revoked;
+	}
 	*s = stid;
 	return nfs_ok;
 }
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 5a5547bd6ecf7e..ecd18bffeebc75 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -281,6 +281,7 @@ static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size)
 	 * 3.  Is that directory the root of an exported file system?
 	 */
 	error = nlmsvc_unlock_all_by_sb(path.dentry->d_sb);
+	nfsd4_revoke_states(netns(file), path.dentry->d_sb);
 
 	path_put(&path);
 	return error;
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index be2ea3d6d2a289..8daf22d766c60a 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -275,6 +275,7 @@ void		nfsd_lockd_shutdown(void);
 #define	nfserr_no_grace		cpu_to_be32(NFSERR_NO_GRACE)
 #define	nfserr_reclaim_bad	cpu_to_be32(NFSERR_RECLAIM_BAD)
 #define	nfserr_badname		cpu_to_be32(NFSERR_BADNAME)
+#define	nfserr_admin_revoked	cpu_to_be32(NFS4ERR_ADMIN_REVOKED)
 #define	nfserr_cb_path_down	cpu_to_be32(NFSERR_CB_PATH_DOWN)
 #define	nfserr_locked		cpu_to_be32(NFSERR_LOCKED)
 #define	nfserr_wrongsec		cpu_to_be32(NFSERR_WRONGSEC)
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 1d4bf1a7d229c5..be02bf1a16bdd9 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -112,6 +112,7 @@ struct nfs4_stid {
 #define SC_STATUS_CLOSED	BIT(0)
 /* For a deleg stateid kept around only to process free_stateid's: */
 #define SC_STATUS_REVOKED	BIT(1)
+#define SC_STATUS_ADMIN_REVOKED	BIT(2)
 	unsigned short		sc_status;
 
 	struct list_head	sc_cp_list;
@@ -367,6 +368,7 @@ struct nfs4_client {
 	clientid_t		cl_clientid;	/* generated by server */
 	nfs4_verifier		cl_confirm;	/* generated by server */
 	u32			cl_minorversion;
+	atomic_t		cl_admin_revoked; /* count of admin-revoked states */
 	/* NFSv4.1 client implementation id: */
 	struct xdr_netobj	cl_nii_domain;
 	struct xdr_netobj	cl_nii_name;
@@ -730,6 +732,14 @@ static inline void get_nfs4_file(struct nfs4_file *fi)
 }
 struct nfsd_file *find_any_file(struct nfs4_file *f);
 
+#ifdef CONFIG_NFSD_V4
+void nfsd4_revoke_states(struct net *net, struct super_block *sb);
+#else
+static inline void nfsd4_revoke_states(struct net *net, struct super_block *sb)
+{
+}
+#endif
+
 /* grace period management */
 void nfsd4_end_grace(struct nfsd_net *nn);
 
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index f87dad1fa1d66d..d8e56268a250ba 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -653,7 +653,8 @@ DEFINE_STATESEQID_EVENT(open_confirm);
 #define show_stid_status(x)						\
 	__print_flags(x, "|",						\
 		{ SC_STATUS_CLOSED,		"CLOSED" },		\
-		{ SC_STATUS_REVOKED,		"REVOKED" })		\
+		{ SC_STATUS_REVOKED,		"REVOKED" },		\
+		{ SC_STATUS_ADMIN_REVOKED,	"ADMIN_REVOKED" })
 
 DECLARE_EVENT_CLASS(nfsd_stid_class,
 	TP_PROTO(

From c563cc3d2f04a8a19d3e2963f2256fa3ae70445f Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 30 Jan 2024 12:08:27 +1100
Subject: [PATCH 641/707] nfsd: allow state with no file to appear in
 /proc/fs/nfsd/clients/*/states

Change the "show" functions to show some content even if a file cannot
be found.  This is the case for admin-revoked state.
This is primarily useful for debugging - to ensure states are being
removed eventually.

So change several seq_printf() to seq_puts().  Some of these are needed
to keep checkpatch happy.  Others were done for consistency.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4state.c | 118 ++++++++++++++++++++++----------------------
 1 file changed, 58 insertions(+), 60 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index a19575571c059d..a05b6ab81ecffa 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2554,9 +2554,9 @@ static struct nfs4_client *get_nfsdfs_clp(struct inode *inode)
 
 static void seq_quote_mem(struct seq_file *m, char *data, int len)
 {
-	seq_printf(m, "\"");
+	seq_puts(m, "\"");
 	seq_escape_mem(m, data, len, ESCAPE_HEX | ESCAPE_NAP | ESCAPE_APPEND, "\"\\");
-	seq_printf(m, "\"");
+	seq_puts(m, "\"");
 }
 
 static const char *cb_state2str(int state)
@@ -2597,14 +2597,14 @@ static int client_info_show(struct seq_file *m, void *v)
 		seq_puts(m, "status: unconfirmed\n");
 	seq_printf(m, "seconds from last renew: %lld\n",
 		ktime_get_boottime_seconds() - clp->cl_time);
-	seq_printf(m, "name: ");
+	seq_puts(m, "name: ");
 	seq_quote_mem(m, clp->cl_name.data, clp->cl_name.len);
 	seq_printf(m, "\nminor version: %d\n", clp->cl_minorversion);
 	if (clp->cl_nii_domain.data) {
-		seq_printf(m, "Implementation domain: ");
+		seq_puts(m, "Implementation domain: ");
 		seq_quote_mem(m, clp->cl_nii_domain.data,
 					clp->cl_nii_domain.len);
-		seq_printf(m, "\nImplementation name: ");
+		seq_puts(m, "\nImplementation name: ");
 		seq_quote_mem(m, clp->cl_nii_name.data, clp->cl_nii_name.len);
 		seq_printf(m, "\nImplementation time: [%lld, %ld]\n",
 			clp->cl_nii_time.tv_sec, clp->cl_nii_time.tv_nsec);
@@ -2671,7 +2671,7 @@ static void nfs4_show_superblock(struct seq_file *s, struct nfsd_file *f)
 
 static void nfs4_show_owner(struct seq_file *s, struct nfs4_stateowner *oo)
 {
-	seq_printf(s, "owner: ");
+	seq_puts(s, "owner: ");
 	seq_quote_mem(s, oo->so_owner.data, oo->so_owner.len);
 }
 
@@ -2689,20 +2689,13 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st)
 	struct nfs4_stateowner *oo;
 	unsigned int access, deny;
 
-	if (st->sc_type != SC_TYPE_OPEN && st->sc_type != SC_TYPE_LOCK)
-		return 0; /* XXX: or SEQ_SKIP? */
 	ols = openlockstateid(st);
 	oo = ols->st_stateowner;
 	nf = st->sc_file;
 
-	spin_lock(&nf->fi_lock);
-	file = find_any_file_locked(nf);
-	if (!file)
-		goto out;
-
-	seq_printf(s, "- ");
+	seq_puts(s, "- ");
 	nfs4_show_stateid(s, &st->sc_stateid);
-	seq_printf(s, ": { type: open, ");
+	seq_puts(s, ": { type: open, ");
 
 	access = bmap_to_share_mode(ols->st_access_bmap);
 	deny   = bmap_to_share_mode(ols->st_deny_bmap);
@@ -2714,14 +2707,17 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st)
 		deny & NFS4_SHARE_ACCESS_READ ? "r" : "-",
 		deny & NFS4_SHARE_ACCESS_WRITE ? "w" : "-");
 
-	nfs4_show_superblock(s, file);
-	seq_printf(s, ", ");
-	nfs4_show_fname(s, file);
-	seq_printf(s, ", ");
-	nfs4_show_owner(s, oo);
-	seq_printf(s, " }\n");
-out:
+	spin_lock(&nf->fi_lock);
+	file = find_any_file_locked(nf);
+	if (file) {
+		nfs4_show_superblock(s, file);
+		seq_puts(s, ", ");
+		nfs4_show_fname(s, file);
+		seq_puts(s, ", ");
+	}
 	spin_unlock(&nf->fi_lock);
+	nfs4_show_owner(s, oo);
+	seq_puts(s, " }\n");
 	return 0;
 }
 
@@ -2735,30 +2731,29 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st)
 	ols = openlockstateid(st);
 	oo = ols->st_stateowner;
 	nf = st->sc_file;
-	spin_lock(&nf->fi_lock);
-	file = find_any_file_locked(nf);
-	if (!file)
-		goto out;
 
-	seq_printf(s, "- ");
+	seq_puts(s, "- ");
 	nfs4_show_stateid(s, &st->sc_stateid);
-	seq_printf(s, ": { type: lock, ");
+	seq_puts(s, ": { type: lock, ");
 
-	/*
-	 * Note: a lock stateid isn't really the same thing as a lock,
-	 * it's the locking state held by one owner on a file, and there
-	 * may be multiple (or no) lock ranges associated with it.
-	 * (Same for the matter is true of open stateids.)
-	 */
+	spin_lock(&nf->fi_lock);
+	file = find_any_file_locked(nf);
+	if (file) {
+		/*
+		 * Note: a lock stateid isn't really the same thing as a lock,
+		 * it's the locking state held by one owner on a file, and there
+		 * may be multiple (or no) lock ranges associated with it.
+		 * (Same for the matter is true of open stateids.)
+		 */
 
-	nfs4_show_superblock(s, file);
-	/* XXX: open stateid? */
-	seq_printf(s, ", ");
-	nfs4_show_fname(s, file);
-	seq_printf(s, ", ");
+		nfs4_show_superblock(s, file);
+		/* XXX: open stateid? */
+		seq_puts(s, ", ");
+		nfs4_show_fname(s, file);
+		seq_puts(s, ", ");
+	}
 	nfs4_show_owner(s, oo);
-	seq_printf(s, " }\n");
-out:
+	seq_puts(s, " }\n");
 	spin_unlock(&nf->fi_lock);
 	return 0;
 }
@@ -2771,25 +2766,25 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st)
 
 	ds = delegstateid(st);
 	nf = st->sc_file;
-	spin_lock(&nf->fi_lock);
-	file = nf->fi_deleg_file;
-	if (!file)
-		goto out;
 
-	seq_printf(s, "- ");
+	seq_puts(s, "- ");
 	nfs4_show_stateid(s, &st->sc_stateid);
-	seq_printf(s, ": { type: deleg, ");
+	seq_puts(s, ": { type: deleg, ");
 
-	seq_printf(s, "access: %s, ",
-		ds->dl_type == NFS4_OPEN_DELEGATE_READ ? "r" : "w");
+	seq_printf(s, "access: %s",
+		   ds->dl_type == NFS4_OPEN_DELEGATE_READ ? "r" : "w");
 
 	/* XXX: lease time, whether it's being recalled. */
 
-	nfs4_show_superblock(s, file);
-	seq_printf(s, ", ");
-	nfs4_show_fname(s, file);
-	seq_printf(s, " }\n");
-out:
+	spin_lock(&nf->fi_lock);
+	file = nf->fi_deleg_file;
+	if (file) {
+		seq_puts(s, ", ");
+		nfs4_show_superblock(s, file);
+		seq_puts(s, ", ");
+		nfs4_show_fname(s, file);
+	}
+	seq_puts(s, " }\n");
 	spin_unlock(&nf->fi_lock);
 	return 0;
 }
@@ -2802,16 +2797,19 @@ static int nfs4_show_layout(struct seq_file *s, struct nfs4_stid *st)
 	ls = container_of(st, struct nfs4_layout_stateid, ls_stid);
 	file = ls->ls_file;
 
-	seq_printf(s, "- ");
+	seq_puts(s, "- ");
 	nfs4_show_stateid(s, &st->sc_stateid);
-	seq_printf(s, ": { type: layout, ");
+	seq_puts(s, ": { type: layout");
 
 	/* XXX: What else would be useful? */
 
-	nfs4_show_superblock(s, file);
-	seq_printf(s, ", ");
-	nfs4_show_fname(s, file);
-	seq_printf(s, " }\n");
+	if (file) {
+		seq_puts(s, ", ");
+		nfs4_show_superblock(s, file);
+		seq_puts(s, ", ");
+		nfs4_show_fname(s, file);
+	}
+	seq_puts(s, " }\n");
 
 	return 0;
 }

From 3ea830e6073c78cef16e39ab74a4c6311bf34624 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 30 Jan 2024 12:08:28 +1100
Subject: [PATCH 642/707] nfsd: report in /proc/fs/nfsd/clients/*/states when
 state is admin-revoke

Add "admin-revoked" to the status information for any states that have
been admin-revoked.  This can be useful for confirming correct
behaviour.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4state.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index a05b6ab81ecffa..823239f681538d 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2717,6 +2717,8 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st)
 	}
 	spin_unlock(&nf->fi_lock);
 	nfs4_show_owner(s, oo);
+	if (st->sc_status & SC_STATUS_ADMIN_REVOKED)
+		seq_puts(s, ", admin-revoked");
 	seq_puts(s, " }\n");
 	return 0;
 }
@@ -2753,6 +2755,8 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st)
 		seq_puts(s, ", ");
 	}
 	nfs4_show_owner(s, oo);
+	if (st->sc_status & SC_STATUS_ADMIN_REVOKED)
+		seq_puts(s, ", admin-revoked");
 	seq_puts(s, " }\n");
 	spin_unlock(&nf->fi_lock);
 	return 0;
@@ -2784,8 +2788,10 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st)
 		seq_puts(s, ", ");
 		nfs4_show_fname(s, file);
 	}
-	seq_puts(s, " }\n");
 	spin_unlock(&nf->fi_lock);
+	if (st->sc_status & SC_STATUS_ADMIN_REVOKED)
+		seq_puts(s, ", admin-revoked");
+	seq_puts(s, " }\n");
 	return 0;
 }
 
@@ -2809,6 +2815,8 @@ static int nfs4_show_layout(struct seq_file *s, struct nfs4_stid *st)
 		seq_puts(s, ", ");
 		nfs4_show_fname(s, file);
 	}
+	if (st->sc_status & SC_STATUS_ADMIN_REVOKED)
+		seq_puts(s, ", admin-revoked");
 	seq_puts(s, " }\n");
 
 	return 0;

From f83397a22ed1db59c89c63437892a49a9a1540c7 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 30 Jan 2024 12:08:29 +1100
Subject: [PATCH 643/707] nfsd: allow admin-revoked NFSv4.0 state to be freed.

For NFSv4.1 and later the client easily discovers if there is any
admin-revoked state and will then find and explicitly free it.

For NFSv4.0 there is no such mechanism.  The client can only find that
state is admin-revoked if it tries to use that state, and there is no
way for it to explicitly free the state.  So the server must hold on to
the stateid (at least) for an indefinite amount of time.  A
RELEASE_LOCKOWNER request might justify forgetting some of these
stateids, as would the whole clients lease lapsing, but these are not
reliable.

This patch takes two approaches.

Whenever a client uses an revoked stateid, that stateid is then
discarded and will not be recognised again.  This might confuse a client
which expect to get NFS4ERR_ADMIN_REVOKED consistently once it get it at
all, but should mostly work.  Hopefully one error will lead to other
resources being closed (e.g.  process exits), which will result in more
stateid being freed when a CLOSE attempt gets NFS4ERR_ADMIN_REVOKED.

Also, any admin-revoked stateids that have been that way for more than
one lease time are periodically revoke.

No actual freeing of state happens in this patch.  That will come in
future patches which handle the different sorts of revoked state.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/netns.h     |  4 ++
 fs/nfsd/nfs4state.c | 98 ++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 101 insertions(+), 1 deletion(-)

diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index afc16ee4da7428..d4be519b5734e3 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -209,6 +209,10 @@ struct nfsd_net {
 	atomic_t		nfsd_courtesy_clients;
 	struct shrinker		*nfsd_client_shrinker;
 	struct work_struct	nfsd_shrinker_work;
+
+	/* last time an admin-revoke happened for NFSv4.0 */
+	time64_t		nfs40_last_revoke;
+
 };
 
 /* Simple check to find out if a given net was properly initialized */
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 823239f681538d..ab7f4e25f2a11d 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1733,6 +1733,14 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb)
 				}
 				nfs4_put_stid(stid);
 				spin_lock(&nn->client_lock);
+				if (clp->cl_minorversion == 0)
+					/* Allow cleanup after a lease period.
+					 * store_release ensures cleanup will
+					 * see any newly revoked states if it
+					 * sees the time updated.
+					 */
+					nn->nfs40_last_revoke =
+						ktime_get_boottime_seconds();
 				goto retry;
 			}
 		}
@@ -4618,6 +4626,40 @@ nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open)
 	return ret;
 }
 
+static void nfsd4_drop_revoked_stid(struct nfs4_stid *s)
+	__releases(&s->sc_client->cl_lock)
+{
+	struct nfs4_client *cl = s->sc_client;
+
+	switch (s->sc_type) {
+	default:
+		spin_unlock(&cl->cl_lock);
+	}
+}
+
+static void nfsd40_drop_revoked_stid(struct nfs4_client *cl,
+				    stateid_t *stid)
+{
+	/* NFSv4.0 has no way for the client to tell the server
+	 * that it can forget an admin-revoked stateid.
+	 * So we keep it around until the first time that the
+	 * client uses it, and drop it the first time
+	 * nfserr_admin_revoked is returned.
+	 * For v4.1 and later we wait until explicitly told
+	 * to free the stateid.
+	 */
+	if (cl->cl_minorversion == 0) {
+		struct nfs4_stid *st;
+
+		spin_lock(&cl->cl_lock);
+		st = find_stateid_locked(cl, stid);
+		if (st)
+			nfsd4_drop_revoked_stid(st);
+		else
+			spin_unlock(&cl->cl_lock);
+	}
+}
+
 static __be32
 nfsd4_verify_open_stid(struct nfs4_stid *s)
 {
@@ -4640,6 +4682,10 @@ nfsd4_lock_ol_stateid(struct nfs4_ol_stateid *stp)
 
 	mutex_lock_nested(&stp->st_mutex, LOCK_STATEID_MUTEX);
 	ret = nfsd4_verify_open_stid(&stp->st_stid);
+	if (ret == nfserr_admin_revoked)
+		nfsd40_drop_revoked_stid(stp->st_stid.sc_client,
+					&stp->st_stid.sc_stateid);
+
 	if (ret != nfs_ok)
 		mutex_unlock(&stp->st_mutex);
 	return ret;
@@ -5223,6 +5269,7 @@ nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open,
 	}
 	if (deleg->dl_stid.sc_status & SC_STATUS_REVOKED) {
 		nfs4_put_stid(&deleg->dl_stid);
+		nfsd40_drop_revoked_stid(cl, &open->op_delegate_stateid);
 		status = nfserr_deleg_revoked;
 		goto out;
 	}
@@ -6207,6 +6254,43 @@ nfs4_process_client_reaplist(struct list_head *reaplist)
 	}
 }
 
+static void nfs40_clean_admin_revoked(struct nfsd_net *nn,
+				      struct laundry_time *lt)
+{
+	struct nfs4_client *clp;
+
+	spin_lock(&nn->client_lock);
+	if (nn->nfs40_last_revoke == 0 ||
+	    nn->nfs40_last_revoke > lt->cutoff) {
+		spin_unlock(&nn->client_lock);
+		return;
+	}
+	nn->nfs40_last_revoke = 0;
+
+retry:
+	list_for_each_entry(clp, &nn->client_lru, cl_lru) {
+		unsigned long id, tmp;
+		struct nfs4_stid *stid;
+
+		if (atomic_read(&clp->cl_admin_revoked) == 0)
+			continue;
+
+		spin_lock(&clp->cl_lock);
+		idr_for_each_entry_ul(&clp->cl_stateids, stid, tmp, id)
+			if (stid->sc_status & SC_STATUS_ADMIN_REVOKED) {
+				refcount_inc(&stid->sc_count);
+				spin_unlock(&nn->client_lock);
+				/* this function drops ->cl_lock */
+				nfsd4_drop_revoked_stid(stid);
+				nfs4_put_stid(stid);
+				spin_lock(&nn->client_lock);
+				goto retry;
+			}
+		spin_unlock(&clp->cl_lock);
+	}
+	spin_unlock(&nn->client_lock);
+}
+
 static time64_t
 nfs4_laundromat(struct nfsd_net *nn)
 {
@@ -6240,6 +6324,8 @@ nfs4_laundromat(struct nfsd_net *nn)
 	nfs4_get_client_reaplist(nn, &reaplist, &lt);
 	nfs4_process_client_reaplist(&reaplist);
 
+	nfs40_clean_admin_revoked(nn, &lt);
+
 	spin_lock(&state_lock);
 	list_for_each_safe(pos, next, &nn->del_recall_lru) {
 		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
@@ -6458,6 +6544,9 @@ static __be32 nfsd4_stid_check_stateid_generation(stateid_t *in, struct nfs4_sti
 	if (ret == nfs_ok)
 		ret = check_stateid_generation(in, &s->sc_stateid, has_session);
 	spin_unlock(&s->sc_lock);
+	if (ret == nfserr_admin_revoked)
+		nfsd40_drop_revoked_stid(s->sc_client,
+					&s->sc_stateid);
 	return ret;
 }
 
@@ -6502,6 +6591,8 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
 	}
 out_unlock:
 	spin_unlock(&cl->cl_lock);
+	if (status == nfserr_admin_revoked)
+		nfsd40_drop_revoked_stid(cl, stateid);
 	return status;
 }
 
@@ -6548,6 +6639,7 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
 		return nfserr_deleg_revoked;
 	}
 	if (stid->sc_status & SC_STATUS_ADMIN_REVOKED) {
+		nfsd40_drop_revoked_stid(cstate->clp, stateid);
 		nfs4_put_stid(stid);
 		return nfserr_admin_revoked;
 	}
@@ -6840,6 +6932,11 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	s = find_stateid_locked(cl, stateid);
 	if (!s || s->sc_status & SC_STATUS_CLOSED)
 		goto out_unlock;
+	if (s->sc_status & SC_STATUS_ADMIN_REVOKED) {
+		nfsd4_drop_revoked_stid(s);
+		ret = nfs_ok;
+		goto out;
+	}
 	spin_lock(&s->sc_lock);
 	switch (s->sc_type) {
 	case SC_TYPE_DELEG:
@@ -6866,7 +6963,6 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		spin_unlock(&cl->cl_lock);
 		ret = nfsd4_free_lock_stateid(stateid, s);
 		goto out;
-	/* Default falls through and returns nfserr_bad_stateid */
 	}
 	spin_unlock(&s->sc_lock);
 out_unlock:

From 7ba079c80c2c7eb7b56b6482df38c54f62513ca5 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 30 Jan 2024 12:08:30 +1100
Subject: [PATCH 644/707] nfsd: allow lock state ids to be revoked and then
 freed

Revoking state through 'unlock_filesystem' now revokes any lock states
found.  When the stateids are then freed by the client, the revoked
stateids will be cleaned up correctly.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4state.c | 40 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index ab7f4e25f2a11d..4d5b0a798a6a65 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1717,7 +1717,7 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb)
 	unsigned int idhashval;
 	unsigned int sc_types;
 
-	sc_types = 0;
+	sc_types = SC_TYPE_LOCK;
 
 	spin_lock(&nn->client_lock);
 	for (idhashval = 0; idhashval < CLIENT_HASH_MASK; idhashval++) {
@@ -1728,8 +1728,36 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb)
 			struct nfs4_stid *stid = find_one_sb_stid(clp, sb,
 								  sc_types);
 			if (stid) {
+				struct nfs4_ol_stateid *stp;
+
 				spin_unlock(&nn->client_lock);
 				switch (stid->sc_type) {
+				case SC_TYPE_LOCK:
+					stp = openlockstateid(stid);
+					mutex_lock_nested(&stp->st_mutex,
+							  LOCK_STATEID_MUTEX);
+					spin_lock(&clp->cl_lock);
+					if (stid->sc_status == 0) {
+						struct nfs4_lockowner *lo =
+							lockowner(stp->st_stateowner);
+						struct nfsd_file *nf;
+
+						stid->sc_status |=
+							SC_STATUS_ADMIN_REVOKED;
+						atomic_inc(&clp->cl_admin_revoked);
+						spin_unlock(&clp->cl_lock);
+						nf = find_any_file(stp->st_stid.sc_file);
+						if (nf) {
+							get_file(nf->nf_file);
+							filp_close(nf->nf_file,
+								   (fl_owner_t)lo);
+							nfsd_file_put(nf);
+						}
+						release_all_access(stp);
+					} else
+						spin_unlock(&clp->cl_lock);
+					mutex_unlock(&stp->st_mutex);
+					break;
 				}
 				nfs4_put_stid(stid);
 				spin_lock(&nn->client_lock);
@@ -4630,8 +4658,18 @@ static void nfsd4_drop_revoked_stid(struct nfs4_stid *s)
 	__releases(&s->sc_client->cl_lock)
 {
 	struct nfs4_client *cl = s->sc_client;
+	LIST_HEAD(reaplist);
+	struct nfs4_ol_stateid *stp;
+	bool unhashed;
 
 	switch (s->sc_type) {
+	case SC_TYPE_LOCK:
+		stp = openlockstateid(s);
+		unhashed = unhash_lock_stateid(stp);
+		spin_unlock(&cl->cl_lock);
+		if (unhashed)
+			nfs4_put_stid(s);
+		break;
 	default:
 		spin_unlock(&cl->cl_lock);
 	}

From 4988411481a938927a929afe6be4803c0bf695ae Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 30 Jan 2024 12:08:31 +1100
Subject: [PATCH 645/707] nfsd: allow open state ids to be revoked and then
 freed

Revoking state through 'unlock_filesystem' now revokes any open states
found.  When the stateids are then freed by the client, the revoked
stateids will be cleaned up correctly.

Possibly the related lock states should be revoked too, but a
subsequent patch will do that for all lock state on the superblock.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4state.c | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 4d5b0a798a6a65..daf61f26e609e1 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1717,7 +1717,7 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb)
 	unsigned int idhashval;
 	unsigned int sc_types;
 
-	sc_types = SC_TYPE_LOCK;
+	sc_types = SC_TYPE_OPEN | SC_TYPE_LOCK;
 
 	spin_lock(&nn->client_lock);
 	for (idhashval = 0; idhashval < CLIENT_HASH_MASK; idhashval++) {
@@ -1732,6 +1732,22 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb)
 
 				spin_unlock(&nn->client_lock);
 				switch (stid->sc_type) {
+				case SC_TYPE_OPEN:
+					stp = openlockstateid(stid);
+					mutex_lock_nested(&stp->st_mutex,
+							  OPEN_STATEID_MUTEX);
+
+					spin_lock(&clp->cl_lock);
+					if (stid->sc_status == 0) {
+						stid->sc_status |=
+							SC_STATUS_ADMIN_REVOKED;
+						atomic_inc(&clp->cl_admin_revoked);
+						spin_unlock(&clp->cl_lock);
+						release_all_access(stp);
+					} else
+						spin_unlock(&clp->cl_lock);
+					mutex_unlock(&stp->st_mutex);
+					break;
 				case SC_TYPE_LOCK:
 					stp = openlockstateid(stid);
 					mutex_lock_nested(&stp->st_mutex,
@@ -4663,6 +4679,13 @@ static void nfsd4_drop_revoked_stid(struct nfs4_stid *s)
 	bool unhashed;
 
 	switch (s->sc_type) {
+	case SC_TYPE_OPEN:
+		stp = openlockstateid(s);
+		if (unhash_open_stateid(stp, &reaplist))
+			put_ol_stateid_locked(stp, &reaplist);
+		spin_unlock(&cl->cl_lock);
+		free_ol_stateid_reaplist(&reaplist);
+		break;
 	case SC_TYPE_LOCK:
 		stp = openlockstateid(s);
 		unhashed = unhash_lock_stateid(stp);

From e221f3b9aeef4b2d62da9b20d28131bb2a107620 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 30 Jan 2024 12:08:32 +1100
Subject: [PATCH 646/707] nfsd: allow delegation state ids to be revoked and
 then freed

Revoking state through 'unlock_filesystem' now revokes any delegation
states found.  When the stateids are then freed by the client, the
revoked stateids will be cleaned up correctly.

As there is already support for revoking delegations, we build on that
for admin-revoking.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4state.c | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index daf61f26e609e1..fe21af8dfc68e5 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1335,9 +1335,12 @@ unhash_delegation_locked(struct nfs4_delegation *dp, unsigned short statusmask)
 	if (!delegation_hashed(dp))
 		return false;
 
-	if (dp->dl_stid.sc_client->cl_minorversion == 0)
+	if (statusmask == SC_STATUS_REVOKED &&
+	    dp->dl_stid.sc_client->cl_minorversion == 0)
 		statusmask = SC_STATUS_CLOSED;
 	dp->dl_stid.sc_status |= statusmask;
+	if (statusmask & SC_STATUS_ADMIN_REVOKED)
+		atomic_inc(&dp->dl_stid.sc_client->cl_admin_revoked);
 
 	/* Ensure that deleg break won't try to requeue it */
 	++dp->dl_time;
@@ -1368,7 +1371,8 @@ static void revoke_delegation(struct nfs4_delegation *dp)
 
 	trace_nfsd_stid_revoke(&dp->dl_stid);
 
-	if (dp->dl_stid.sc_status & SC_STATUS_REVOKED) {
+	if (dp->dl_stid.sc_status &
+	    (SC_STATUS_REVOKED | SC_STATUS_ADMIN_REVOKED)) {
 		spin_lock(&clp->cl_lock);
 		refcount_inc(&dp->dl_stid.sc_count);
 		list_add(&dp->dl_recall_lru, &clp->cl_revoked);
@@ -1717,7 +1721,7 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb)
 	unsigned int idhashval;
 	unsigned int sc_types;
 
-	sc_types = SC_TYPE_OPEN | SC_TYPE_LOCK;
+	sc_types = SC_TYPE_OPEN | SC_TYPE_LOCK | SC_TYPE_DELEG;
 
 	spin_lock(&nn->client_lock);
 	for (idhashval = 0; idhashval < CLIENT_HASH_MASK; idhashval++) {
@@ -1729,6 +1733,7 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb)
 								  sc_types);
 			if (stid) {
 				struct nfs4_ol_stateid *stp;
+				struct nfs4_delegation *dp;
 
 				spin_unlock(&nn->client_lock);
 				switch (stid->sc_type) {
@@ -1774,6 +1779,16 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb)
 						spin_unlock(&clp->cl_lock);
 					mutex_unlock(&stp->st_mutex);
 					break;
+				case SC_TYPE_DELEG:
+					dp = delegstateid(stid);
+					spin_lock(&state_lock);
+					if (!unhash_delegation_locked(
+						    dp, SC_STATUS_ADMIN_REVOKED))
+						dp = NULL;
+					spin_unlock(&state_lock);
+					if (dp)
+						revoke_delegation(dp);
+					break;
 				}
 				nfs4_put_stid(stid);
 				spin_lock(&nn->client_lock);
@@ -4676,6 +4691,7 @@ static void nfsd4_drop_revoked_stid(struct nfs4_stid *s)
 	struct nfs4_client *cl = s->sc_client;
 	LIST_HEAD(reaplist);
 	struct nfs4_ol_stateid *stp;
+	struct nfs4_delegation *dp;
 	bool unhashed;
 
 	switch (s->sc_type) {
@@ -4693,6 +4709,12 @@ static void nfsd4_drop_revoked_stid(struct nfs4_stid *s)
 		if (unhashed)
 			nfs4_put_stid(s);
 		break;
+	case SC_TYPE_DELEG:
+		dp = delegstateid(s);
+		list_del_init(&dp->dl_recall_lru);
+		spin_unlock(&cl->cl_lock);
+		nfs4_put_stid(s);
+		break;
 	default:
 		spin_unlock(&cl->cl_lock);
 	}

From f46d3e4afbb59769808dd4304cfede8fe106f60a Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 30 Jan 2024 12:08:33 +1100
Subject: [PATCH 647/707] nfsd: allow layout state to be admin-revoked.

When there is layout state on a filesystem that is being "unlocked" that
is now revoked, which involves closing the nfsd_file and releasing the
vfs lease.

To avoid races, ->ls_file can now be accessed either:
 - under ->fi_lock for the state's sc_file or
 - under rcu_read_lock() if nfsd_file_get() is used.
To support this, ->fence_client and nfsd4_cb_layout_fail() now take a
second argument being the nfsd_file.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/blocklayout.c |  4 ++--
 fs/nfsd/nfs4layouts.c | 43 ++++++++++++++++++++++++++++++++-----------
 fs/nfsd/nfs4state.c   | 11 +++++++++--
 fs/nfsd/pnfs.h        |  8 +++++++-
 4 files changed, 50 insertions(+), 16 deletions(-)

diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 46fd74d91ea929..3c040c81c77d01 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -328,10 +328,10 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode,
 }
 
 static void
-nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls)
+nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file)
 {
 	struct nfs4_client *clp = ls->ls_stid.sc_client;
-	struct block_device *bdev = ls->ls_file->nf_file->f_path.mnt->mnt_sb->s_bdev;
+	struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev;
 
 	bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY,
 			nfsd4_scsi_pr_key(clp), 0, true);
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 857b822450b4fe..1cfd61db247297 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -152,6 +152,23 @@ void nfsd4_setup_layout_type(struct svc_export *exp)
 #endif
 }
 
+void nfsd4_close_layout(struct nfs4_layout_stateid *ls)
+{
+	struct nfsd_file *fl;
+
+	spin_lock(&ls->ls_stid.sc_file->fi_lock);
+	fl = ls->ls_file;
+	ls->ls_file = NULL;
+	spin_unlock(&ls->ls_stid.sc_file->fi_lock);
+
+	if (fl) {
+		if (!nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls)
+			vfs_setlease(fl->nf_file, F_UNLCK, NULL,
+				     (void **)&ls);
+		nfsd_file_put(fl);
+	}
+}
+
 static void
 nfsd4_free_layout_stateid(struct nfs4_stid *stid)
 {
@@ -169,9 +186,7 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid)
 	list_del_init(&ls->ls_perfile);
 	spin_unlock(&fp->fi_lock);
 
-	if (!nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls)
-		vfs_setlease(ls->ls_file->nf_file, F_UNLCK, NULL, (void **)&ls);
-	nfsd_file_put(ls->ls_file);
+	nfsd4_close_layout(ls);
 
 	if (ls->ls_recalled)
 		atomic_dec(&ls->ls_stid.sc_file->fi_lo_recalls);
@@ -605,7 +620,7 @@ nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp)
 }
 
 static void
-nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
+nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls, struct nfsd_file *file)
 {
 	struct nfs4_client *clp = ls->ls_stid.sc_client;
 	char addr_str[INET6_ADDRSTRLEN];
@@ -627,7 +642,7 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
 
 	argv[0] = (char *)nfsd_recall_failed;
 	argv[1] = addr_str;
-	argv[2] = ls->ls_file->nf_file->f_path.mnt->mnt_sb->s_id;
+	argv[2] = file->nf_file->f_path.mnt->mnt_sb->s_id;
 	argv[3] = NULL;
 
 	error = call_usermodehelper(nfsd_recall_failed, argv, envp,
@@ -657,6 +672,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
 	struct nfsd_net *nn;
 	ktime_t now, cutoff;
 	const struct nfsd4_layout_ops *ops;
+	struct nfsd_file *fl;
 
 	trace_nfsd_cb_layout_done(&ls->ls_stid.sc_stateid, task);
 	switch (task->tk_status) {
@@ -688,12 +704,17 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
 		 * Unknown error or non-responding client, we'll need to fence.
 		 */
 		trace_nfsd_layout_recall_fail(&ls->ls_stid.sc_stateid);
-
-		ops = nfsd4_layout_ops[ls->ls_layout_type];
-		if (ops->fence_client)
-			ops->fence_client(ls);
-		else
-			nfsd4_cb_layout_fail(ls);
+		rcu_read_lock();
+		fl = nfsd_file_get(ls->ls_file);
+		rcu_read_unlock();
+		if (fl) {
+			ops = nfsd4_layout_ops[ls->ls_layout_type];
+			if (ops->fence_client)
+				ops->fence_client(ls, fl);
+			else
+				nfsd4_cb_layout_fail(ls, fl);
+			nfsd_file_put(fl);
+		}
 		return 1;
 	case -NFS4ERR_NOMATCHING_LAYOUT:
 		trace_nfsd_layout_recall_done(&ls->ls_stid.sc_stateid);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index fe21af8dfc68e5..a66d66b9f76918 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1721,7 +1721,7 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb)
 	unsigned int idhashval;
 	unsigned int sc_types;
 
-	sc_types = SC_TYPE_OPEN | SC_TYPE_LOCK | SC_TYPE_DELEG;
+	sc_types = SC_TYPE_OPEN | SC_TYPE_LOCK | SC_TYPE_DELEG | SC_TYPE_LAYOUT;
 
 	spin_lock(&nn->client_lock);
 	for (idhashval = 0; idhashval < CLIENT_HASH_MASK; idhashval++) {
@@ -1734,6 +1734,7 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb)
 			if (stid) {
 				struct nfs4_ol_stateid *stp;
 				struct nfs4_delegation *dp;
+				struct nfs4_layout_stateid *ls;
 
 				spin_unlock(&nn->client_lock);
 				switch (stid->sc_type) {
@@ -1789,6 +1790,10 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb)
 					if (dp)
 						revoke_delegation(dp);
 					break;
+				case SC_TYPE_LAYOUT:
+					ls = layoutstateid(stid);
+					nfsd4_close_layout(ls);
+					break;
 				}
 				nfs4_put_stid(stid);
 				spin_lock(&nn->client_lock);
@@ -2868,7 +2873,6 @@ static int nfs4_show_layout(struct seq_file *s, struct nfs4_stid *st)
 	struct nfsd_file *file;
 
 	ls = container_of(st, struct nfs4_layout_stateid, ls_stid);
-	file = ls->ls_file;
 
 	seq_puts(s, "- ");
 	nfs4_show_stateid(s, &st->sc_stateid);
@@ -2876,12 +2880,15 @@ static int nfs4_show_layout(struct seq_file *s, struct nfs4_stid *st)
 
 	/* XXX: What else would be useful? */
 
+	spin_lock(&ls->ls_stid.sc_file->fi_lock);
+	file = ls->ls_file;
 	if (file) {
 		seq_puts(s, ", ");
 		nfs4_show_superblock(s, file);
 		seq_puts(s, ", ");
 		nfs4_show_fname(s, file);
 	}
+	spin_unlock(&ls->ls_stid.sc_file->fi_lock);
 	if (st->sc_status & SC_STATUS_ADMIN_REVOKED)
 		seq_puts(s, ", admin-revoked");
 	seq_puts(s, " }\n");
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
index de1e0dfed06a23..925817f669176c 100644
--- a/fs/nfsd/pnfs.h
+++ b/fs/nfsd/pnfs.h
@@ -37,7 +37,8 @@ struct nfsd4_layout_ops {
 	__be32 (*proc_layoutcommit)(struct inode *inode,
 			struct nfsd4_layoutcommit *lcp);
 
-	void (*fence_client)(struct nfs4_layout_stateid *ls);
+	void (*fence_client)(struct nfs4_layout_stateid *ls,
+			     struct nfsd_file *file);
 };
 
 extern const struct nfsd4_layout_ops *nfsd4_layout_ops[];
@@ -72,11 +73,13 @@ void nfsd4_setup_layout_type(struct svc_export *exp);
 void nfsd4_return_all_client_layouts(struct nfs4_client *);
 void nfsd4_return_all_file_layouts(struct nfs4_client *clp,
 		struct nfs4_file *fp);
+void nfsd4_close_layout(struct nfs4_layout_stateid *ls);
 int nfsd4_init_pnfs(void);
 void nfsd4_exit_pnfs(void);
 #else
 struct nfs4_client;
 struct nfs4_file;
+struct nfs4_layout_stateid;
 
 static inline void nfsd4_setup_layout_type(struct svc_export *exp)
 {
@@ -89,6 +92,9 @@ static inline void nfsd4_return_all_file_layouts(struct nfs4_client *clp,
 		struct nfs4_file *fp)
 {
 }
+static inline void nfsd4_close_layout(struct nfs4_layout_stateid *ls)
+{
+}
 static inline void nfsd4_exit_pnfs(void)
 {
 }

From 23fed3b5e4abb0d0ce72191f7cc6dcec09fcfa76 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 31 Jan 2024 11:17:40 +1100
Subject: [PATCH 648/707] nfsd: don't call locks_release_private() twice
 concurrently

It is possible for free_blocked_lock() to be called twice concurrently,
once from nfsd4_lock() and once from nfsd4_release_lockowner() calling
remove_blocked_locks().  This is why a kref was added.

It is perfectly safe for locks_delete_block() and kref_put() to be
called in parallel as they use locking or atomicity respectively as
protection.  However locks_release_private() has no locking.  It is
safe for it to be called twice sequentially, but not concurrently.

This patch moves that call from free_blocked_lock() where it could race
with itself, to free_nbl() where it cannot.  This will slightly delay
the freeing of private info or release of the owner - but not by much.
It is arguably more natural for this freeing to happen in free_nbl()
where the structure itself is freed.

This bug was found by code inspection - it has not been seen in practice.

Fixes: 47446d74f170 ("nfsd4: add refcount for nfsd4_blocked_lock")
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4state.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index a66d66b9f76918..12534e12dbb363 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -318,6 +318,7 @@ free_nbl(struct kref *kref)
 	struct nfsd4_blocked_lock *nbl;
 
 	nbl = container_of(kref, struct nfsd4_blocked_lock, nbl_kref);
+	locks_release_private(&nbl->nbl_lock);
 	kfree(nbl);
 }
 
@@ -325,7 +326,6 @@ static void
 free_blocked_lock(struct nfsd4_blocked_lock *nbl)
 {
 	locks_delete_block(&nbl->nbl_lock);
-	locks_release_private(&nbl->nbl_lock);
 	kref_put(&nbl->nbl_kref, free_nbl);
 }
 

From 74967bfbffe468e2ddc4e5bc8f3bfb0c2c8a2888 Mon Sep 17 00:00:00 2001
From: Kunwu Chan <chentao@kylinos.cn>
Date: Wed, 31 Jan 2024 14:22:27 +0800
Subject: [PATCH 649/707] nfsd: Simplify the allocation of slab caches in
 nfsd4_init_pnfs

commit 0a31bd5f2bbb ("KMEM_CACHE(): simplify slab cache creation")
introduces a new macro.
Use the new KMEM_CACHE() macro instead of direct kmem_cache_create
to simplify the creation of SLAB caches.

Signed-off-by: Kunwu Chan <chentao@kylinos.cn>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4layouts.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 1cfd61db247297..b1e585c1d9a3aa 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -777,13 +777,11 @@ nfsd4_init_pnfs(void)
 	for (i = 0; i < DEVID_HASH_SIZE; i++)
 		INIT_LIST_HEAD(&nfsd_devid_hash[i]);
 
-	nfs4_layout_cache = kmem_cache_create("nfs4_layout",
-			sizeof(struct nfs4_layout), 0, 0, NULL);
+	nfs4_layout_cache = KMEM_CACHE(nfs4_layout, 0);
 	if (!nfs4_layout_cache)
 		return -ENOMEM;
 
-	nfs4_layout_stateid_cache = kmem_cache_create("nfs4_layout_stateid",
-			sizeof(struct nfs4_layout_stateid), 0, 0, NULL);
+	nfs4_layout_stateid_cache = KMEM_CACHE(nfs4_layout_stateid, 0);
 	if (!nfs4_layout_stateid_cache) {
 		kmem_cache_destroy(nfs4_layout_cache);
 		return -ENOMEM;

From 6c1c91f97746154611770e14f9be4d65a52f77c6 Mon Sep 17 00:00:00 2001
From: Kunwu Chan <chentao@kylinos.cn>
Date: Wed, 31 Jan 2024 14:56:53 +0800
Subject: [PATCH 650/707] nfsd: Simplify the allocation of slab caches in
 nfsd_file_cache_init

Use the new KMEM_CACHE() macro instead of direct kmem_cache_create
to simplify the creation of SLAB caches.

Signed-off-by: Kunwu Chan <chentao@kylinos.cn>
Acked-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/filecache.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 8d9f7b07e35b39..f3a642fd0ecaa8 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -722,15 +722,13 @@ nfsd_file_cache_init(void)
 		return ret;
 
 	ret = -ENOMEM;
-	nfsd_file_slab = kmem_cache_create("nfsd_file",
-				sizeof(struct nfsd_file), 0, 0, NULL);
+	nfsd_file_slab = KMEM_CACHE(nfsd_file, 0);
 	if (!nfsd_file_slab) {
 		pr_err("nfsd: unable to create nfsd_file_slab\n");
 		goto out_err;
 	}
 
-	nfsd_file_mark_slab = kmem_cache_create("nfsd_file_mark",
-					sizeof(struct nfsd_file_mark), 0, 0, NULL);
+	nfsd_file_mark_slab = KMEM_CACHE(nfsd_file_mark, 0);
 	if (!nfsd_file_mark_slab) {
 		pr_err("nfsd: unable to create nfsd_file_mark_slab\n");
 		goto out_err;

From 2c30c237b18027aa9d29c5d99f988541791f80ae Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 31 Jan 2024 14:26:02 +0100
Subject: [PATCH 651/707] pidfd: implement PIDFD_THREAD flag for pidfd_open()

With this flag:

	- pidfd_open() doesn't require that the target task must be
	  a thread-group leader

	- pidfd_poll() succeeds when the task exits and becomes a
	  zombie (iow, passes exit_notify()), even if it is a leader
	  and thread-group is not empty.

	  This means that the behaviour of pidfd_poll(PIDFD_THREAD,
	  pid-of-group-leader) is not well defined if it races with
	  exec() from its sub-thread; pidfd_poll() can succeed or not
	  depending on whether pidfd_task_exited() is called before
	  or after exchange_tids().

	  Perhaps we can improve this behaviour later, pidfd_poll()
	  can probably take sig->group_exec_task into account. But
	  this doesn't really differ from the case when the leader
	  exits before other threads (so pidfd_poll() succeeds) and
	  then another thread execs and pidfd_poll() will block again.

thread_group_exited() is no longer used, perhaps it can die.

Co-developed-by: Tycho Andersen <tycho@tycho.pizza>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lore.kernel.org/r/20240131132602.GA23641@redhat.com
Tested-by: Tycho Andersen <tandersen@netflix.com>
Reviewed-by: Tycho Andersen <tandersen@netflix.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/exec.c                  |  6 +++++-
 include/linux/pid.h        |  3 ++-
 include/uapi/linux/pidfd.h |  3 ++-
 kernel/exit.c              |  7 +++++++
 kernel/fork.c              | 38 +++++++++++++++++++++++++++++++-------
 kernel/pid.c               | 14 +++-----------
 kernel/signal.c            |  6 ++++--
 7 files changed, 54 insertions(+), 23 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 8cdd5b2dd09c2e..b68f61bbcaa82e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1143,7 +1143,11 @@ static int de_thread(struct task_struct *tsk)
 
 		BUG_ON(leader->exit_state != EXIT_ZOMBIE);
 		leader->exit_state = EXIT_DEAD;
-
+		/*
+		 * leader and tsk exhanged their pids, the old pid dies,
+		 * wake up the PIDFD_THREAD waiters.
+		 */
+		do_notify_pidfd(leader);
 		/*
 		 * We are going to release_task()->ptrace_unlink() silently,
 		 * the tracer can sleep in do_wait(). EXIT_DEAD guarantees
diff --git a/include/linux/pid.h b/include/linux/pid.h
index e6a041cb8bacc7..8124d57752b938 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -70,10 +70,11 @@ extern const struct file_operations pidfd_fops;
 
 struct file;
 
-extern struct pid *pidfd_pid(const struct file *file);
+struct pid *pidfd_pid(const struct file *file);
 struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags);
 struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags);
 int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret);
+void do_notify_pidfd(struct task_struct *task);
 
 static inline struct pid *get_pid(struct pid *pid)
 {
diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
index 5406fbc1307489..2e6461459877ba 100644
--- a/include/uapi/linux/pidfd.h
+++ b/include/uapi/linux/pidfd.h
@@ -7,6 +7,7 @@
 #include <linux/fcntl.h>
 
 /* Flags for pidfd_open().  */
-#define PIDFD_NONBLOCK O_NONBLOCK
+#define PIDFD_NONBLOCK	O_NONBLOCK
+#define PIDFD_THREAD	O_EXCL
 
 #endif /* _UAPI_LINUX_PIDFD_H */
diff --git a/kernel/exit.c b/kernel/exit.c
index 3988a02efaef06..c038d10dfb3868 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -739,6 +739,13 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 		kill_orphaned_pgrp(tsk->group_leader, NULL);
 
 	tsk->exit_state = EXIT_ZOMBIE;
+	/*
+	 * sub-thread or delay_group_leader(), wake up the
+	 * PIDFD_THREAD waiters.
+	 */
+	if (!thread_group_empty(tsk))
+		do_notify_pidfd(tsk);
+
 	if (unlikely(tsk->ptrace)) {
 		int sig = thread_group_leader(tsk) &&
 				thread_group_empty(tsk) &&
diff --git a/kernel/fork.c b/kernel/fork.c
index 726a92043531ff..1a9b910559169c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -101,6 +101,7 @@
 #include <linux/user_events.h>
 #include <linux/iommu.h>
 #include <linux/rseq.h>
+#include <uapi/linux/pidfd.h>
 
 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
@@ -2050,6 +2051,8 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
 
 	seq_put_decimal_ll(m, "Pid:\t", nr);
 
+	/* TODO: report PIDFD_THREAD */
+
 #ifdef CONFIG_PID_NS
 	seq_put_decimal_ll(m, "\nNSpid:\t", nr);
 	if (nr > 0) {
@@ -2068,22 +2071,35 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
 }
 #endif
 
+static bool pidfd_task_exited(struct pid *pid, bool thread)
+{
+	struct task_struct *task;
+	bool exited;
+
+	rcu_read_lock();
+	task = pid_task(pid, PIDTYPE_PID);
+	exited = !task ||
+		(READ_ONCE(task->exit_state) && (thread || thread_group_empty(task)));
+	rcu_read_unlock();
+
+	return exited;
+}
+
 /*
  * Poll support for process exit notification.
  */
 static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
 {
 	struct pid *pid = file->private_data;
+	bool thread = file->f_flags & PIDFD_THREAD;
 	__poll_t poll_flags = 0;
 
 	poll_wait(file, &pid->wait_pidfd, pts);
-
 	/*
-	 * Inform pollers only when the whole thread group exits.
-	 * If the thread group leader exits before all other threads in the
-	 * group, then poll(2) should block, similar to the wait(2) family.
+	 * Depending on PIDFD_THREAD, inform pollers when the thread
+	 * or the whole thread-group exits.
 	 */
-	if (thread_group_exited(pid))
+	if (pidfd_task_exited(pid, thread))
 		poll_flags = EPOLLIN | EPOLLRDNORM;
 
 	return poll_flags;
@@ -2141,6 +2157,11 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
 		return PTR_ERR(pidfd_file);
 	}
 	get_pid(pid); /* held by pidfd_file now */
+	/*
+	 * anon_inode_getfile() ignores everything outside of the
+	 * O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually.
+	 */
+	pidfd_file->f_flags |= (flags & PIDFD_THREAD);
 	*ret = pidfd_file;
 	return pidfd;
 }
@@ -2154,7 +2175,8 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
  * Allocate a new file that stashes @pid and reserve a new pidfd number in the
  * caller's file descriptor table. The pidfd is reserved but not installed yet.
  *
- * The helper verifies that @pid is used as a thread group leader.
+ * The helper verifies that @pid is still in use, without PIDFD_THREAD the
+ * task identified by @pid must be a thread-group leader.
  *
  * If this function returns successfully the caller is responsible to either
  * call fd_install() passing the returned pidfd and pidfd file as arguments in
@@ -2173,7 +2195,9 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
  */
 int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
 {
-	if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
+	bool thread = flags & PIDFD_THREAD;
+
+	if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID))
 		return -EINVAL;
 
 	return __pidfd_prepare(pid, flags, ret);
diff --git a/kernel/pid.c b/kernel/pid.c
index c7a3e359f8f590..e1114446682802 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -552,11 +552,6 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
  * Return the task associated with @pidfd. The function takes a reference on
  * the returned task. The caller is responsible for releasing that reference.
  *
- * Currently, the process identified by @pidfd is always a thread-group leader.
- * This restriction currently exists for all aspects of pidfds including pidfd
- * creation (CLONE_PIDFD cannot be used with CLONE_THREAD) and pidfd polling
- * (only supports thread group leaders).
- *
  * Return: On success, the task_struct associated with the pidfd.
  *	   On error, a negative errno number will be returned.
  */
@@ -615,11 +610,8 @@ static int pidfd_create(struct pid *pid, unsigned int flags)
  * @flags: flags to pass
  *
  * This creates a new pid file descriptor with the O_CLOEXEC flag set for
- * the process identified by @pid. Currently, the process identified by
- * @pid must be a thread-group leader. This restriction currently exists
- * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot
- * be used with CLONE_THREAD) and pidfd polling (only supports thread group
- * leaders).
+ * the task identified by @pid. Without PIDFD_THREAD flag the target task
+ * must be a thread-group leader.
  *
  * Return: On success, a cloexec pidfd is returned.
  *         On error, a negative errno number will be returned.
@@ -629,7 +621,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
 	int fd;
 	struct pid *p;
 
-	if (flags & ~PIDFD_NONBLOCK)
+	if (flags & ~(PIDFD_NONBLOCK | PIDFD_THREAD))
 		return -EINVAL;
 
 	if (pid <= 0)
diff --git a/kernel/signal.c b/kernel/signal.c
index 9561a3962ca687..9b40109f0c5648 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2019,7 +2019,7 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type)
 	return ret;
 }
 
-static void do_notify_pidfd(struct task_struct *task)
+void do_notify_pidfd(struct task_struct *task)
 {
 	struct pid *pid;
 
@@ -2051,7 +2051,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
 	WARN_ON_ONCE(!tsk->ptrace &&
 	       (tsk->group_leader != tsk || !thread_group_empty(tsk)));
 	/*
-	 * tsk is a group leader and has no threads, wake up the pidfd waiters.
+	 * tsk is a group leader and has no threads, wake up the
+	 * non-PIDFD_THREAD waiters.
 	 */
 	if (thread_group_empty(tsk))
 		do_notify_pidfd(tsk);
@@ -3926,6 +3927,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
 		prepare_kill_siginfo(sig, &kinfo);
 	}
 
+	/* TODO: respect PIDFD_THREAD */
 	ret = kill_pid_info(sig, &kinfo, pid);
 
 err:

From 2747e0ee57c2742dacf27920e815d87c6ab62643 Mon Sep 17 00:00:00 2001
From: Daniel Xu <dxu@dxuuu.xyz>
Date: Sun, 28 Jan 2024 18:24:07 -0700
Subject: [PATCH 652/707] bpf: btf: Add BTF_KFUNCS_START/END macro pair

This macro pair is functionally equivalent to BTF_SET8_START/END, except
with BTF_SET8_KFUNCS flag set in the btf_id_set8 flags field. The next
commit will codemod all kfunc set8s to this new variant such that all
kfuncs are tagged as such in .BTF_ids section.

Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
Link: https://lore.kernel.org/r/d536c57c7c2af428686853cc7396b7a44faa53b7.1706491398.git.dxu@dxuuu.xyz
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/btf_ids.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h
index dca09b7f21dc28..0fe4f1cd1918f9 100644
--- a/include/linux/btf_ids.h
+++ b/include/linux/btf_ids.h
@@ -8,6 +8,9 @@ struct btf_id_set {
 	u32 ids[];
 };
 
+/* This flag implies BTF_SET8 holds kfunc(s) */
+#define BTF_SET8_KFUNCS		(1 << 0)
+
 struct btf_id_set8 {
 	u32 cnt;
 	u32 flags;
@@ -204,6 +207,12 @@ asm(							\
 ".popsection;                                 \n");	\
 extern struct btf_id_set8 name;
 
+#define BTF_KFUNCS_START(name)				\
+__BTF_SET8_START(name, local, BTF_SET8_KFUNCS)
+
+#define BTF_KFUNCS_END(name)				\
+BTF_SET8_END(name)
+
 #else
 
 #define BTF_ID_LIST(name) static u32 __maybe_unused name[64];
@@ -218,6 +227,8 @@ extern struct btf_id_set8 name;
 #define BTF_SET_END(name)
 #define BTF_SET8_START(name) static struct btf_id_set8 __maybe_unused name = { 0 };
 #define BTF_SET8_END(name)
+#define BTF_KFUNCS_START(name) static struct btf_id_set8 __maybe_unused name = { 0 };
+#define BTF_KFUNCS_END(name)
 
 #endif /* CONFIG_DEBUG_INFO_BTF */
 

From 6e7769e6419f7836227c74da1f569961e5de0c0e Mon Sep 17 00:00:00 2001
From: Daniel Xu <dxu@dxuuu.xyz>
Date: Sun, 28 Jan 2024 18:24:08 -0700
Subject: [PATCH 653/707] bpf: treewide: Annotate BPF kfuncs in BTF

This commit marks kfuncs as such inside the .BTF_ids section. The upshot
of these annotations is that we'll be able to automatically generate
kfunc prototypes for downstream users. The process is as follows:

1. In source, use BTF_KFUNCS_START/END macro pair to mark kfuncs
2. During build, pahole injects into BTF a "bpf_kfunc" BTF_DECL_TAG for
   each function inside BTF_KFUNCS sets
3. At runtime, vmlinux or module BTF is made available in sysfs
4. At runtime, bpftool (or similar) can look at provided BTF and
   generate appropriate prototypes for functions with "bpf_kfunc" tag

To ensure future kfunc are similarly tagged, we now also return error
inside kfunc registration for untagged kfuncs. For vmlinux kfuncs,
we also WARN(), as initcall machinery does not handle errors.

Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
Acked-by: Benjamin Tissoires <bentiss@kernel.org>
Link: https://lore.kernel.org/r/e55150ceecbf0a5d961e608941165c0bee7bc943.1706491398.git.dxu@dxuuu.xyz
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 Documentation/bpf/kfuncs.rst                  |  8 ++++----
 drivers/hid/bpf/hid_bpf_dispatch.c            |  8 ++++----
 fs/verity/measure.c                           |  4 ++--
 kernel/bpf/btf.c                              |  8 ++++++++
 kernel/bpf/cpumask.c                          |  4 ++--
 kernel/bpf/helpers.c                          |  8 ++++----
 kernel/bpf/map_iter.c                         |  4 ++--
 kernel/cgroup/rstat.c                         |  4 ++--
 kernel/trace/bpf_trace.c                      |  8 ++++----
 net/bpf/test_run.c                            |  8 ++++----
 net/core/filter.c                             | 20 +++++++++----------
 net/core/xdp.c                                |  4 ++--
 net/ipv4/bpf_tcp_ca.c                         |  4 ++--
 net/ipv4/fou_bpf.c                            |  4 ++--
 net/ipv4/tcp_bbr.c                            |  4 ++--
 net/ipv4/tcp_cubic.c                          |  4 ++--
 net/ipv4/tcp_dctcp.c                          |  4 ++--
 net/netfilter/nf_conntrack_bpf.c              |  4 ++--
 net/netfilter/nf_nat_bpf.c                    |  4 ++--
 net/xfrm/xfrm_interface_bpf.c                 |  4 ++--
 net/xfrm/xfrm_state_bpf.c                     |  4 ++--
 .../selftests/bpf/bpf_testmod/bpf_testmod.c   |  8 ++++----
 22 files changed, 70 insertions(+), 62 deletions(-)

diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst
index 7985c6615f3c2f..a8f5782bd83318 100644
--- a/Documentation/bpf/kfuncs.rst
+++ b/Documentation/bpf/kfuncs.rst
@@ -177,10 +177,10 @@ In addition to kfuncs' arguments, verifier may need more information about the
 type of kfunc(s) being registered with the BPF subsystem. To do so, we define
 flags on a set of kfuncs as follows::
 
-        BTF_SET8_START(bpf_task_set)
+        BTF_KFUNCS_START(bpf_task_set)
         BTF_ID_FLAGS(func, bpf_get_task_pid, KF_ACQUIRE | KF_RET_NULL)
         BTF_ID_FLAGS(func, bpf_put_pid, KF_RELEASE)
-        BTF_SET8_END(bpf_task_set)
+        BTF_KFUNCS_END(bpf_task_set)
 
 This set encodes the BTF ID of each kfunc listed above, and encodes the flags
 along with it. Ofcourse, it is also allowed to specify no flags.
@@ -347,10 +347,10 @@ Once the kfunc is prepared for use, the final step to making it visible is
 registering it with the BPF subsystem. Registration is done per BPF program
 type. An example is shown below::
 
-        BTF_SET8_START(bpf_task_set)
+        BTF_KFUNCS_START(bpf_task_set)
         BTF_ID_FLAGS(func, bpf_get_task_pid, KF_ACQUIRE | KF_RET_NULL)
         BTF_ID_FLAGS(func, bpf_put_pid, KF_RELEASE)
-        BTF_SET8_END(bpf_task_set)
+        BTF_KFUNCS_END(bpf_task_set)
 
         static const struct btf_kfunc_id_set bpf_task_kfunc_set = {
                 .owner = THIS_MODULE,
diff --git a/drivers/hid/bpf/hid_bpf_dispatch.c b/drivers/hid/bpf/hid_bpf_dispatch.c
index d9ef45fcaeab13..02c441aaa21751 100644
--- a/drivers/hid/bpf/hid_bpf_dispatch.c
+++ b/drivers/hid/bpf/hid_bpf_dispatch.c
@@ -172,9 +172,9 @@ hid_bpf_get_data(struct hid_bpf_ctx *ctx, unsigned int offset, const size_t rdwr
  * The following set contains all functions we agree BPF programs
  * can use.
  */
-BTF_SET8_START(hid_bpf_kfunc_ids)
+BTF_KFUNCS_START(hid_bpf_kfunc_ids)
 BTF_ID_FLAGS(func, hid_bpf_get_data, KF_RET_NULL)
-BTF_SET8_END(hid_bpf_kfunc_ids)
+BTF_KFUNCS_END(hid_bpf_kfunc_ids)
 
 static const struct btf_kfunc_id_set hid_bpf_kfunc_set = {
 	.owner = THIS_MODULE,
@@ -440,12 +440,12 @@ static const struct btf_kfunc_id_set hid_bpf_fmodret_set = {
 };
 
 /* for syscall HID-BPF */
-BTF_SET8_START(hid_bpf_syscall_kfunc_ids)
+BTF_KFUNCS_START(hid_bpf_syscall_kfunc_ids)
 BTF_ID_FLAGS(func, hid_bpf_attach_prog)
 BTF_ID_FLAGS(func, hid_bpf_allocate_context, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, hid_bpf_release_context, KF_RELEASE)
 BTF_ID_FLAGS(func, hid_bpf_hw_request)
-BTF_SET8_END(hid_bpf_syscall_kfunc_ids)
+BTF_KFUNCS_END(hid_bpf_syscall_kfunc_ids)
 
 static const struct btf_kfunc_id_set hid_bpf_syscall_kfunc_set = {
 	.owner = THIS_MODULE,
diff --git a/fs/verity/measure.c b/fs/verity/measure.c
index bf7a5f4cccaf04..3969d54158d128 100644
--- a/fs/verity/measure.c
+++ b/fs/verity/measure.c
@@ -159,9 +159,9 @@ __bpf_kfunc int bpf_get_fsverity_digest(struct file *file, struct bpf_dynptr_ker
 
 __bpf_kfunc_end_defs();
 
-BTF_SET8_START(fsverity_set_ids)
+BTF_KFUNCS_START(fsverity_set_ids)
 BTF_ID_FLAGS(func, bpf_get_fsverity_digest, KF_TRUSTED_ARGS)
-BTF_SET8_END(fsverity_set_ids)
+BTF_KFUNCS_END(fsverity_set_ids)
 
 static int bpf_get_fsverity_digest_filter(const struct bpf_prog *prog, u32 kfunc_id)
 {
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index c8c6e6cf18e7f4..ef380e5469521b 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -8124,6 +8124,14 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
 {
 	enum btf_kfunc_hook hook;
 
+	/* All kfuncs need to be tagged as such in BTF.
+	 * WARN() for initcall registrations that do not check errors.
+	 */
+	if (!(kset->set->flags & BTF_SET8_KFUNCS)) {
+		WARN_ON(!kset->owner);
+		return -EINVAL;
+	}
+
 	hook = bpf_prog_type_to_kfunc_hook(prog_type);
 	return __register_btf_kfunc_id_set(hook, kset);
 }
diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
index 2e73533a3811cd..dad0fb1c8e876f 100644
--- a/kernel/bpf/cpumask.c
+++ b/kernel/bpf/cpumask.c
@@ -424,7 +424,7 @@ __bpf_kfunc u32 bpf_cpumask_weight(const struct cpumask *cpumask)
 
 __bpf_kfunc_end_defs();
 
-BTF_SET8_START(cpumask_kfunc_btf_ids)
+BTF_KFUNCS_START(cpumask_kfunc_btf_ids)
 BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE)
 BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
@@ -450,7 +450,7 @@ BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU)
 BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU)
 BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU)
 BTF_ID_FLAGS(func, bpf_cpumask_weight, KF_RCU)
-BTF_SET8_END(cpumask_kfunc_btf_ids)
+BTF_KFUNCS_END(cpumask_kfunc_btf_ids)
 
 static const struct btf_kfunc_id_set cpumask_kfunc_set = {
 	.owner = THIS_MODULE,
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index bcb951a2ecf4b9..4db1c658254c17 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2544,7 +2544,7 @@ __bpf_kfunc void bpf_throw(u64 cookie)
 
 __bpf_kfunc_end_defs();
 
-BTF_SET8_START(generic_btf_ids)
+BTF_KFUNCS_START(generic_btf_ids)
 #ifdef CONFIG_KEXEC_CORE
 BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
 #endif
@@ -2573,7 +2573,7 @@ BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
 #endif
 BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_throw)
-BTF_SET8_END(generic_btf_ids)
+BTF_KFUNCS_END(generic_btf_ids)
 
 static const struct btf_kfunc_id_set generic_kfunc_set = {
 	.owner = THIS_MODULE,
@@ -2589,7 +2589,7 @@ BTF_ID(struct, cgroup)
 BTF_ID(func, bpf_cgroup_release_dtor)
 #endif
 
-BTF_SET8_START(common_btf_ids)
+BTF_KFUNCS_START(common_btf_ids)
 BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx)
 BTF_ID_FLAGS(func, bpf_rdonly_cast)
 BTF_ID_FLAGS(func, bpf_rcu_read_lock)
@@ -2618,7 +2618,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_null)
 BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
 BTF_ID_FLAGS(func, bpf_dynptr_size)
 BTF_ID_FLAGS(func, bpf_dynptr_clone)
-BTF_SET8_END(common_btf_ids)
+BTF_KFUNCS_END(common_btf_ids)
 
 static const struct btf_kfunc_id_set common_kfunc_set = {
 	.owner = THIS_MODULE,
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
index 6abd7c5df4b39e..9575314f40a692 100644
--- a/kernel/bpf/map_iter.c
+++ b/kernel/bpf/map_iter.c
@@ -213,9 +213,9 @@ __bpf_kfunc s64 bpf_map_sum_elem_count(const struct bpf_map *map)
 
 __bpf_kfunc_end_defs();
 
-BTF_SET8_START(bpf_map_iter_kfunc_ids)
+BTF_KFUNCS_START(bpf_map_iter_kfunc_ids)
 BTF_ID_FLAGS(func, bpf_map_sum_elem_count, KF_TRUSTED_ARGS)
-BTF_SET8_END(bpf_map_iter_kfunc_ids)
+BTF_KFUNCS_END(bpf_map_iter_kfunc_ids)
 
 static const struct btf_kfunc_id_set bpf_map_iter_kfunc_set = {
 	.owner = THIS_MODULE,
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index a8350d2d63e6b1..07e2284bb49971 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -562,10 +562,10 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
 }
 
 /* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
-BTF_SET8_START(bpf_rstat_kfunc_ids)
+BTF_KFUNCS_START(bpf_rstat_kfunc_ids)
 BTF_ID_FLAGS(func, cgroup_rstat_updated)
 BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE)
-BTF_SET8_END(bpf_rstat_kfunc_ids)
+BTF_KFUNCS_END(bpf_rstat_kfunc_ids)
 
 static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = {
 	.owner          = THIS_MODULE,
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 64fdaf79d11365..241ddf5e38953e 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1412,14 +1412,14 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr,
 
 __bpf_kfunc_end_defs();
 
-BTF_SET8_START(key_sig_kfunc_set)
+BTF_KFUNCS_START(key_sig_kfunc_set)
 BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE)
 BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE)
 #ifdef CONFIG_SYSTEM_DATA_VERIFICATION
 BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE)
 #endif
-BTF_SET8_END(key_sig_kfunc_set)
+BTF_KFUNCS_END(key_sig_kfunc_set)
 
 static const struct btf_kfunc_id_set bpf_key_sig_kfunc_set = {
 	.owner = THIS_MODULE,
@@ -1475,9 +1475,9 @@ __bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str,
 
 __bpf_kfunc_end_defs();
 
-BTF_SET8_START(fs_kfunc_set_ids)
+BTF_KFUNCS_START(fs_kfunc_set_ids)
 BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
-BTF_SET8_END(fs_kfunc_set_ids)
+BTF_KFUNCS_END(fs_kfunc_set_ids)
 
 static int bpf_get_file_xattr_filter(const struct bpf_prog *prog, u32 kfunc_id)
 {
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index dfd91937401783..5535f9adc6589d 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -617,21 +617,21 @@ CFI_NOSEAL(bpf_kfunc_call_memb_release_dtor);
 
 __bpf_kfunc_end_defs();
 
-BTF_SET8_START(bpf_test_modify_return_ids)
+BTF_KFUNCS_START(bpf_test_modify_return_ids)
 BTF_ID_FLAGS(func, bpf_modify_return_test)
 BTF_ID_FLAGS(func, bpf_modify_return_test2)
 BTF_ID_FLAGS(func, bpf_fentry_test1, KF_SLEEPABLE)
-BTF_SET8_END(bpf_test_modify_return_ids)
+BTF_KFUNCS_END(bpf_test_modify_return_ids)
 
 static const struct btf_kfunc_id_set bpf_test_modify_return_set = {
 	.owner = THIS_MODULE,
 	.set   = &bpf_test_modify_return_ids,
 };
 
-BTF_SET8_START(test_sk_check_kfunc_ids)
+BTF_KFUNCS_START(test_sk_check_kfunc_ids)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_release, KF_RELEASE)
 BTF_ID_FLAGS(func, bpf_kfunc_call_memb_release, KF_RELEASE)
-BTF_SET8_END(test_sk_check_kfunc_ids)
+BTF_KFUNCS_END(test_sk_check_kfunc_ids)
 
 static void *bpf_test_init(const union bpf_attr *kattr, u32 user_size,
 			   u32 size, u32 headroom, u32 tailroom)
diff --git a/net/core/filter.c b/net/core/filter.c
index 358870408a51e6..524adf1fa6d019 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -11982,21 +11982,21 @@ int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags,
 	return 0;
 }
 
-BTF_SET8_START(bpf_kfunc_check_set_skb)
+BTF_KFUNCS_START(bpf_kfunc_check_set_skb)
 BTF_ID_FLAGS(func, bpf_dynptr_from_skb)
-BTF_SET8_END(bpf_kfunc_check_set_skb)
+BTF_KFUNCS_END(bpf_kfunc_check_set_skb)
 
-BTF_SET8_START(bpf_kfunc_check_set_xdp)
+BTF_KFUNCS_START(bpf_kfunc_check_set_xdp)
 BTF_ID_FLAGS(func, bpf_dynptr_from_xdp)
-BTF_SET8_END(bpf_kfunc_check_set_xdp)
+BTF_KFUNCS_END(bpf_kfunc_check_set_xdp)
 
-BTF_SET8_START(bpf_kfunc_check_set_sock_addr)
+BTF_KFUNCS_START(bpf_kfunc_check_set_sock_addr)
 BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path)
-BTF_SET8_END(bpf_kfunc_check_set_sock_addr)
+BTF_KFUNCS_END(bpf_kfunc_check_set_sock_addr)
 
-BTF_SET8_START(bpf_kfunc_check_set_tcp_reqsk)
+BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk)
 BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS)
-BTF_SET8_END(bpf_kfunc_check_set_tcp_reqsk)
+BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk)
 
 static const struct btf_kfunc_id_set bpf_kfunc_set_skb = {
 	.owner = THIS_MODULE,
@@ -12075,9 +12075,9 @@ __bpf_kfunc int bpf_sock_destroy(struct sock_common *sock)
 
 __bpf_kfunc_end_defs();
 
-BTF_SET8_START(bpf_sk_iter_kfunc_ids)
+BTF_KFUNCS_START(bpf_sk_iter_kfunc_ids)
 BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS)
-BTF_SET8_END(bpf_sk_iter_kfunc_ids)
+BTF_KFUNCS_END(bpf_sk_iter_kfunc_ids)
 
 static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id)
 {
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 4869c1c2d8f3d9..034fb80f3fbe9b 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -771,11 +771,11 @@ __bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx,
 
 __bpf_kfunc_end_defs();
 
-BTF_SET8_START(xdp_metadata_kfunc_ids)
+BTF_KFUNCS_START(xdp_metadata_kfunc_ids)
 #define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS)
 XDP_METADATA_KFUNC_xxx
 #undef XDP_METADATA_KFUNC
-BTF_SET8_END(xdp_metadata_kfunc_ids)
+BTF_KFUNCS_END(xdp_metadata_kfunc_ids)
 
 static const struct btf_kfunc_id_set xdp_metadata_kfunc_set = {
 	.owner = THIS_MODULE,
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 834edc18463ac1..7f518ea5f4ac7d 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -201,13 +201,13 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
 	}
 }
 
-BTF_SET8_START(bpf_tcp_ca_check_kfunc_ids)
+BTF_KFUNCS_START(bpf_tcp_ca_check_kfunc_ids)
 BTF_ID_FLAGS(func, tcp_reno_ssthresh)
 BTF_ID_FLAGS(func, tcp_reno_cong_avoid)
 BTF_ID_FLAGS(func, tcp_reno_undo_cwnd)
 BTF_ID_FLAGS(func, tcp_slow_start)
 BTF_ID_FLAGS(func, tcp_cong_avoid_ai)
-BTF_SET8_END(bpf_tcp_ca_check_kfunc_ids)
+BTF_KFUNCS_END(bpf_tcp_ca_check_kfunc_ids)
 
 static const struct btf_kfunc_id_set bpf_tcp_ca_kfunc_set = {
 	.owner = THIS_MODULE,
diff --git a/net/ipv4/fou_bpf.c b/net/ipv4/fou_bpf.c
index 4da03bf45c9b75..06e5572f296f1e 100644
--- a/net/ipv4/fou_bpf.c
+++ b/net/ipv4/fou_bpf.c
@@ -100,10 +100,10 @@ __bpf_kfunc int bpf_skb_get_fou_encap(struct __sk_buff *skb_ctx,
 
 __bpf_kfunc_end_defs();
 
-BTF_SET8_START(fou_kfunc_set)
+BTF_KFUNCS_START(fou_kfunc_set)
 BTF_ID_FLAGS(func, bpf_skb_set_fou_encap)
 BTF_ID_FLAGS(func, bpf_skb_get_fou_encap)
-BTF_SET8_END(fou_kfunc_set)
+BTF_KFUNCS_END(fou_kfunc_set)
 
 static const struct btf_kfunc_id_set fou_bpf_kfunc_set = {
 	.owner = THIS_MODULE,
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 22358032dd484b..05dc2d05bc7cbb 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -1155,7 +1155,7 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
 	.set_state	= bbr_set_state,
 };
 
-BTF_SET8_START(tcp_bbr_check_kfunc_ids)
+BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids)
 #ifdef CONFIG_X86
 #ifdef CONFIG_DYNAMIC_FTRACE
 BTF_ID_FLAGS(func, bbr_init)
@@ -1168,7 +1168,7 @@ BTF_ID_FLAGS(func, bbr_min_tso_segs)
 BTF_ID_FLAGS(func, bbr_set_state)
 #endif
 #endif
-BTF_SET8_END(tcp_bbr_check_kfunc_ids)
+BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids)
 
 static const struct btf_kfunc_id_set tcp_bbr_kfunc_set = {
 	.owner = THIS_MODULE,
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 0fd78ecb67e756..44869ea089e346 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -485,7 +485,7 @@ static struct tcp_congestion_ops cubictcp __read_mostly = {
 	.name		= "cubic",
 };
 
-BTF_SET8_START(tcp_cubic_check_kfunc_ids)
+BTF_KFUNCS_START(tcp_cubic_check_kfunc_ids)
 #ifdef CONFIG_X86
 #ifdef CONFIG_DYNAMIC_FTRACE
 BTF_ID_FLAGS(func, cubictcp_init)
@@ -496,7 +496,7 @@ BTF_ID_FLAGS(func, cubictcp_cwnd_event)
 BTF_ID_FLAGS(func, cubictcp_acked)
 #endif
 #endif
-BTF_SET8_END(tcp_cubic_check_kfunc_ids)
+BTF_KFUNCS_END(tcp_cubic_check_kfunc_ids)
 
 static const struct btf_kfunc_id_set tcp_cubic_kfunc_set = {
 	.owner = THIS_MODULE,
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index bb23bb5b387a0c..e33fbe4933e42f 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -260,7 +260,7 @@ static struct tcp_congestion_ops dctcp_reno __read_mostly = {
 	.name		= "dctcp-reno",
 };
 
-BTF_SET8_START(tcp_dctcp_check_kfunc_ids)
+BTF_KFUNCS_START(tcp_dctcp_check_kfunc_ids)
 #ifdef CONFIG_X86
 #ifdef CONFIG_DYNAMIC_FTRACE
 BTF_ID_FLAGS(func, dctcp_init)
@@ -271,7 +271,7 @@ BTF_ID_FLAGS(func, dctcp_cwnd_undo)
 BTF_ID_FLAGS(func, dctcp_state)
 #endif
 #endif
-BTF_SET8_END(tcp_dctcp_check_kfunc_ids)
+BTF_KFUNCS_END(tcp_dctcp_check_kfunc_ids)
 
 static const struct btf_kfunc_id_set tcp_dctcp_kfunc_set = {
 	.owner = THIS_MODULE,
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
index 475358ec821296..d2492d050fe601 100644
--- a/net/netfilter/nf_conntrack_bpf.c
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -467,7 +467,7 @@ __bpf_kfunc int bpf_ct_change_status(struct nf_conn *nfct, u32 status)
 
 __bpf_kfunc_end_defs();
 
-BTF_SET8_START(nf_ct_kfunc_set)
+BTF_KFUNCS_START(nf_ct_kfunc_set)
 BTF_ID_FLAGS(func, bpf_xdp_ct_alloc, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_xdp_ct_lookup, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_skb_ct_alloc, KF_ACQUIRE | KF_RET_NULL)
@@ -478,7 +478,7 @@ BTF_ID_FLAGS(func, bpf_ct_set_timeout, KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, bpf_ct_change_timeout, KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, bpf_ct_set_status, KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, bpf_ct_change_status, KF_TRUSTED_ARGS)
-BTF_SET8_END(nf_ct_kfunc_set)
+BTF_KFUNCS_END(nf_ct_kfunc_set)
 
 static const struct btf_kfunc_id_set nf_conntrack_kfunc_set = {
 	.owner = THIS_MODULE,
diff --git a/net/netfilter/nf_nat_bpf.c b/net/netfilter/nf_nat_bpf.c
index 6e3b2f58855fc0..481be15609b16a 100644
--- a/net/netfilter/nf_nat_bpf.c
+++ b/net/netfilter/nf_nat_bpf.c
@@ -54,9 +54,9 @@ __bpf_kfunc int bpf_ct_set_nat_info(struct nf_conn___init *nfct,
 
 __bpf_kfunc_end_defs();
 
-BTF_SET8_START(nf_nat_kfunc_set)
+BTF_KFUNCS_START(nf_nat_kfunc_set)
 BTF_ID_FLAGS(func, bpf_ct_set_nat_info, KF_TRUSTED_ARGS)
-BTF_SET8_END(nf_nat_kfunc_set)
+BTF_KFUNCS_END(nf_nat_kfunc_set)
 
 static const struct btf_kfunc_id_set nf_bpf_nat_kfunc_set = {
 	.owner = THIS_MODULE,
diff --git a/net/xfrm/xfrm_interface_bpf.c b/net/xfrm/xfrm_interface_bpf.c
index 7d5e920141e9b7..5ea15037ebd104 100644
--- a/net/xfrm/xfrm_interface_bpf.c
+++ b/net/xfrm/xfrm_interface_bpf.c
@@ -93,10 +93,10 @@ __bpf_kfunc int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx, const struct bp
 
 __bpf_kfunc_end_defs();
 
-BTF_SET8_START(xfrm_ifc_kfunc_set)
+BTF_KFUNCS_START(xfrm_ifc_kfunc_set)
 BTF_ID_FLAGS(func, bpf_skb_get_xfrm_info)
 BTF_ID_FLAGS(func, bpf_skb_set_xfrm_info)
-BTF_SET8_END(xfrm_ifc_kfunc_set)
+BTF_KFUNCS_END(xfrm_ifc_kfunc_set)
 
 static const struct btf_kfunc_id_set xfrm_interface_kfunc_set = {
 	.owner = THIS_MODULE,
diff --git a/net/xfrm/xfrm_state_bpf.c b/net/xfrm/xfrm_state_bpf.c
index 9e20d4a377f7eb..2248eda741f8e0 100644
--- a/net/xfrm/xfrm_state_bpf.c
+++ b/net/xfrm/xfrm_state_bpf.c
@@ -117,10 +117,10 @@ __bpf_kfunc void bpf_xdp_xfrm_state_release(struct xfrm_state *x)
 
 __bpf_kfunc_end_defs();
 
-BTF_SET8_START(xfrm_state_kfunc_set)
+BTF_KFUNCS_START(xfrm_state_kfunc_set)
 BTF_ID_FLAGS(func, bpf_xdp_get_xfrm_state, KF_RET_NULL | KF_ACQUIRE)
 BTF_ID_FLAGS(func, bpf_xdp_xfrm_state_release, KF_RELEASE)
-BTF_SET8_END(xfrm_state_kfunc_set)
+BTF_KFUNCS_END(xfrm_state_kfunc_set)
 
 static const struct btf_kfunc_id_set xfrm_state_xdp_kfunc_set = {
 	.owner = THIS_MODULE,
diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
index 6f163a0f1c94cc..4754c662b39ff8 100644
--- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
@@ -343,12 +343,12 @@ static struct bin_attribute bin_attr_bpf_testmod_file __ro_after_init = {
 	.write = bpf_testmod_test_write,
 };
 
-BTF_SET8_START(bpf_testmod_common_kfunc_ids)
+BTF_KFUNCS_START(bpf_testmod_common_kfunc_ids)
 BTF_ID_FLAGS(func, bpf_iter_testmod_seq_new, KF_ITER_NEW)
 BTF_ID_FLAGS(func, bpf_iter_testmod_seq_next, KF_ITER_NEXT | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_testmod_seq_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, bpf_kfunc_common_test)
-BTF_SET8_END(bpf_testmod_common_kfunc_ids)
+BTF_KFUNCS_END(bpf_testmod_common_kfunc_ids)
 
 static const struct btf_kfunc_id_set bpf_testmod_common_kfunc_set = {
 	.owner = THIS_MODULE,
@@ -494,7 +494,7 @@ __bpf_kfunc static u32 bpf_kfunc_call_test_static_unused_arg(u32 arg, u32 unused
 	return arg;
 }
 
-BTF_SET8_START(bpf_testmod_check_kfunc_ids)
+BTF_KFUNCS_START(bpf_testmod_check_kfunc_ids)
 BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test1)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test2)
@@ -520,7 +520,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS | KF_RCU)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_offset)
-BTF_SET8_END(bpf_testmod_check_kfunc_ids)
+BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids)
 
 static int bpf_testmod_ops_init(struct btf *btf)
 {

From 320d6fa71508ccfe5f9cd75561ab3d3919d9628c Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@linaro.org>
Date: Wed, 31 Jan 2024 10:10:18 +0300
Subject: [PATCH 654/707] smb: client: Fix a NULL vs IS_ERR() check in
 wsl_set_xattrs()

This was intended to be an IS_ERR() check.  The ea_create_context()
function doesn't return NULL.

Fixes: 1eab17fe485c ("smb: client: add support for WSL reparse points")
Reviewed-by: Paulo Alcantara <pc@manguebit.com>
Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
---
 fs/smb/client/reparse.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c
index 23e0cb62552ec7..84ff1683a40bcc 100644
--- a/fs/smb/client/reparse.c
+++ b/fs/smb/client/reparse.c
@@ -226,7 +226,7 @@ static int wsl_set_xattrs(struct inode *inode, umode_t _mode,
 	}
 
 	cc = ea_create_context(dlen, &cc_len);
-	if (!cc)
+	if (IS_ERR(cc))
 		return PTR_ERR(cc);
 
 	ea = &cc->ea;

From cf7f97cd68843d987cc7daca253f0bfbbbc535c1 Mon Sep 17 00:00:00 2001
From: Paulo Alcantara <pc@manguebit.com>
Date: Sun, 28 Jan 2024 01:12:01 -0300
Subject: [PATCH 655/707] smb: client: introduce SMB2_OP_QUERY_WSL_EA

Add a new command to smb2_compound_op() for querying WSL extended
attributes from reparse points.

Signed-off-by: Paulo Alcantara <pc@manguebit.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
---
 fs/smb/client/cifsglob.h  |   5 ++
 fs/smb/client/reparse.c   |  10 +--
 fs/smb/client/smb2glob.h  |   3 +-
 fs/smb/client/smb2inode.c | 170 +++++++++++++++++++++++++++++++++-----
 fs/smb/client/smb2pdu.h   |  25 ++++++
 fs/smb/client/trace.h     |   2 +
 6 files changed, 190 insertions(+), 25 deletions(-)

diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 90bc90ed97ef66..decf80131bbeb4 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -214,6 +214,10 @@ struct cifs_open_info_data {
 			struct reparse_posix_data *posix;
 		};
 	} reparse;
+	struct {
+		__u8		eas[SMB2_WSL_MAX_QUERY_EA_RESP_SIZE];
+		unsigned int	eas_len;
+	} wsl;
 	char *symlink_target;
 	struct cifs_sid posix_owner;
 	struct cifs_sid posix_group;
@@ -2292,6 +2296,7 @@ struct smb2_compound_vars {
 	struct kvec close_iov;
 	struct smb2_file_rename_info rename_info;
 	struct smb2_file_link_info link_info;
+	struct kvec ea_iov;
 };
 
 #endif	/* _CIFS_GLOB_H */
diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c
index 84ff1683a40bcc..5f6674d8e098bd 100644
--- a/fs/smb/client/reparse.c
+++ b/fs/smb/client/reparse.c
@@ -201,15 +201,15 @@ static int wsl_set_xattrs(struct inode *inode, umode_t _mode,
 	__le64 dev = cpu_to_le64(((u64)MINOR(_dev) << 32) | MAJOR(_dev));
 	__le64 mode = cpu_to_le64(_mode);
 	struct wsl_xattr xattrs[] = {
-		{ .name = "$LXUID", .value = uid, .size = 4, },
-		{ .name = "$LXGID", .value = gid, .size = 4, },
-		{ .name = "$LXMOD", .value = mode, .size = 4, },
-		{ .name = "$LXDEV", .value = dev, .size = 8, },
+		{ .name = SMB2_WSL_XATTR_UID,  .value = uid,  .size = SMB2_WSL_XATTR_UID_SIZE, },
+		{ .name = SMB2_WSL_XATTR_GID,  .value = gid,  .size = SMB2_WSL_XATTR_GID_SIZE, },
+		{ .name = SMB2_WSL_XATTR_MODE, .value = mode, .size = SMB2_WSL_XATTR_MODE_SIZE, },
+		{ .name = SMB2_WSL_XATTR_DEV,  .value = dev, .size = SMB2_WSL_XATTR_DEV_SIZE, },
 	};
 	size_t cc_len;
 	u32 dlen = 0, next = 0;
 	int i, num_xattrs;
-	u8 name_size = strlen(xattrs[0].name) + 1;
+	u8 name_size = SMB2_WSL_XATTR_NAME_LEN + 1;
 
 	memset(iov, 0, sizeof(*iov));
 
diff --git a/fs/smb/client/smb2glob.h b/fs/smb/client/smb2glob.h
index a0c156996fc51e..2466e61551369c 100644
--- a/fs/smb/client/smb2glob.h
+++ b/fs/smb/client/smb2glob.h
@@ -36,7 +36,8 @@ enum smb2_compound_ops {
 	SMB2_OP_RMDIR,
 	SMB2_OP_POSIX_QUERY_INFO,
 	SMB2_OP_SET_REPARSE,
-	SMB2_OP_GET_REPARSE
+	SMB2_OP_GET_REPARSE,
+	SMB2_OP_QUERY_WSL_EA,
 };
 
 /* Used when constructing chained read requests. */
diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c
index e9955b964ea470..63485078a6df6c 100644
--- a/fs/smb/client/smb2inode.c
+++ b/fs/smb/client/smb2inode.c
@@ -85,6 +85,82 @@ static int parse_posix_sids(struct cifs_open_info_data *data,
 	return 0;
 }
 
+struct wsl_query_ea {
+	__le32	next;
+	__u8	name_len;
+	__u8	name[SMB2_WSL_XATTR_NAME_LEN + 1];
+} __packed;
+
+#define NEXT_OFF cpu_to_le32(sizeof(struct wsl_query_ea))
+
+static const struct wsl_query_ea wsl_query_eas[] = {
+	{ .next = NEXT_OFF, .name_len = SMB2_WSL_XATTR_NAME_LEN, .name = SMB2_WSL_XATTR_UID, },
+	{ .next = NEXT_OFF, .name_len = SMB2_WSL_XATTR_NAME_LEN, .name = SMB2_WSL_XATTR_GID, },
+	{ .next = NEXT_OFF, .name_len = SMB2_WSL_XATTR_NAME_LEN, .name = SMB2_WSL_XATTR_MODE, },
+	{ .next = 0,        .name_len = SMB2_WSL_XATTR_NAME_LEN, .name = SMB2_WSL_XATTR_DEV, },
+};
+
+static int check_wsl_eas(struct kvec *rsp_iov)
+{
+	struct smb2_file_full_ea_info *ea;
+	struct smb2_query_info_rsp *rsp = rsp_iov->iov_base;
+	unsigned long addr;
+	u32 outlen, next;
+	u8 nlen;
+	u16 vlen;
+	u8 *end;
+
+	outlen = le32_to_cpu(rsp->OutputBufferLength);
+	if (outlen < SMB2_WSL_MIN_QUERY_EA_RESP_SIZE ||
+	    outlen > SMB2_WSL_MAX_QUERY_EA_RESP_SIZE)
+		return -EINVAL;
+
+	ea = (void *)((u8 *)rsp_iov->iov_base +
+		      le16_to_cpu(rsp->OutputBufferOffset));
+	end = (u8 *)rsp_iov->iov_base + rsp_iov->iov_len;
+	for (;;) {
+		if ((u8 *)ea > end - sizeof(*ea))
+			return -EINVAL;
+
+		nlen = ea->ea_name_length;
+		vlen = le16_to_cpu(ea->ea_value_length);
+		if (nlen != SMB2_WSL_XATTR_NAME_LEN ||
+		    (u8 *)ea + nlen + 1 + vlen > end)
+			return -EINVAL;
+
+		switch (vlen) {
+		case 4:
+			if (strncmp(ea->ea_data, SMB2_WSL_XATTR_UID, nlen) &&
+			    strncmp(ea->ea_data, SMB2_WSL_XATTR_GID, nlen) &&
+			    strncmp(ea->ea_data, SMB2_WSL_XATTR_MODE, nlen))
+				return -EINVAL;
+			break;
+		case 8:
+			if (strncmp(ea->ea_data, SMB2_WSL_XATTR_DEV, nlen))
+				return -EINVAL;
+			break;
+		case 0:
+			if (!strncmp(ea->ea_data, SMB2_WSL_XATTR_UID, nlen) ||
+			    !strncmp(ea->ea_data, SMB2_WSL_XATTR_GID, nlen) ||
+			    !strncmp(ea->ea_data, SMB2_WSL_XATTR_MODE, nlen) ||
+			    !strncmp(ea->ea_data, SMB2_WSL_XATTR_DEV, nlen))
+				break;
+			fallthrough;
+		default:
+			return -EINVAL;
+		}
+
+		next = le32_to_cpu(ea->next_entry_offset);
+		if (!next)
+			break;
+		if (!IS_ALIGNED(next, 4) ||
+		    check_add_overflow((unsigned long)ea, next, &addr))
+			return -EINVAL;
+		ea = (void *)addr;
+	}
+	return 0;
+}
+
 /*
  * note: If cfile is passed, the reference to it is dropped here.
  * So make sure that you do not reuse cfile after return from this func.
@@ -118,7 +194,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
 	__u8 delete_pending[8] = {1, 0, 0, 0, 0, 0, 0, 0};
 	unsigned int size[2];
 	void *data[2];
-	int len;
+	unsigned int len;
 	int retries = 0, cur_sleep = 1;
 
 replay_again:
@@ -455,6 +531,39 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
 			trace_smb3_get_reparse_compound_enter(xid, ses->Suid,
 							      tcon->tid, full_path);
 			break;
+		case SMB2_OP_QUERY_WSL_EA:
+			rqst[num_rqst].rq_iov = &vars->ea_iov;
+			rqst[num_rqst].rq_nvec = 1;
+
+			if (cfile) {
+				rc = SMB2_query_info_init(tcon, server,
+							  &rqst[num_rqst],
+							  cfile->fid.persistent_fid,
+							  cfile->fid.volatile_fid,
+							  FILE_FULL_EA_INFORMATION,
+							  SMB2_O_INFO_FILE, 0,
+							  SMB2_WSL_MAX_QUERY_EA_RESP_SIZE,
+							  sizeof(wsl_query_eas),
+							  (void *)wsl_query_eas);
+			} else {
+				rc = SMB2_query_info_init(tcon, server,
+							  &rqst[num_rqst],
+							  COMPOUND_FID,
+							  COMPOUND_FID,
+							  FILE_FULL_EA_INFORMATION,
+							  SMB2_O_INFO_FILE, 0,
+							  SMB2_WSL_MAX_QUERY_EA_RESP_SIZE,
+							  sizeof(wsl_query_eas),
+							  (void *)wsl_query_eas);
+			}
+			if (!rc && (!cfile || num_rqst > 1)) {
+				smb2_set_next_command(tcon, &rqst[num_rqst]);
+				smb2_set_related(&rqst[num_rqst]);
+			} else if (rc) {
+				goto finished;
+			}
+			num_rqst++;
+			break;
 		default:
 			cifs_dbg(VFS, "Invalid command\n");
 			rc = -EINVAL;
@@ -637,11 +746,32 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
 				memset(iov, 0, sizeof(*iov));
 				resp_buftype[i + 1] = CIFS_NO_BUFFER;
 			} else {
-				trace_smb3_set_reparse_compound_err(xid,  ses->Suid,
+				trace_smb3_set_reparse_compound_err(xid, ses->Suid,
 								    tcon->tid, rc);
 			}
 			SMB2_ioctl_free(&rqst[num_rqst++]);
 			break;
+		case SMB2_OP_QUERY_WSL_EA:
+			if (!rc) {
+				idata = in_iov[i].iov_base;
+				qi_rsp = rsp_iov[i + 1].iov_base;
+				data[0] = (u8 *)qi_rsp + le16_to_cpu(qi_rsp->OutputBufferOffset);
+				size[0] = le32_to_cpu(qi_rsp->OutputBufferLength);
+				rc = check_wsl_eas(&rsp_iov[i + 1]);
+				if (!rc) {
+					memcpy(idata->wsl.eas, data[0], size[0]);
+					idata->wsl.eas_len = size[0];
+				}
+			}
+			if (!rc) {
+				trace_smb3_query_wsl_ea_compound_done(xid, ses->Suid,
+								      tcon->tid);
+			} else {
+				trace_smb3_query_wsl_ea_compound_err(xid, ses->Suid,
+								     tcon->tid, rc);
+			}
+			SMB2_query_info_free(&rqst[num_rqst++]);
+			break;
 		}
 	}
 	SMB2_close_free(&rqst[num_rqst]);
@@ -709,11 +839,11 @@ int smb2_query_path_info(const unsigned int xid,
 	struct cifsFileInfo *cfile;
 	struct cached_fid *cfid = NULL;
 	struct smb2_hdr *hdr;
-	struct kvec in_iov[2], out_iov[3] = {};
+	struct kvec in_iov[3], out_iov[3] = {};
 	int out_buftype[3] = {};
-	int cmds[2];
+	int cmds[3];
 	bool islink;
-	int i, num_cmds;
+	int i, num_cmds = 0;
 	int rc, rc2;
 
 	data->adjust_tz = false;
@@ -746,21 +876,22 @@ int smb2_query_path_info(const unsigned int xid,
 			close_cached_dir(cfid);
 			return rc;
 		}
-		cmds[0] = SMB2_OP_QUERY_INFO;
+		cmds[num_cmds++] = SMB2_OP_QUERY_INFO;
 	} else {
-		cmds[0] = SMB2_OP_POSIX_QUERY_INFO;
+		cmds[num_cmds++] = SMB2_OP_POSIX_QUERY_INFO;
 	}
 
 	in_iov[0].iov_base = data;
 	in_iov[0].iov_len = sizeof(*data);
 	in_iov[1] = in_iov[0];
+	in_iov[2] = in_iov[0];
 
 	cifs_get_readable_path(tcon, full_path, &cfile);
 	oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, FILE_READ_ATTRIBUTES,
 			     FILE_OPEN, create_options, ACL_NO_MODE);
 	rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
-			      &oparms, in_iov, cmds, 1, cfile,
-			      out_iov, out_buftype);
+			      &oparms, in_iov, cmds, num_cmds,
+			      cfile, out_iov, out_buftype);
 	hdr = out_iov[0].iov_base;
 	/*
 	 * If first iov is unset, then SMB session was dropped or we've got a
@@ -780,17 +911,18 @@ int smb2_query_path_info(const unsigned int xid,
 		if (rc || !data->reparse_point)
 			goto out;
 
-		if (data->reparse.tag == IO_REPARSE_TAG_SYMLINK) {
-			/* symlink already parsed in create response */
-			num_cmds = 1;
-		} else {
-			cmds[1] = SMB2_OP_GET_REPARSE;
-			num_cmds = 2;
-		}
+		cmds[num_cmds++] = SMB2_OP_QUERY_WSL_EA;
+		/*
+		 * Skip SMB2_OP_GET_REPARSE if symlink already parsed in create
+		 * response.
+		 */
+		if (data->reparse.tag != IO_REPARSE_TAG_SYMLINK)
+			cmds[num_cmds++] = SMB2_OP_GET_REPARSE;
+
 		oparms = CIFS_OPARMS(cifs_sb, tcon, full_path,
-				     FILE_READ_ATTRIBUTES, FILE_OPEN,
-				     create_options | OPEN_REPARSE_POINT,
-				     ACL_NO_MODE);
+				     FILE_READ_ATTRIBUTES | FILE_READ_EA,
+				     FILE_OPEN, create_options |
+				     OPEN_REPARSE_POINT, ACL_NO_MODE);
 		cifs_get_readable_path(tcon, full_path, &cfile);
 		rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
 				      &oparms, in_iov, cmds, num_cmds,
diff --git a/fs/smb/client/smb2pdu.h b/fs/smb/client/smb2pdu.h
index ea63d33e455322..c72a3b2886b7ff 100644
--- a/fs/smb/client/smb2pdu.h
+++ b/fs/smb/client/smb2pdu.h
@@ -420,4 +420,29 @@ struct smb2_create_ea_ctx {
 	struct smb2_file_full_ea_info ea;
 } __packed;
 
+#define SMB2_WSL_XATTR_UID		"$LXUID"
+#define SMB2_WSL_XATTR_GID		"$LXGID"
+#define SMB2_WSL_XATTR_MODE		"$LXMOD"
+#define SMB2_WSL_XATTR_DEV		"$LXDEV"
+#define SMB2_WSL_XATTR_NAME_LEN	6
+#define SMB2_WSL_NUM_XATTRS		4
+
+#define SMB2_WSL_XATTR_UID_SIZE	4
+#define SMB2_WSL_XATTR_GID_SIZE	4
+#define SMB2_WSL_XATTR_MODE_SIZE	4
+#define SMB2_WSL_XATTR_DEV_SIZE	8
+
+#define SMB2_WSL_MIN_QUERY_EA_RESP_SIZE \
+	(ALIGN((SMB2_WSL_NUM_XATTRS - 1) * \
+	       (SMB2_WSL_XATTR_NAME_LEN + 1 + \
+		sizeof(struct smb2_file_full_ea_info)), 4) + \
+	 SMB2_WSL_XATTR_NAME_LEN + 1 + sizeof(struct smb2_file_full_ea_info))
+
+#define SMB2_WSL_MAX_QUERY_EA_RESP_SIZE \
+	(ALIGN(SMB2_WSL_MIN_QUERY_EA_RESP_SIZE + \
+	       SMB2_WSL_XATTR_UID_SIZE + \
+	       SMB2_WSL_XATTR_GID_SIZE + \
+	       SMB2_WSL_XATTR_MODE_SIZE + \
+	       SMB2_WSL_XATTR_DEV_SIZE, 4))
+
 #endif				/* _SMB2PDU_H */
diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h
index 522fa387fcfd72..ce90ae0d77f849 100644
--- a/fs/smb/client/trace.h
+++ b/fs/smb/client/trace.h
@@ -411,6 +411,7 @@ DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_eof_done);
 DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_info_compound_done);
 DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_reparse_compound_done);
 DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(get_reparse_compound_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_wsl_ea_compound_done);
 DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(delete_done);
 DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mkdir_done);
 DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(tdis_done);
@@ -456,6 +457,7 @@ DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_eof_err);
 DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_info_compound_err);
 DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_reparse_compound_err);
 DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(get_reparse_compound_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_wsl_ea_compound_err);
 DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mkdir_err);
 DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(delete_err);
 DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(tdis_err);

From 2417900a8dcec30415ba9318f02d4346a158c34b Mon Sep 17 00:00:00 2001
From: Paulo Alcantara <pc@manguebit.com>
Date: Sun, 28 Jan 2024 21:52:03 -0300
Subject: [PATCH 656/707] smb: client: parse uid, gid, mode and dev from WSL
 reparse points

Parse the extended attributes from WSL reparse points to correctly
report uid, gid mode and dev from ther instantiated inodes.

Signed-off-by: Paulo Alcantara <pc@manguebit.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
---
 fs/smb/client/inode.c   |  5 ++-
 fs/smb/client/readdir.c |  2 ++
 fs/smb/client/reparse.c | 78 +++++++++++++++++++++++++++++++++--------
 fs/smb/client/reparse.h | 29 +++++++++++++++
 4 files changed, 97 insertions(+), 17 deletions(-)

diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index 56d77ff8249d50..24489e1e238ad1 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -758,6 +758,8 @@ static void cifs_open_info_to_fattr(struct cifs_fattr *fattr,
 	fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
 	fattr->cf_createtime = le64_to_cpu(info->CreationTime);
 	fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
+	fattr->cf_uid = cifs_sb->ctx->linux_uid;
+	fattr->cf_gid = cifs_sb->ctx->linux_gid;
 
 	fattr->cf_mode = cifs_sb->ctx->file_mode;
 	if (cifs_open_data_reparse(data) &&
@@ -800,9 +802,6 @@ static void cifs_open_info_to_fattr(struct cifs_fattr *fattr,
 		fattr->cf_symlink_target = data->symlink_target;
 		data->symlink_target = NULL;
 	}
-
-	fattr->cf_uid = cifs_sb->ctx->linux_uid;
-	fattr->cf_gid = cifs_sb->ctx->linux_gid;
 }
 
 static int
diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c
index 7ef5a4b37901db..80508a1af1a4d9 100644
--- a/fs/smb/client/readdir.c
+++ b/fs/smb/client/readdir.c
@@ -125,6 +125,8 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
 					if (likely(reparse_inode_match(inode, fattr))) {
 						fattr->cf_mode = inode->i_mode;
 						fattr->cf_rdev = inode->i_rdev;
+						fattr->cf_uid = inode->i_uid;
+						fattr->cf_gid = inode->i_gid;
 						fattr->cf_eof = CIFS_I(inode)->netfs.remote_i_size;
 						fattr->cf_symlink_target = NULL;
 					} else {
diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c
index 5f6674d8e098bd..6ed0ea27327050 100644
--- a/fs/smb/client/reparse.c
+++ b/fs/smb/client/reparse.c
@@ -254,7 +254,9 @@ static int mknod_wsl(unsigned int xid, struct inode *inode,
 {
 	struct cifs_open_info_data data;
 	struct reparse_data_buffer buf;
+	struct smb2_create_ea_ctx *cc;
 	struct inode *new;
+	unsigned int len;
 	struct kvec reparse_iov, xattr_iov;
 	int rc;
 
@@ -271,6 +273,11 @@ static int mknod_wsl(unsigned int xid, struct inode *inode,
 		.reparse = { .tag = le32_to_cpu(buf.ReparseTag), .buf = &buf, },
 	};
 
+	cc = xattr_iov.iov_base;
+	len = le32_to_cpu(cc->ctx.DataLength);
+	memcpy(data.wsl.eas, &cc->ea, len);
+	data.wsl.eas_len = len;
+
 	new = smb2_get_reparse_inode(&data, inode->i_sb,
 				     xid, tcon, full_path,
 				     &reparse_iov, &xattr_iov);
@@ -404,6 +411,62 @@ int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb,
 	return parse_reparse_point(buf, plen, cifs_sb, true, data);
 }
 
+static void wsl_to_fattr(struct cifs_open_info_data *data,
+			 struct cifs_sb_info *cifs_sb,
+			 u32 tag, struct cifs_fattr *fattr)
+{
+	struct smb2_file_full_ea_info *ea;
+	u32 next = 0;
+
+	switch (tag) {
+	case IO_REPARSE_TAG_LX_SYMLINK:
+		fattr->cf_mode |= S_IFLNK;
+		break;
+	case IO_REPARSE_TAG_LX_FIFO:
+		fattr->cf_mode |= S_IFIFO;
+		break;
+	case IO_REPARSE_TAG_AF_UNIX:
+		fattr->cf_mode |= S_IFSOCK;
+		break;
+	case IO_REPARSE_TAG_LX_CHR:
+		fattr->cf_mode |= S_IFCHR;
+		break;
+	case IO_REPARSE_TAG_LX_BLK:
+		fattr->cf_mode |= S_IFBLK;
+		break;
+	}
+
+	if (!data->wsl.eas_len)
+		goto out;
+
+	ea = (struct smb2_file_full_ea_info *)data->wsl.eas;
+	do {
+		const char *name;
+		void *v;
+		u8 nlen;
+
+		ea = (void *)((u8 *)ea + next);
+		next = le32_to_cpu(ea->next_entry_offset);
+		if (!le16_to_cpu(ea->ea_value_length))
+			continue;
+
+		name = ea->ea_data;
+		nlen = ea->ea_name_length;
+		v = (void *)((u8 *)ea->ea_data + ea->ea_name_length + 1);
+
+		if (!strncmp(name, SMB2_WSL_XATTR_UID, nlen))
+			fattr->cf_uid = wsl_make_kuid(cifs_sb, v);
+		else if (!strncmp(name, SMB2_WSL_XATTR_GID, nlen))
+			fattr->cf_gid = wsl_make_kgid(cifs_sb, v);
+		else if (!strncmp(name, SMB2_WSL_XATTR_MODE, nlen))
+			fattr->cf_mode = (umode_t)le32_to_cpu(*(__le32 *)v);
+		else if (!strncmp(name, SMB2_WSL_XATTR_DEV, nlen))
+			fattr->cf_rdev = wsl_mkdev(v);
+	} while (next);
+out:
+	fattr->cf_dtype = S_DT(fattr->cf_mode);
+}
+
 bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
 				 struct cifs_fattr *fattr,
 				 struct cifs_open_info_data *data)
@@ -444,24 +507,11 @@ bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
 
 	switch (tag) {
 	case IO_REPARSE_TAG_LX_SYMLINK:
-		fattr->cf_mode |= S_IFLNK;
-		fattr->cf_dtype = DT_LNK;
-		break;
 	case IO_REPARSE_TAG_LX_FIFO:
-		fattr->cf_mode |= S_IFIFO;
-		fattr->cf_dtype = DT_FIFO;
-		break;
 	case IO_REPARSE_TAG_AF_UNIX:
-		fattr->cf_mode |= S_IFSOCK;
-		fattr->cf_dtype = DT_SOCK;
-		break;
 	case IO_REPARSE_TAG_LX_CHR:
-		fattr->cf_mode |= S_IFCHR;
-		fattr->cf_dtype = DT_CHR;
-		break;
 	case IO_REPARSE_TAG_LX_BLK:
-		fattr->cf_mode |= S_IFBLK;
-		fattr->cf_dtype = DT_BLK;
+		wsl_to_fattr(data, cifs_sb, tag, fattr);
 		break;
 	case 0: /* SMB1 symlink */
 	case IO_REPARSE_TAG_SYMLINK:
diff --git a/fs/smb/client/reparse.h b/fs/smb/client/reparse.h
index 9816bac9855257..6b55d1df9e2f84 100644
--- a/fs/smb/client/reparse.h
+++ b/fs/smb/client/reparse.h
@@ -8,6 +8,8 @@
 
 #include <linux/fs.h>
 #include <linux/stat.h>
+#include <linux/uidgid.h>
+#include "fs_context.h"
 #include "cifsglob.h"
 
 static inline dev_t reparse_nfs_mkdev(struct reparse_posix_data *buf)
@@ -17,6 +19,33 @@ static inline dev_t reparse_nfs_mkdev(struct reparse_posix_data *buf)
 	return MKDEV(v >> 32, v & 0xffffffff);
 }
 
+static inline dev_t wsl_mkdev(void *ptr)
+{
+	u64 v = le64_to_cpu(*(__le64 *)ptr);
+
+	return MKDEV(v & 0xffffffff, v >> 32);
+}
+
+static inline kuid_t wsl_make_kuid(struct cifs_sb_info *cifs_sb,
+				   void *ptr)
+{
+	u32 uid = le32_to_cpu(*(__le32 *)ptr);
+
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
+		return cifs_sb->ctx->linux_uid;
+	return make_kuid(current_user_ns(), uid);
+}
+
+static inline kgid_t wsl_make_kgid(struct cifs_sb_info *cifs_sb,
+				   void *ptr)
+{
+	u32 gid = le32_to_cpu(*(__le32 *)ptr);
+
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
+		return cifs_sb->ctx->linux_gid;
+	return make_kgid(current_user_ns(), gid);
+}
+
 static inline u64 reparse_mode_nfs_type(mode_t mode)
 {
 	switch (mode & S_IFMT) {

From 1b5af823d703ee183ffdde188aaf584ab93eea19 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 3 Jan 2024 11:26:49 +0100
Subject: [PATCH 657/707] soc/tegra: fix build failure on Tegra241

If all the other SoCs are disabled, the driver fails to build:

drivers/soc/tegra/fuse/fuse-tegra30.c:684:17: error: 'tegra30_fuse_read' undeclared here (not in a function); did you mean 'tegra_fuse_readl'?
  684 |         .read = tegra30_fuse_read,
      |                 ^~~~~~~~~~~~~~~~~
      |                 tegra_fuse_readl
drivers/soc/tegra/fuse/fuse-tegra30.c:694:17: error: 'tegra30_fuse_init' undeclared here (not in a function); did you mean 'tegra_fuse_info'?
  694 |         .init = tegra30_fuse_init,
      |                 ^~~~~~~~~~~~~~~~~

Fix the list of SoCs using this function to include the newly added one.

Fixes: dee509eb9cd5 ("soc/tegra: fuse: Add support for Tegra241")
Reviewed-by: Jon Hunter <jonathanh@nvidia.com>
Reviewed-by: Kartik <kkartik@nvidia.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 drivers/soc/tegra/fuse/fuse-tegra30.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/soc/tegra/fuse/fuse-tegra30.c b/drivers/soc/tegra/fuse/fuse-tegra30.c
index e94d46372a6396..92ac5693382637 100644
--- a/drivers/soc/tegra/fuse/fuse-tegra30.c
+++ b/drivers/soc/tegra/fuse/fuse-tegra30.c
@@ -38,7 +38,8 @@
     defined(CONFIG_ARCH_TEGRA_210_SOC) || \
     defined(CONFIG_ARCH_TEGRA_186_SOC) || \
     defined(CONFIG_ARCH_TEGRA_194_SOC) || \
-    defined(CONFIG_ARCH_TEGRA_234_SOC)
+    defined(CONFIG_ARCH_TEGRA_234_SOC) || \
+    defined(CONFIG_ARCH_TEGRA_241_SOC)
 static u32 tegra30_fuse_read_early(struct tegra_fuse *fuse, unsigned int offset)
 {
 	if (WARN_ON(!fuse->base))

From 4eacc39d5529cb0bb6bd9a6608658703d7af1e05 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Wed, 31 Jan 2024 21:57:27 +0100
Subject: [PATCH 658/707] dm-crypt, dm-verity: disable tasklets

Tasklets have an inherent problem with memory corruption. The function
tasklet_action_common calls tasklet_trylock, then it calls the tasklet
callback and then it calls tasklet_unlock. If the tasklet callback frees
the structure that contains the tasklet or if it calls some code that may
free it, tasklet_unlock will write into free memory.

The commits 8e14f610159d and d9a02e016aaf try to fix it for dm-crypt, but
it is not a sufficient fix and the data corruption can still happen [1].
There is no fix for dm-verity and dm-verity will write into free memory
with every tasklet-processed bio.

There will be atomic workqueues implemented in the kernel 6.9 [2]. They
will have better interface and they will not suffer from the memory
corruption problem.

But we need something that stops the memory corruption now and that can be
backported to the stable kernels. So, I'm proposing this commit that
disables tasklets in both dm-crypt and dm-verity. This commit doesn't
remove the tasklet support, because the tasklet code will be reused when
atomic workqueues will be implemented.

[1] https://lore.kernel.org/all/d390d7ee-f142-44d3-822a-87949e14608b@suse.de/T/
[2] https://lore.kernel.org/lkml/20240130091300.2968534-1-tj@kernel.org/

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Cc: stable@vger.kernel.org
Fixes: 39d42fa96ba1b ("dm crypt: add flags to optionally bypass kcryptd workqueues")
Fixes: 5721d4e5a9cdb ("dm verity: Add optional "try_verify_in_tasklet" feature")
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
 drivers/md/dm-crypt.c         | 38 ++---------------------------------
 drivers/md/dm-verity-target.c | 27 ++-----------------------
 2 files changed, 4 insertions(+), 61 deletions(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 855b482cbff1f0..f745f85082434d 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -73,10 +73,8 @@ struct dm_crypt_io {
 	struct bio *base_bio;
 	u8 *integrity_metadata;
 	bool integrity_metadata_from_pool:1;
-	bool in_tasklet:1;
 
 	struct work_struct work;
-	struct tasklet_struct tasklet;
 
 	struct convert_context ctx;
 
@@ -1762,7 +1760,6 @@ static void crypt_io_init(struct dm_crypt_io *io, struct crypt_config *cc,
 	io->ctx.r.req = NULL;
 	io->integrity_metadata = NULL;
 	io->integrity_metadata_from_pool = false;
-	io->in_tasklet = false;
 	atomic_set(&io->io_pending, 0);
 }
 
@@ -1771,13 +1768,6 @@ static void crypt_inc_pending(struct dm_crypt_io *io)
 	atomic_inc(&io->io_pending);
 }
 
-static void kcryptd_io_bio_endio(struct work_struct *work)
-{
-	struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
-
-	bio_endio(io->base_bio);
-}
-
 /*
  * One of the bios was finished. Check for completion of
  * the whole request and correctly clean up the buffer.
@@ -1801,20 +1791,6 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
 
 	base_bio->bi_status = error;
 
-	/*
-	 * If we are running this function from our tasklet,
-	 * we can't call bio_endio() here, because it will call
-	 * clone_endio() from dm.c, which in turn will
-	 * free the current struct dm_crypt_io structure with
-	 * our tasklet. In this case we need to delay bio_endio()
-	 * execution to after the tasklet is done and dequeued.
-	 */
-	if (io->in_tasklet) {
-		INIT_WORK(&io->work, kcryptd_io_bio_endio);
-		queue_work(cc->io_queue, &io->work);
-		return;
-	}
-
 	bio_endio(base_bio);
 }
 
@@ -2246,11 +2222,6 @@ static void kcryptd_crypt(struct work_struct *work)
 		kcryptd_crypt_write_convert(io);
 }
 
-static void kcryptd_crypt_tasklet(unsigned long work)
-{
-	kcryptd_crypt((struct work_struct *)work);
-}
-
 static void kcryptd_queue_crypt(struct dm_crypt_io *io)
 {
 	struct crypt_config *cc = io->cc;
@@ -2262,15 +2233,10 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io)
 		 * irqs_disabled(): the kernel may run some IO completion from the idle thread, but
 		 * it is being executed with irqs disabled.
 		 */
-		if (in_hardirq() || irqs_disabled()) {
-			io->in_tasklet = true;
-			tasklet_init(&io->tasklet, kcryptd_crypt_tasklet, (unsigned long)&io->work);
-			tasklet_schedule(&io->tasklet);
+		if (!(in_hardirq() || irqs_disabled())) {
+			kcryptd_crypt(&io->work);
 			return;
 		}
-
-		kcryptd_crypt(&io->work);
-		return;
 	}
 
 	INIT_WORK(&io->work, kcryptd_crypt);
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index 14e58ae705218f..a8c7dab1719524 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -21,7 +21,6 @@
 #include <linux/reboot.h>
 #include <linux/scatterlist.h>
 #include <linux/string.h>
-#include <linux/jump_label.h>
 
 #define DM_MSG_PREFIX			"verity"
 
@@ -645,23 +644,6 @@ static void verity_work(struct work_struct *w)
 	verity_finish_io(io, errno_to_blk_status(verity_verify_io(io)));
 }
 
-static void verity_tasklet(unsigned long data)
-{
-	struct dm_verity_io *io = (struct dm_verity_io *)data;
-	int err;
-
-	io->in_tasklet = true;
-	err = verity_verify_io(io);
-	if (err == -EAGAIN || err == -ENOMEM) {
-		/* fallback to retrying with work-queue */
-		INIT_WORK(&io->work, verity_work);
-		queue_work(io->v->verify_wq, &io->work);
-		return;
-	}
-
-	verity_finish_io(io, errno_to_blk_status(err));
-}
-
 static void verity_end_io(struct bio *bio)
 {
 	struct dm_verity_io *io = bio->bi_private;
@@ -674,13 +656,8 @@ static void verity_end_io(struct bio *bio)
 		return;
 	}
 
-	if (static_branch_unlikely(&use_tasklet_enabled) && io->v->use_tasklet) {
-		tasklet_init(&io->tasklet, verity_tasklet, (unsigned long)io);
-		tasklet_schedule(&io->tasklet);
-	} else {
-		INIT_WORK(&io->work, verity_work);
-		queue_work(io->v->verify_wq, &io->work);
-	}
+	INIT_WORK(&io->work, verity_work);
+	queue_work(io->v->verify_wq, &io->work);
 }
 
 /*

From c759bb3394ab095d87beca6f75e172ec87571a5c Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Thu, 1 Feb 2024 13:03:05 +1100
Subject: [PATCH 659/707] Revert "kasan: revert eviction of stack traces in
 generic mode"

This reverts commit 2ef8127ce56d76a4f861e7f0c40e241409c18087.
---
 mm/kasan/common.c     |  8 +++--
 mm/kasan/generic.c    | 68 ++++++++++++++++++++++++++++++++++++++-----
 mm/kasan/kasan.h      | 10 +++++++
 mm/kasan/quarantine.c |  5 +---
 4 files changed, 77 insertions(+), 14 deletions(-)

diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 6ca63e8dda741b..610efae9122094 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -65,7 +65,8 @@ void kasan_save_track(struct kasan_track *track, gfp_t flags)
 {
 	depot_stack_handle_t stack;
 
-	stack = kasan_save_stack(flags, STACK_DEPOT_FLAG_CAN_ALLOC);
+	stack = kasan_save_stack(flags,
+			STACK_DEPOT_FLAG_CAN_ALLOC | STACK_DEPOT_FLAG_GET);
 	kasan_set_track(track, stack);
 }
 
@@ -265,9 +266,10 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object,
 		return true;
 
 	/*
-	 * Note: Keep per-object metadata to allow KASAN print stack traces for
-	 * use-after-free-before-realloc bugs.
+	 * If the object is not put into quarantine, it will likely be quickly
+	 * reallocated. Thus, release its metadata now.
 	 */
+	kasan_release_object_meta(cache, object);
 
 	/* Let slab put the object onto the freelist. */
 	return false;
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index fc9cf1860efb34..df6627f62402c0 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -485,6 +485,16 @@ void kasan_init_object_meta(struct kmem_cache *cache, const void *object)
 	if (alloc_meta) {
 		/* Zero out alloc meta to mark it as invalid. */
 		__memset(alloc_meta, 0, sizeof(*alloc_meta));
+
+		/*
+		 * Prepare the lock for saving auxiliary stack traces.
+		 * Temporarily disable KASAN bug reporting to allow instrumented
+		 * raw_spin_lock_init to access aux_lock, which resides inside
+		 * of a redzone.
+		 */
+		kasan_disable_current();
+		raw_spin_lock_init(&alloc_meta->aux_lock);
+		kasan_enable_current();
 	}
 
 	/*
@@ -496,8 +506,18 @@ void kasan_init_object_meta(struct kmem_cache *cache, const void *object)
 
 static void release_alloc_meta(struct kasan_alloc_meta *meta)
 {
-	/* Zero out alloc meta to mark it as invalid. */
-	__memset(meta, 0, sizeof(*meta));
+	/* Evict the stack traces from stack depot. */
+	stack_depot_put(meta->alloc_track.stack);
+	stack_depot_put(meta->aux_stack[0]);
+	stack_depot_put(meta->aux_stack[1]);
+
+	/*
+	 * Zero out alloc meta to mark it as invalid but keep aux_lock
+	 * initialized to avoid having to reinitialize it when another object
+	 * is allocated in the same slot.
+	 */
+	__memset(&meta->alloc_track, 0, sizeof(meta->alloc_track));
+	__memset(meta->aux_stack, 0, sizeof(meta->aux_stack));
 }
 
 static void release_free_meta(const void *object, struct kasan_free_meta *meta)
@@ -506,10 +526,27 @@ static void release_free_meta(const void *object, struct kasan_free_meta *meta)
 	if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_SLAB_FREE_META)
 		return;
 
+	/* Evict the stack trace from the stack depot. */
+	stack_depot_put(meta->free_track.stack);
+
 	/* Mark free meta as invalid. */
 	*(u8 *)kasan_mem_to_shadow(object) = KASAN_SLAB_FREE;
 }
 
+void kasan_release_object_meta(struct kmem_cache *cache, const void *object)
+{
+	struct kasan_alloc_meta *alloc_meta;
+	struct kasan_free_meta *free_meta;
+
+	alloc_meta = kasan_get_alloc_meta(cache, object);
+	if (alloc_meta)
+		release_alloc_meta(alloc_meta);
+
+	free_meta = kasan_get_free_meta(cache, object);
+	if (free_meta)
+		release_free_meta(object, free_meta);
+}
+
 size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object)
 {
 	struct kasan_cache *info = &cache->kasan_info;
@@ -534,6 +571,8 @@ static void __kasan_record_aux_stack(void *addr, depot_flags_t depot_flags)
 	struct kmem_cache *cache;
 	struct kasan_alloc_meta *alloc_meta;
 	void *object;
+	depot_stack_handle_t new_handle, old_handle;
+	unsigned long flags;
 
 	if (is_kfence_address(addr) || !slab)
 		return;
@@ -544,18 +583,33 @@ static void __kasan_record_aux_stack(void *addr, depot_flags_t depot_flags)
 	if (!alloc_meta)
 		return;
 
+	new_handle = kasan_save_stack(0, depot_flags);
+
+	/*
+	 * Temporarily disable KASAN bug reporting to allow instrumented
+	 * spinlock functions to access aux_lock, which resides inside of a
+	 * redzone.
+	 */
+	kasan_disable_current();
+	raw_spin_lock_irqsave(&alloc_meta->aux_lock, flags);
+	old_handle = alloc_meta->aux_stack[1];
 	alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0];
-	alloc_meta->aux_stack[0] = kasan_save_stack(0, depot_flags);
+	alloc_meta->aux_stack[0] = new_handle;
+	raw_spin_unlock_irqrestore(&alloc_meta->aux_lock, flags);
+	kasan_enable_current();
+
+	stack_depot_put(old_handle);
 }
 
 void kasan_record_aux_stack(void *addr)
 {
-	return __kasan_record_aux_stack(addr, STACK_DEPOT_FLAG_CAN_ALLOC);
+	return __kasan_record_aux_stack(addr,
+			STACK_DEPOT_FLAG_CAN_ALLOC | STACK_DEPOT_FLAG_GET);
 }
 
 void kasan_record_aux_stack_noalloc(void *addr)
 {
-	return __kasan_record_aux_stack(addr, 0);
+	return __kasan_record_aux_stack(addr, STACK_DEPOT_FLAG_GET);
 }
 
 void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags)
@@ -566,7 +620,7 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags)
 	if (!alloc_meta)
 		return;
 
-	/* Invalidate previous stack traces (might exist for krealloc or mempool). */
+	/* Evict previous stack traces (might exist for krealloc or mempool). */
 	release_alloc_meta(alloc_meta);
 
 	kasan_save_track(&alloc_meta->alloc_track, flags);
@@ -580,7 +634,7 @@ void kasan_save_free_info(struct kmem_cache *cache, void *object)
 	if (!free_meta)
 		return;
 
-	/* Invalidate previous stack trace (might exist for mempool). */
+	/* Evict previous stack trace (might exist for mempool). */
 	release_free_meta(object, free_meta);
 
 	kasan_save_track(&free_meta->free_track, 0);
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index fb2b9ac0659a7a..d0f172f2b9783f 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -6,6 +6,7 @@
 #include <linux/kasan.h>
 #include <linux/kasan-tags.h>
 #include <linux/kfence.h>
+#include <linux/spinlock.h>
 #include <linux/stackdepot.h>
 
 #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
@@ -264,6 +265,13 @@ struct kasan_global {
 struct kasan_alloc_meta {
 	struct kasan_track alloc_track;
 	/* Free track is stored in kasan_free_meta. */
+	/*
+	 * aux_lock protects aux_stack from accesses from concurrent
+	 * kasan_record_aux_stack calls. It is a raw spinlock to avoid sleeping
+	 * on RT kernels, as kasan_record_aux_stack_noalloc can be called from
+	 * non-sleepable contexts.
+	 */
+	raw_spinlock_t aux_lock;
 	depot_stack_handle_t aux_stack[2];
 };
 
@@ -390,8 +398,10 @@ struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache,
 struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache,
 						const void *object);
 void kasan_init_object_meta(struct kmem_cache *cache, const void *object);
+void kasan_release_object_meta(struct kmem_cache *cache, const void *object);
 #else
 static inline void kasan_init_object_meta(struct kmem_cache *cache, const void *object) { }
+static inline void kasan_release_object_meta(struct kmem_cache *cache, const void *object) { }
 #endif
 
 depot_stack_handle_t kasan_save_stack(gfp_t flags, depot_flags_t depot_flags);
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index 6958aa713c67ee..3ba02efb952aac 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -145,10 +145,7 @@ static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache)
 	void *object = qlink_to_object(qlink, cache);
 	struct kasan_free_meta *free_meta = kasan_get_free_meta(cache, object);
 
-	/*
-	 * Note: Keep per-object metadata to allow KASAN print stack traces for
-	 * use-after-free-before-realloc bugs.
-	 */
+	kasan_release_object_meta(cache, object);
 
 	/*
 	 * If init_on_free is enabled and KASAN's free metadata is stored in

From d0c0d80c116244ac37890506bc2fbe1ef7e6212f Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Thu, 1 Feb 2024 13:03:16 +1100
Subject: [PATCH 660/707] Revert "stackdepot: use variable size records for
 non-evictable entries"

This reverts commit d869d3fb362c51a59c173fdee050dc100ff68383.
---
 include/linux/poison.h |   3 -
 lib/stackdepot.c       | 250 ++++++++++++++++++++---------------------
 2 files changed, 123 insertions(+), 130 deletions(-)

diff --git a/include/linux/poison.h b/include/linux/poison.h
index 1f0ee2459f2aa2..27a7dad17eefb8 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -92,7 +92,4 @@
 /********** VFS **********/
 #define VFS_PTR_POISON ((void *)(0xF5 + POISON_POINTER_DELTA))
 
-/********** lib/stackdepot.c **********/
-#define STACK_DEPOT_POISON ((void *)(0xD390 + POISON_POINTER_DELTA))
-
 #endif
diff --git a/lib/stackdepot.c b/lib/stackdepot.c
index 8f3b2c84ec2db3..5caa1f56655384 100644
--- a/lib/stackdepot.c
+++ b/lib/stackdepot.c
@@ -22,7 +22,6 @@
 #include <linux/list.h>
 #include <linux/mm.h>
 #include <linux/mutex.h>
-#include <linux/poison.h>
 #include <linux/printk.h>
 #include <linux/rculist.h>
 #include <linux/rcupdate.h>
@@ -44,7 +43,17 @@
 #define DEPOT_OFFSET_BITS (DEPOT_POOL_ORDER + PAGE_SHIFT - DEPOT_STACK_ALIGN)
 #define DEPOT_POOL_INDEX_BITS (DEPOT_HANDLE_BITS - DEPOT_OFFSET_BITS - \
 			       STACK_DEPOT_EXTRA_BITS)
+#if IS_ENABLED(CONFIG_KMSAN) && CONFIG_STACKDEPOT_MAX_FRAMES >= 32
+/*
+ * KMSAN is frequently used in fuzzing scenarios and thus saves a lot of stack
+ * traces. As KMSAN does not support evicting stack traces from the stack
+ * depot, the stack depot capacity might be reached quickly with large stack
+ * records. Adjust the maximum number of stack depot pools for this case.
+ */
+#define DEPOT_POOLS_CAP (8192 * (CONFIG_STACKDEPOT_MAX_FRAMES / 16))
+#else
 #define DEPOT_POOLS_CAP 8192
+#endif
 #define DEPOT_MAX_POOLS \
 	(((1LL << (DEPOT_POOL_INDEX_BITS)) < DEPOT_POOLS_CAP) ? \
 	 (1LL << (DEPOT_POOL_INDEX_BITS)) : DEPOT_POOLS_CAP)
@@ -84,6 +93,9 @@ struct stack_record {
 	};
 };
 
+#define DEPOT_STACK_RECORD_SIZE \
+	ALIGN(sizeof(struct stack_record), 1 << DEPOT_STACK_ALIGN)
+
 static bool stack_depot_disabled;
 static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT);
 static bool __stack_depot_early_init_passed __initdata;
@@ -109,31 +121,32 @@ static void *stack_pools[DEPOT_MAX_POOLS];
 static void *new_pool;
 /* Number of pools in stack_pools. */
 static int pools_num;
-/* Offset to the unused space in the currently used pool. */
-static size_t pool_offset = DEPOT_POOL_SIZE;
 /* Freelist of stack records within stack_pools. */
 static LIST_HEAD(free_stacks);
+/*
+ * Stack depot tries to keep an extra pool allocated even before it runs out
+ * of space in the currently used pool. This flag marks whether this extra pool
+ * needs to be allocated. It has the value 0 when either an extra pool is not
+ * yet allocated or if the limit on the number of pools is reached.
+ */
+static bool new_pool_required = true;
 /* The lock must be held when performing pool or freelist modifications. */
 static DEFINE_RAW_SPINLOCK(pool_lock);
 
 /* Statistics counters for debugfs. */
 enum depot_counter_id {
-	DEPOT_COUNTER_REFD_ALLOCS,
-	DEPOT_COUNTER_REFD_FREES,
-	DEPOT_COUNTER_REFD_INUSE,
+	DEPOT_COUNTER_ALLOCS,
+	DEPOT_COUNTER_FREES,
+	DEPOT_COUNTER_INUSE,
 	DEPOT_COUNTER_FREELIST_SIZE,
-	DEPOT_COUNTER_PERSIST_COUNT,
-	DEPOT_COUNTER_PERSIST_BYTES,
 	DEPOT_COUNTER_COUNT,
 };
 static long counters[DEPOT_COUNTER_COUNT];
 static const char *const counter_names[] = {
-	[DEPOT_COUNTER_REFD_ALLOCS]	= "refcounted_allocations",
-	[DEPOT_COUNTER_REFD_FREES]	= "refcounted_frees",
-	[DEPOT_COUNTER_REFD_INUSE]	= "refcounted_in_use",
+	[DEPOT_COUNTER_ALLOCS]		= "allocations",
+	[DEPOT_COUNTER_FREES]		= "frees",
+	[DEPOT_COUNTER_INUSE]		= "in_use",
 	[DEPOT_COUNTER_FREELIST_SIZE]	= "freelist_size",
-	[DEPOT_COUNTER_PERSIST_COUNT]	= "persistent_count",
-	[DEPOT_COUNTER_PERSIST_BYTES]	= "persistent_bytes",
 };
 static_assert(ARRAY_SIZE(counter_names) == DEPOT_COUNTER_COUNT);
 
@@ -281,52 +294,48 @@ int stack_depot_init(void)
 EXPORT_SYMBOL_GPL(stack_depot_init);
 
 /*
- * Initializes new stack pool, and updates the list of pools.
+ * Initializes new stack depot @pool, release all its entries to the freelist,
+ * and update the list of pools.
  */
-static bool depot_init_pool(void **prealloc)
+static void depot_init_pool(void *pool)
 {
+	int offset;
+
 	lockdep_assert_held(&pool_lock);
 
-	if (unlikely(pools_num >= DEPOT_MAX_POOLS)) {
-		/* Bail out if we reached the pool limit. */
-		WARN_ON_ONCE(pools_num > DEPOT_MAX_POOLS); /* should never happen */
-		WARN_ON_ONCE(!new_pool); /* to avoid unnecessary pre-allocation */
-		WARN_ONCE(1, "Stack depot reached limit capacity");
-		return false;
-	}
+	/* Initialize handles and link stack records into the freelist. */
+	for (offset = 0; offset <= DEPOT_POOL_SIZE - DEPOT_STACK_RECORD_SIZE;
+	     offset += DEPOT_STACK_RECORD_SIZE) {
+		struct stack_record *stack = pool + offset;
 
-	if (!new_pool && *prealloc) {
-		/* We have preallocated memory, use it. */
-		WRITE_ONCE(new_pool, *prealloc);
-		*prealloc = NULL;
-	}
+		stack->handle.pool_index = pools_num;
+		stack->handle.offset = offset >> DEPOT_STACK_ALIGN;
+		stack->handle.extra = 0;
 
-	if (!new_pool)
-		return false; /* new_pool and *prealloc are NULL */
+		/*
+		 * Stack traces of size 0 are never saved, and we can simply use
+		 * the size field as an indicator if this is a new unused stack
+		 * record in the freelist.
+		 */
+		stack->size = 0;
 
-	/* Save reference to the pool to be used by depot_fetch_stack(). */
-	stack_pools[pools_num] = new_pool;
+		INIT_LIST_HEAD(&stack->hash_list);
+		/*
+		 * Add to the freelist front to prioritize never-used entries:
+		 * required in case there are entries in the freelist, but their
+		 * RCU cookie still belongs to the current RCU grace period
+		 * (there can still be concurrent readers).
+		 */
+		list_add(&stack->free_list, &free_stacks);
+		counters[DEPOT_COUNTER_FREELIST_SIZE]++;
+	}
 
-	/*
-	 * Stack depot tries to keep an extra pool allocated even before it runs
-	 * out of space in the currently used pool.
-	 *
-	 * To indicate that a new preallocation is needed new_pool is reset to
-	 * NULL; do not reset to NULL if we have reached the maximum number of
-	 * pools.
-	 */
-	if (pools_num < DEPOT_MAX_POOLS)
-		WRITE_ONCE(new_pool, NULL);
-	else
-		WRITE_ONCE(new_pool, STACK_DEPOT_POISON);
+	/* Save reference to the pool to be used by depot_fetch_stack(). */
+	stack_pools[pools_num] = pool;
 
 	/* Pairs with concurrent READ_ONCE() in depot_fetch_stack(). */
 	WRITE_ONCE(pools_num, pools_num + 1);
 	ASSERT_EXCLUSIVE_WRITER(pools_num);
-
-	pool_offset = 0;
-
-	return true;
 }
 
 /* Keeps the preallocated memory to be used for a new stack depot pool. */
@@ -338,51 +347,63 @@ static void depot_keep_new_pool(void **prealloc)
 	 * If a new pool is already saved or the maximum number of
 	 * pools is reached, do not use the preallocated memory.
 	 */
-	if (new_pool)
+	if (!new_pool_required)
 		return;
 
-	WRITE_ONCE(new_pool, *prealloc);
-	*prealloc = NULL;
+	/*
+	 * Use the preallocated memory for the new pool
+	 * as long as we do not exceed the maximum number of pools.
+	 */
+	if (pools_num < DEPOT_MAX_POOLS) {
+		new_pool = *prealloc;
+		*prealloc = NULL;
+	}
+
+	/*
+	 * At this point, either a new pool is kept or the maximum
+	 * number of pools is reached. In either case, take note that
+	 * keeping another pool is not required.
+	 */
+	WRITE_ONCE(new_pool_required, false);
 }
 
 /*
- * Try to initialize a new stack record from the current pool, a cached pool, or
- * the current pre-allocation.
+ * Try to initialize a new stack depot pool from either a previous or the
+ * current pre-allocation, and release all its entries to the freelist.
  */
-static struct stack_record *depot_pop_free_pool(void **prealloc, size_t size)
+static bool depot_try_init_pool(void **prealloc)
 {
-	struct stack_record *stack;
-	void *current_pool;
-	u32 pool_index;
-
 	lockdep_assert_held(&pool_lock);
 
-	if (pool_offset + size > DEPOT_POOL_SIZE) {
-		if (!depot_init_pool(prealloc))
-			return NULL;
-	}
+	/* Check if we have a new pool saved and use it. */
+	if (new_pool) {
+		depot_init_pool(new_pool);
+		new_pool = NULL;
 
-	if (WARN_ON_ONCE(pools_num < 1))
-		return NULL;
-	pool_index = pools_num - 1;
-	current_pool = stack_pools[pool_index];
-	if (WARN_ON_ONCE(!current_pool))
-		return NULL;
+		/* Take note that we might need a new new_pool. */
+		if (pools_num < DEPOT_MAX_POOLS)
+			WRITE_ONCE(new_pool_required, true);
 
-	stack = current_pool + pool_offset;
+		return true;
+	}
 
-	/* Pre-initialize handle once. */
-	stack->handle.pool_index = pool_index;
-	stack->handle.offset = pool_offset >> DEPOT_STACK_ALIGN;
-	stack->handle.extra = 0;
-	INIT_LIST_HEAD(&stack->hash_list);
+	/* Bail out if we reached the pool limit. */
+	if (unlikely(pools_num >= DEPOT_MAX_POOLS)) {
+		WARN_ONCE(1, "Stack depot reached limit capacity");
+		return false;
+	}
 
-	pool_offset += size;
+	/* Check if we have preallocated memory and use it. */
+	if (*prealloc) {
+		depot_init_pool(*prealloc);
+		*prealloc = NULL;
+		return true;
+	}
 
-	return stack;
+	return false;
 }
 
-/* Try to find next free usable entry from the freelist. */
+/* Try to find next free usable entry. */
 static struct stack_record *depot_pop_free(void)
 {
 	struct stack_record *stack;
@@ -399,7 +420,7 @@ static struct stack_record *depot_pop_free(void)
 	 * check the first entry.
 	 */
 	stack = list_first_entry(&free_stacks, struct stack_record, free_list);
-	if (!poll_state_synchronize_rcu(stack->rcu_state))
+	if (stack->size && !poll_state_synchronize_rcu(stack->rcu_state))
 		return NULL;
 
 	list_del(&stack->free_list);
@@ -408,73 +429,48 @@ static struct stack_record *depot_pop_free(void)
 	return stack;
 }
 
-static inline size_t depot_stack_record_size(struct stack_record *s, unsigned int nr_entries)
-{
-	const size_t used = flex_array_size(s, entries, nr_entries);
-	const size_t unused = sizeof(s->entries) - used;
-
-	WARN_ON_ONCE(sizeof(s->entries) < used);
-
-	return ALIGN(sizeof(struct stack_record) - unused, 1 << DEPOT_STACK_ALIGN);
-}
-
 /* Allocates a new stack in a stack depot pool. */
 static struct stack_record *
-depot_alloc_stack(unsigned long *entries, int nr_entries, u32 hash, depot_flags_t flags, void **prealloc)
+depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc)
 {
-	struct stack_record *stack = NULL;
-	size_t record_size;
+	struct stack_record *stack;
 
 	lockdep_assert_held(&pool_lock);
 
 	/* This should already be checked by public API entry points. */
-	if (WARN_ON_ONCE(!nr_entries))
+	if (WARN_ON_ONCE(!size))
 		return NULL;
 
-	/* Limit number of saved frames to CONFIG_STACKDEPOT_MAX_FRAMES. */
-	if (nr_entries > CONFIG_STACKDEPOT_MAX_FRAMES)
-		nr_entries = CONFIG_STACKDEPOT_MAX_FRAMES;
-
-	if (flags & STACK_DEPOT_FLAG_GET) {
-		/*
-		 * Evictable entries have to allocate the max. size so they may
-		 * safely be re-used by differently sized allocations.
-		 */
-		record_size = depot_stack_record_size(stack, CONFIG_STACKDEPOT_MAX_FRAMES);
-		stack = depot_pop_free();
-	} else {
-		record_size = depot_stack_record_size(stack, nr_entries);
-	}
-
+	/* Check if we have a stack record to save the stack trace. */
+	stack = depot_pop_free();
 	if (!stack) {
-		stack = depot_pop_free_pool(prealloc, record_size);
-		if (!stack)
+		/* No usable entries on the freelist - try to refill the freelist. */
+		if (!depot_try_init_pool(prealloc))
+			return NULL;
+		stack = depot_pop_free();
+		if (WARN_ON(!stack))
 			return NULL;
 	}
 
+	/* Limit number of saved frames to CONFIG_STACKDEPOT_MAX_FRAMES. */
+	if (size > CONFIG_STACKDEPOT_MAX_FRAMES)
+		size = CONFIG_STACKDEPOT_MAX_FRAMES;
+
 	/* Save the stack trace. */
 	stack->hash = hash;
-	stack->size = nr_entries;
-	/* stack->handle is already filled in by depot_pop_free_pool(). */
-	memcpy(stack->entries, entries, flex_array_size(stack, entries, nr_entries));
-
-	if (flags & STACK_DEPOT_FLAG_GET) {
-		refcount_set(&stack->count, 1);
-		counters[DEPOT_COUNTER_REFD_ALLOCS]++;
-		counters[DEPOT_COUNTER_REFD_INUSE]++;
-	} else {
-		/* Warn on attempts to switch to refcounting this entry. */
-		refcount_set(&stack->count, REFCOUNT_SATURATED);
-		counters[DEPOT_COUNTER_PERSIST_COUNT]++;
-		counters[DEPOT_COUNTER_PERSIST_BYTES] += record_size;
-	}
+	stack->size = size;
+	/* stack->handle is already filled in by depot_init_pool(). */
+	refcount_set(&stack->count, 1);
+	memcpy(stack->entries, entries, flex_array_size(stack, entries, size));
 
 	/*
 	 * Let KMSAN know the stored stack record is initialized. This shall
 	 * prevent false positive reports if instrumented code accesses it.
 	 */
-	kmsan_unpoison_memory(stack, record_size);
+	kmsan_unpoison_memory(stack, DEPOT_STACK_RECORD_SIZE);
 
+	counters[DEPOT_COUNTER_ALLOCS]++;
+	counters[DEPOT_COUNTER_INUSE]++;
 	return stack;
 }
 
@@ -542,8 +538,8 @@ static void depot_free_stack(struct stack_record *stack)
 	list_add_tail(&stack->free_list, &free_stacks);
 
 	counters[DEPOT_COUNTER_FREELIST_SIZE]++;
-	counters[DEPOT_COUNTER_REFD_FREES]++;
-	counters[DEPOT_COUNTER_REFD_INUSE]--;
+	counters[DEPOT_COUNTER_FREES]++;
+	counters[DEPOT_COUNTER_INUSE]--;
 
 	printk_deferred_exit();
 	raw_spin_unlock_irqrestore(&pool_lock, flags);
@@ -664,7 +660,7 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
 	 * Allocate memory for a new pool if required now:
 	 * we won't be able to do that under the lock.
 	 */
-	if (unlikely(can_alloc && !READ_ONCE(new_pool))) {
+	if (unlikely(can_alloc && READ_ONCE(new_pool_required))) {
 		/*
 		 * Zero out zone modifiers, as we don't have specific zone
 		 * requirements. Keep the flags related to allocation in atomic
@@ -685,7 +681,7 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
 	found = find_stack(bucket, entries, nr_entries, hash, depot_flags);
 	if (!found) {
 		struct stack_record *new =
-			depot_alloc_stack(entries, nr_entries, hash, depot_flags, &prealloc);
+			depot_alloc_stack(entries, nr_entries, hash, &prealloc);
 
 		if (new) {
 			/*

From 51b70ff55ed88edd19b080a524063446bcc34b62 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Thu, 1 Feb 2024 14:29:51 +1100
Subject: [PATCH 661/707] Add linux-next specific files for 20240201

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
---
 Next/SHA1s        |  370 ++++
 Next/Trees        |  372 ++++
 Next/merge.log    | 5227 +++++++++++++++++++++++++++++++++++++++++++++
 localversion-next |    1 +
 4 files changed, 5970 insertions(+)
 create mode 100644 Next/SHA1s
 create mode 100644 Next/Trees
 create mode 100644 Next/merge.log
 create mode 100644 localversion-next

diff --git a/Next/SHA1s b/Next/SHA1s
new file mode 100644
index 00000000000000..ad17af9cebfd34
--- /dev/null
+++ b/Next/SHA1s
@@ -0,0 +1,370 @@
+Name		SHA1
+----		----
+origin		6764c317b6bb91bd806ef79adf6d9c0e428b191e
+fixes		2dde18cd1d8fac735875f2e4987f11817cc0bc2c
+mm-hotfixes	bbac2bacc15831e9f92dbf7deabe8192c2d8ea92
+kbuild-current	bfef491df67022c56aab3b831044f8d259f9441f
+arc-current	861deac3b092f37b2c5e6871732f3e11486f7082
+arm-current	f54e8634d1366926c807e2af6125b33cff555fa7
+arm64-fixes	c7767f5c43df2c453af4651d1f58f489e3eb4ac1
+arm-soc-fixes	1b5af823d703ee183ffdde188aaf584ab93eea19
+davinci-current	6613476e225e090cc9aad49be7fa504e290dd33d
+drivers-memory-fixes	6613476e225e090cc9aad49be7fa504e290dd33d
+sophgo-fixes	41bccc98fb7931d63d03f326a746ac4d429c1dd3
+tee-fixes	ceaa837f96adb69c0df0397937cd74991d5d821a
+m68k-current	6b9c045b0602cf64b33ea6da5e6aa6f81dd47ae8
+powerpc-fixes	18f14afe281648e31ed35c9ad2fcb724c4838ad9
+s390-fixes	6613476e225e090cc9aad49be7fa504e290dd33d
+fscrypt-current	4bcf6f827a79c59806c695dc280e763c5b6a6813
+fsverity-current	a075bacde257f755bea0e53400c9f1cdd1b8e8e6
+net		c9ec85153fea6873c52ed4f5055c87263f1b54f9
+bpf		577e4432f3ac810049cb7e6b71f4d96ec7c6e894
+ipsec		983a73da1f996faee9997149eb05b12fa7bd8cbf
+netfilter	a2933a8759a62269754e54733d993b19de870e84
+ipvs		a2933a8759a62269754e54733d993b19de870e84
+wireless	f3f8f050316893fe2da523458ff7f5f6d61fb1a6
+wpan		b85ea95d086471afb4ad062012a4d73cd328fa86
+rdma-fixes	80dde187f734cf9ccf988d5c2ef1a46b990660fd
+sound-current	2468e8922d2f6da81a6192b73023eff67e3fefdd
+sound-asoc-fixes	eeab239d6a2418fc5d2cd7ea76187085a97acde0
+regmap-fixes	8b921545ddc68c960f86699af906b6c6f361f16c
+regulator-fixes	a3fa9838e8140584a6f338e8516f2b05d3bea812
+spi-fixes	6500ad28fd5d67d5ca0fee9da73c463090842440
+pci-current	925bd5e08106bd5bfbd1cb8e124c89a0b4003569
+driver-core.current	98323e9d70172f1b46d1cadb20d6c54abf62870d
+tty.current	b35f8dbbce818b02c730dc85133dc7754266e084
+usb.current	f2e5d3de7e1fbf24483e7f996e519b3ebc3935a1
+usb-serial-fixes	b4a1f4eaf1d798066affc6ad040f76eb1a16e1c9
+phy		7104ba0f1958adb250319e68a15eff89ec4fd36d
+staging.current	6613476e225e090cc9aad49be7fa504e290dd33d
+iio-fixes	6f6c72acddf4357fcc83593c20ef9064fb42db92
+counter-current	6613476e225e090cc9aad49be7fa504e290dd33d
+char-misc.current	ac9762a74c7ca7cbfcb4c65f5871373653a046ac
+soundwire-fixes	6613476e225e090cc9aad49be7fa504e290dd33d
+thunderbolt-fixes	ec4d82f855ce332de26fe080892483de98cc1a19
+input-current	2b9c3eb32a699acdd4784d6b93743271b4970899
+crypto-current	c5a2f74db71a849f3a60bc153d684d6d28a0c665
+vfio-fixes	4ea95c04fa6b9043a1a301240996aeebe3cb28ec
+kselftest-fixes	b54761f6e9773350c0d1fb8e1e5aacaba7769d0f
+modules-fixes	f412eef03938d3a40d4f6f5a79d0f98ed89b596d
+dmaengine-fixes	a22fe1d6dec7e98535b97249fdc95c2be79120bb
+backlight-fixes	6613476e225e090cc9aad49be7fa504e290dd33d
+mtd-fixes	7c1b1906229db88c487e21e1ecb622db64a1830d
+mfd-fixes	6613476e225e090cc9aad49be7fa504e290dd33d
+v4l-dvb-fixes	b32431b753217d8d45b018443b1a7aac215921fb
+reset-fixes	4a6756f56bcf8e64c87144a626ce53aea4899c0e
+mips-fixes	59be5c35850171e307ca5d3d703ee9ff4096b948
+at91-fixes	6613476e225e090cc9aad49be7fa504e290dd33d
+omap-fixes	9b6a51aab5f5f9f71d2fa16e8b4d530e1643dfcb
+kvm-fixes	0dd3ee31125508cd67f7e7172247f05b7fd1753a
+kvms390-fixes	83303a4c776ce1032d88df59e811183479acea77
+hwmon-fixes	915644189c22d9c93e9fee7c7c993b58e745bef7
+nvdimm-fixes	33908660e814203e996f6e775d033c5c32fcf9a7
+cxl-fixes	6be99530c92c6b8ff7a01903edc42393575ad63b
+btrfs-fixes	c94bd41cb0b62737d2d05c9e2e9f7aa678046b86
+vfs-fixes	485053bb81c81a122edd982b263277e65d7485c5
+dma-mapping-fixes	d5090484b021794271280ab64d20253883b7f6fd
+drivers-x86-fixes	1abdf288b0ef5606f76b6e191fa6df05330e3d7e
+samsung-krzk-fixes	eab4f56d3e75dad697acf8dc2c8be3c341d6c63e
+pinctrl-samsung-fixes	6613476e225e090cc9aad49be7fa504e290dd33d
+devicetree-fixes	8f7e917907385e112a845d668ae2832f41e64bf5
+dt-krzk-fixes	6613476e225e090cc9aad49be7fa504e290dd33d
+scsi-fixes	f4469f3858352ad1197434557150b1f7086762a0
+drm-fixes	41bccc98fb7931d63d03f326a746ac4d429c1dd3
+drm-intel-fixes	41bccc98fb7931d63d03f326a746ac4d429c1dd3
+mmc-fixes	41bccc98fb7931d63d03f326a746ac4d429c1dd3
+rtc-fixes	08279468a294d8c996a657ecc9e51bd5c084c75d
+gnss-fixes	41bccc98fb7931d63d03f326a746ac4d429c1dd3
+hyperv-fixes	564eac2860bdbe6ac651e6909ac07ecd93d778f3
+soc-fsl-fixes	06c2afb862f9da8dc5efa4b6076a0e48c3fbaaa5
+risc-v-fixes	6613476e225e090cc9aad49be7fa504e290dd33d
+riscv-dt-fixes	6613476e225e090cc9aad49be7fa504e290dd33d
+riscv-soc-fixes	a9d022ae8c4faca73467f70ca4a880787d855f94
+fpga-fixes	6613476e225e090cc9aad49be7fa504e290dd33d
+spdx		6613476e225e090cc9aad49be7fa504e290dd33d
+gpio-brgl-fixes	41bccc98fb7931d63d03f326a746ac4d429c1dd3
+gpio-intel-fixes	6613476e225e090cc9aad49be7fa504e290dd33d
+pinctrl-intel-fixes	6613476e225e090cc9aad49be7fa504e290dd33d
+erofs-fixes	d9281660ff3ffb4a05302b485cc59a87e709aefc
+kunit-fixes	1a9f2c776d1416c4ea6cb0d0b9917778c41a1a7d
+ubifs-fixes	2241ab53cbb5cdb08a6b2d4688feb13971058f65
+memblock-fixes	6a9531c3a88096a26cf3ac582f7ec44f94a7dcb2
+nfsd-fixes	ccbca118ef1a71d5faa012b9bb1ecd784e9e2b42
+renesas-fixes	9eab43facdadb7d00456c2657001ae2e5353c814
+perf-current	fdd0ae72b34e56eb5e896d067c49a78ecb451032
+efi-fixes	aa0e784dea7c1a026aabff9db1cb5d2bd92b3e92
+zstd-fixes	77618db346455129424fadbbaec596a09feaf3bb
+battery-fixes	d0266d7ab1618482d58015d67a5220e590333298
+uml-fixes	73a23d7710331a530e972903318528b75e5a5f58
+iommufd-fixes	6613476e225e090cc9aad49be7fa504e290dd33d
+rust-fixes	6613476e225e090cc9aad49be7fa504e290dd33d
+v9fs-fixes	6613476e225e090cc9aad49be7fa504e290dd33d
+w1-fixes	6613476e225e090cc9aad49be7fa504e290dd33d
+pmdomain-fixes	c41336f4d69057cbf88fed47951379b384540df5
+overlayfs-fixes	420332b94119cdc7db4477cc88484691cb92ae71
+i2c-host-fixes	9189526c46f2ad14df25dbdc30443f79d03dc7bd
+drm-misc-fixes	1c1914d6e8c6edbf5b45047419ff51abdb1dce96
+mm-stable	6613476e225e090cc9aad49be7fa504e290dd33d
+mm-nonmm-stable	6613476e225e090cc9aad49be7fa504e290dd33d
+mm		ec2dacea82ce333584fa31384eb0b55eb2dbc604
+kbuild		bd768db42ef6d27c3eca1d29ec8beb4474e4ba35
+clang-format	5a205c6a9f79d14db38006aa2d7c4f4e76b1bfc7
+perf		7727d59de44e4568d0ad0f3867c8bdec69d688fe
+compiler-attributes	2993eb7a8d34aee6165e1f6676e81cdf1d22aa62
+dma-mapping	7c65aa3cc072cee76f577262fbe381a111a98774
+asm-generic	34b2321cc648a246d08cc51e423532eac690ccf1
+arc		0bb80ecc33a8fb5a682236443c1e740d5c917d1d
+arm		8790fade1a19caf714ba1d91ce1fdceb9f2067f2
+arm64		1b20d0486a602417defb5bf33320d31b2a7a47f8
+arm-perf	bb339db4d363c84e0a8d70827df591397ccd7312
+arm-soc		0d1d824a4ac102db35bc8524a8be97ada8ad37ab
+amlogic		0dd3ee31125508cd67f7e7172247f05b7fd1753a
+asahi-soc	ffc253263a1375a65fa6c9f62a893e9767fbebfa
+aspeed		e60f7a99d3789b5d0b24d3c0571b013309e56815
+at91		6613476e225e090cc9aad49be7fa504e290dd33d
+broadcom	bbf6d7dc2d94fe03a57b173a13f8fbf9123bb2a8
+davinci		6613476e225e090cc9aad49be7fa504e290dd33d
+drivers-memory	2f542c937c48c2bd5a8ddf180b417fbe7152559f
+imx-mxs		4db02d61a81ea4ab3bf9ec6878ccf3f2a3f27405
+mediatek	9802b60bd6d895a8be9c44cc5f74bb11131aa8bc
+mvebu		476887312c6082e2c03efc3f016e8134c076108e
+omap		0012c1958460386adc5770baf2f53206aed77ff3
+qcom		f70a1e7dd74f0bf5f58137a4379453873f64797a
+renesas		6fc5bb9da080f9f12f2dc13647a695846cb2f8f5
+reset		c3c46acd5be9a3351c163d2869045cab4d5342dc
+rockchip	a3c3232263626d7e6629cc4f134532a2b792c05c
+samsung-krzk	819ce8ab3d99371ad17662c22a8843d450ca237a
+scmi		99f798bdfb75a8e07f90462399930186c8392997
+sophgo		41bccc98fb7931d63d03f326a746ac4d429c1dd3
+stm32		bda732fda19365b7a7397d0d37090f6dc253232c
+sunxi		38ed19495066966979ba821b9e0f549ad5ea620d
+tee		84ec4fd8883176442d21454bcecd3c29c0aabad6
+tegra		5e6333ef8ea5ab004f8a24a8ebb0a3bc15b05586
+ti		6613476e225e090cc9aad49be7fa504e290dd33d
+xilinx		0ee74e0d7b9786976cc565ac08887469605346e7
+clk		efe5a1b888ab0f6acf723e2a12a4644a599294d0
+clk-imx		f52f00069888e410cec718792b3e314624f209ea
+clk-renesas	096311157d2a6bb8f06e28e1143e2a5de6a0183b
+csky		2c40c1c6adab90ee4660caf03722b3a3ec67767b
+loongarch	48ef9e87b407f89f230f804815af7ac2031ec17a
+m68k		6b9c045b0602cf64b33ea6da5e6aa6f81dd47ae8
+m68knommu	41bccc98fb7931d63d03f326a746ac4d429c1dd3
+microblaze	6613476e225e090cc9aad49be7fa504e290dd33d
+mips		6613476e225e090cc9aad49be7fa504e290dd33d
+openrisc	c289330331eb93bc6a3c68b9119ccd7d4285a4a2
+parisc-hd	913b9d443a0180cf0de3548f1ab3149378998486
+powerpc		44a1aad2fe6c10bfe0589d8047057b10a4c18a19
+soc-fsl		fb9c384625dd604e8a5be1f42b35e83104b90670
+risc-v		cb4ede926134a65bc3bf90ed58dace8451d7e759
+riscv-dt	2db68ddbf33a76b5913ca281660979b4c48f1df6
+riscv-soc	6613476e225e090cc9aad49be7fa504e290dd33d
+s390		8eb3db95a8c8ecd6f8bb082a99ded3bbc79b023f
+sh		6613476e225e090cc9aad49be7fa504e290dd33d
+uml		83aec96c631e0fa75cfe6d6a1b113a32151aaa88
+xtensa		a03cd7602a090eae277d2b79d43925661e7fbe9a
+bcachefs	6bb3f7f4c3f4da8e09de188f2f63e8f741bba3bd
+pidfd		a901a3568fd26ca9c4a82d8bc5ed5b3ed844d451
+fscrypt		c919330dd57835970b37676d377de3eaaea2c1e9
+afs		abcbd3bfbbfe97a8912d0c929d4aa18f50d9bc52
+btrfs		932ab07c383e655d4a353bb3b67c7164cad64eb3
+ceph		ded080c86b3f99683774af0441a58fc2e3d60cae
+cifs		2417900a8dcec30415ba9318f02d4346a158c34b
+configfs	4425c1d9b44ded655d2668e1ce95a62bccf7b21b
+ecryptfs	a3d78fe3e1ae8c6a1901635c54a1a799656f72c8
+erofs		aa12a790d31be14b289d5a2c6f41ca535fcc7841
+exfat		8b29fa18400ccb7fb681f105d74b2cabb59e5d62
+exportfs	42c3732fa8073717dd7d924472f1c0bc5b452fdc
+ext3		cd04011c5859b711e5030bf2e0fd14615be9ccfe
+ext4		68da4c44b994aea797eb9821acb3a4a36015293e
+f2fs		f31438c16879f0612fae83f02b11367c906a7d00
+fsverity	919dc320956ea353a7fb2d84265195ad5ef525ac
+fuse		3f29f1c336c0e8a4bec52f1e5217f88835553e5b
+gfs2		acd2d246f4b26460d0499bc4e0042f63380e526b
+jfs		e42e29cc442395d62f1a8963ec2dfb700ba6a5d7
+ksmbd		41bccc98fb7931d63d03f326a746ac4d429c1dd3
+nfs		052d534373b7ed33712a63d5e17b2b6cdbce84fd
+nfs-anna	57331a59ac0d680f606403eb24edd3c35aecba31
+nfsd		6c1c91f97746154611770e14f9be4d65a52f77c6
+ntfs3		622cd3daa8eae37359a6fd3c07c36d19f66606b5
+orangefs	31720a2b109b3080eb77e97b8f6f50a27b4ae599
+overlayfs	d17bb4620f90f81d8a8a45c3d025c679a1b5efcd
+ubifs		adbf4c4954e33e623897058a617c583d65a177f6
+v9fs		ff49bf1867578f23a5ffdd38f927f6e1e16796c4
+v9fs-ericvh	be57855f505003c5cafff40338d5d0f23b00ba4d
+xfs		881f78f472556ed05588172d5b5676b48dc48240
+zonefs		8812387d056957355ef1d026cd38bed3830649db
+iomap		3ac974796e5d94509b85a403449132ea660127c2
+djw-vfs		ce85a1e04645b1ed386b074297df27ab5b8801c0
+file-locks	e0152e7481c6c63764d6ea8ee41af5cf9dfac5e9
+iversion	e0152e7481c6c63764d6ea8ee41af5cf9dfac5e9
+vfs-brauner	de9861b0c277e1d86067255a2634e19a9a0e329b
+vfs		052d534373b7ed33712a63d5e17b2b6cdbce84fd
+printk		6c3a34e38436a2a3f7a1fa764c108ee19b05b893
+pci		95bf9132f8b48f8ca59eacd9b40afa5cce4feb53
+pstore		24a0b5e196cf70ccff97bc0add6fa7178ad50cc4
+hid		a54f72c74c2d6db65b3f3d3bcb9b6487c719d7fb
+i2c		41bccc98fb7931d63d03f326a746ac4d429c1dd3
+i2c-host	11f1357336cde9924da0b455e528f11fbd5011f4
+i3c		4fa0888f6f3e6a67cac5afafb23e33f8222cfdd0
+hwmon-staging	6120fec68e78eefdf7d89325668c082a90817c75
+jc_docs		5c7944ca7b13978744ec83e131aef9255fdbabbe
+v4l-dvb		6613476e225e090cc9aad49be7fa504e290dd33d
+v4l-dvb-next	04447d48afd365a837e23cde631517f166045b9d
+pm		7543bfcb6b1a6cecb3e1df9a1bf864d916e8b40b
+cpufreq-arm	eaffb10b51bf74415c9252fd8fb4dd77122501ee
+cpupower	0086ffec768bec6f5d61fc7e406af640eb912a24
+devfreq		aed5ed595960c6d301dcd4ed31aeaa7a8054c0c6
+pmdomain	90a7463fae9eb9f68a6e4ff3e8868beb8fbfc649
+opp		ace4b31b297dfd7b8c969ff5046c8128c3e025be
+thermal		5314b1543787e6cd5d248186fcfd5c5fc4ca2146
+dlm		5beebc1dda47719dac85830c53bca1a0ab497d96
+rdma		a400073ce3dd3dbdf843e6c9c0a0a7f6ca9f05d7
+net-next	e7f8df0e81bf73ab6dc6ac1dc01273fa06564119
+bpf-next	cd1c194ffe28820e0389299060b2cd425ce3ec44
+ipsec-next	ab1e1a38de240057ed108075abd1309c02e2a2a4
+mlx5-next	d727d27db536faea7178290c677cc0567f647231
+netfilter-next	5264ab612e28058536de8069bcf83eb20fd65c29
+ipvs-next	7ad269787b6615ca56bb161063331991fce51abf
+bluetooth	64692e12507b3efd71b4ff5596c9742d91f1ffe5
+wireless-next	3fbf61207c66ff7ac9b60ab76d4bfd239f97e973
+wpan-next	2373699560a754079579b7722b50d1d38de1960e
+wpan-staging	2373699560a754079579b7722b50d1d38de1960e
+mtd		98d4fda8f2d4bc3fb97958d2ef4c90e161a628f2
+nand		023e6aad7e5e7f2e086c399abd0675589c123728
+spi-nor		3c0e1dfa703cd2a16fbfb1290b0970b61add3cde
+crypto		4d314d27130b674a3687135fe94f44a40f107f76
+drm		41bccc98fb7931d63d03f326a746ac4d429c1dd3
+drm-ci		ad6bfe1b66a5c146ec236847eca7af4c8806d666
+drm-exynos	6613476e225e090cc9aad49be7fa504e290dd33d
+drm-misc	1f1626ac0428820f998245478610f452650bcab5
+amdgpu		9217b91c64587459362f211b0310e2bdaeb67719
+drm-intel	fe4c6ff50c68aa467f04c376fa3cf2a60e62c07d
+drm-tegra	2429b3c529da29d4277d519bd66d034842dcd70c
+drm-msm		d4ca26ac4be0d9aea7005c40df75e6775749671b
+drm-msm-lumag	d4ca26ac4be0d9aea7005c40df75e6775749671b
+etnaviv		c9959996a8fc171bbb2c2d9c7478306f331a6cca
+fbdev		72fee6b0a3a4ad6c5131d4c20e8ab7253b16e38b
+regmap		a1214cdfe92bd421a449f16d75d4dae2df36060b
+sound		8b87a7863fa57f87f5a63fb2dd69a4400593d92c
+ieee1394	dd754748f1bef240c38f987cabd70366b7e91474
+sound-asoc	e3468b7aab5cf18b86ee67e02b6e70c72e792165
+modules		3559ad395bf02f3dee576dc9acab4ce330ce57b5
+input		7d0f351da46098b3bbb147f886f059473b84ff48
+block		b48b5a7c9bc1da4e78105780ace46edd74832ab8
+device-mapper	4eacc39d5529cb0bb6bd9a6608658703d7af1e05
+libata		c8474c7273ac3bad718c33118aa82efb7b374f6e
+pcmcia		4f733de8b78a209501041a4b0a44c83ece0e8933
+mmc		4e99ffb173faaf38f010acb369bff57a20e9e531
+mfd		1e0ea9e75ff3f395ad6f85f0be2258ef114a53dc
+backlight	3b75d271e161e22aff8171940a77510d2fb2ad6f
+battery		4c5d387d79a65355b73e526cbf5754a9dcd5377b
+regulator	a2fc922ece40709ac81f7a9901dd84ecb3298740
+security	5a287d3d2b9de2b3e747132c615599907ba5c3c1
+apparmor	8ead196be219adade3bd0d4115cc9b8506643121
+integrity	1ed4b563100230ea68821a2b25a3d9f25388a3e6
+selinux		90593caf7db74da2300f7a7056a26ae000b3e7cd
+smack		f0816d4332c3f764cd42cf8124a193e17eeccba9
+tomoyo		0bb80ecc33a8fb5a682236443c1e740d5c917d1d
+tpmdd		610347effc2ecb5ededf5037e82240b151f883ab
+watchdog	41bccc98fb7931d63d03f326a746ac4d429c1dd3
+iommu		75f74f85a42eb294b657f847c33e1bb7921dbec9
+audit		aa13b709084a0287ef250a9fbde5993e4dfc3078
+devicetree	85f838adad5487f96ff3acf6d3eb8263a39a0757
+dt-krzk		8c82b4eef2972200f6171aaa260d7bba2ad29889
+mailbox		cd795fb0c352c1f70e5fa437b01572c8693e1b77
+spi		60fbb72e3018c87adc2e3e5ff743ce757c8b4e89
+tip		078b7b997b47c7166c1240cf1d39db9f646a56be
+clockevents	0076a37a426b6c850a0b41b814952760e4a70fcf
+edac		5f9d6dfd6c4a9efd221faf1925f382f9562a3840
+ftrace		4af12c95cbe888b71e905058c48e8d1e779264b5
+rcu		bc31e6cb27a9334140ff2f0a209d59b08bc0bc8c
+kvm		a9ef277488cfc1b7da88235dc11c338a14f34835
+kvm-arm		87bbb6a32237af19d248957a268b3536cfeca6ba
+kvms390		10f7b1dcdfe05efcd26e90e337daf1bfd8f4a6da
+kvm-ppc		180c6b072bf360b686e53d893d8dcf7dbbaec6bb
+kvm-riscv	4d0e8f9a361b3a1f7b67418c536b258323de734f
+kvm-x86		f0f3b810edda57f317d79f452056786257089667
+xen-tip		2d2db7d40254d5fb53b11ebd703cd1ed0c5de7a1
+percpu		2d9ad81ef93570bc0d4929d05d0601ea400d6fcf
+workqueues	15930da42f8981dc42c19038042947b475b19f47
+drivers-x86	3f399b5d7189bcb608c75abc85fe39f7a5509cfa
+chrome-platform	6613476e225e090cc9aad49be7fa504e290dd33d
+chrome-platform-firmware	6613476e225e090cc9aad49be7fa504e290dd33d
+hsi		fa72d143471d04ce3055d8dad9743b08c19e4060
+leds-lj		4289e434c46c8cbd32cf8b67fa7689b3d2ca4361
+ipmi		296455ade1fdcf5f8f8c033201633b60946c589a
+driver-core	f297a3844aa059c53be3f69be85ebc071b8a6d16
+usb		f1a27f081c1fa1eeebf38406e45f29636114470f
+thunderbolt	dec6a613574cd3dea799170b7aaa8fd76e22f176
+usb-serial	6613476e225e090cc9aad49be7fa504e290dd33d
+tty		fccc9d9233f918ee50cf2955ae7134a7f3418351
+char-misc	390b60f7638a8755beee22f83a5fbe54fdc9831d
+accel		dddb2e526a3650f3aa19c9ed315ae0f49c768810
+coresight	60e5f23dc5d68ec01e6dae8f4311230c7d2ccb8a
+fastrpc		6613476e225e090cc9aad49be7fa504e290dd33d
+fpga		6613476e225e090cc9aad49be7fa504e290dd33d
+icc		7158ba962f419c9e490e187b2a150f9ec5296bfe
+iio		a0295c1bd4a79461f291c9b0df0523cbbeb75560
+phy-next	25ee21fc97db6cb7f476464e4aa8616652b3be49
+soundwire	0707496ff4e416ea08c90053fd5fde5811b11b22
+extcon		7803680964c025f598f827b7ea7433467ef21a56
+gnss		41bccc98fb7931d63d03f326a746ac4d429c1dd3
+vfio		78f70c02bdbccb5e9b0b0c728185d4aeb7044ace
+w1		6613476e225e090cc9aad49be7fa504e290dd33d
+spmi		b85ea95d086471afb4ad062012a4d73cd328fa86
+staging		ce54e9342124ededf0a00ed4e8a8aee535bfbf00
+counter-next	0b3bbd8f9baf245ec77d86f6f5bc902105b4bfa9
+mux		44c026a73be8038f03dbdeef028b642880cf1511
+dmaengine	93bdff7bb83a9ea79f41d4e48e1711fd5f4ec4ed
+cgroup		8d4c171f451d384f3a287eb14bd60825d0b2381b
+scsi		890d900e7fec7f7956c26bd47b4f0f07a0a507b1
+scsi-mkp	3f90ac7138edb995b4312221647b58afcc15ec06
+vhost		f16d65124380ac6de8055c4a8e5373a1043bb09b
+rpmsg		99f59b148871dadb9104366e3d25b120a97f897b
+gpio		0bb80ecc33a8fb5a682236443c1e740d5c917d1d
+gpio-brgl	6933ba529d06afdd3faf5501855e410b46b77160
+gpio-intel	6613476e225e090cc9aad49be7fa504e290dd33d
+pinctrl		47eed1127d2af6fada49565efca4671a56e5839d
+pinctrl-intel	6613476e225e090cc9aad49be7fa504e290dd33d
+pinctrl-renesas	fea58424e2523376ece6f734479e63061e17ad7f
+pinctrl-samsung	6613476e225e090cc9aad49be7fa504e290dd33d
+pwm		979c6fe7e799d2cab0a99c4b8c41cc48f10aca0c
+ktest		7dc8e24f0e09834341f84d37433840b353d64bc8
+kselftest	6a71770442b5b7cf8f880ca1c0a72456c918c757
+kunit		6613476e225e090cc9aad49be7fa504e290dd33d
+kunit-next	6613476e225e090cc9aad49be7fa504e290dd33d
+livepatching	602bf18307981f3bfd9ebf19921791a4256d3fd1
+rtc		14688f1a91e1f37bc6bf50ff5241e857f24338e0
+nvdimm		a085a5eb6594a3ebe5c275e9c2c2d341f686c23c
+at24		6613476e225e090cc9aad49be7fa504e290dd33d
+ntb		9341b37ec17a8793e8439e9b18354ba69556b786
+seccomp		0c6f28a844311b8f81b4b117f586189c704d3f33
+fsi		c5eeb63edac9497f9a0d46d3b75cf8b293771ecf
+slimbus		04b945e4cf81a12365f8207a4d34dbc81ba17413
+nvmem		a0cfd5e997824d0bd8c7620d40cdb324121a2fc7
+xarray		2a15de80dd0f7e04a823291aa9eb49c5294f56af
+hyperv		ce9ecca0238b140b88f43859b211c9fdfd8e5b70
+auxdisplay	c52391fafcefe4c562bdac62088a2735c185b942
+kgdb		4f41d30cd6dc865c3cbc1a852372321eba6d4e4c
+hmm		6613476e225e090cc9aad49be7fa504e290dd33d
+cfi		06c2afb862f9da8dc5efa4b6076a0e48c3fbaaa5
+mhi		8ddf54a32111f6dbe06cd318af443c6545a6c037
+memblock	2159bd4e905704b1765b6b883ea15e51ad986a6a
+cxl		73bf93edeeea866b0b6efbc8d2595bdaaba7f1a5
+zstd		3f832dfb8a8eafee3cecd479d99651a64a61485a
+efi		4afa688d7141ae7a166d32224abbfd536acccfca
+unicode		367122c529f35b4655acbe33c0cc4d6d3b32ba71
+slab		7d2ec24bd8a59853c7660d3eac50a3b7ffce8ae3
+random		615d300648869c774bd1fe54b4627bb0c20faed4
+landlock	2f8bb71d737c25ca97a83e47b445837aa96cec77
+rust		f090f0d0eea9666a96702b29bc9a64cbabee85c5
+sysctl		6613476e225e090cc9aad49be7fa504e290dd33d
+execve		90383cc07895183c75a0db2460301c2ffd912359
+bitmap		071ad962baf5e857fd965595421cf6fb588610ed
+hte		b85ea95d086471afb4ad062012a4d73cd328fa86
+kspp		34b82a2fb7475aba5adfd3ae9f2c66da7f1979f7
+kspp-gustavo	6613476e225e090cc9aad49be7fa504e290dd33d
+nolibc		6613476e225e090cc9aad49be7fa504e290dd33d
+tsm		f4738f56d1dc62aaba69b33702a5ab098f1b8c63
+iommufd		6613476e225e090cc9aad49be7fa504e290dd33d
+header_cleanup	5f4c01f1e3c7b0c8d1e5dd6f080531de7aa5e47b
diff --git a/Next/Trees b/Next/Trees
new file mode 100644
index 00000000000000..f8b6a7c11afd7c
--- /dev/null
+++ b/Next/Trees
@@ -0,0 +1,372 @@
+Trees included into this release:
+
+Name		Type	Url
+----		----	---
+origin		git	git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git#master
+fixes		git	git://git.kernel.org/pub/scm/linux/kernel/git/sfr/next-fixes.git#fixes
+mm-hotfixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm#mm-hotfixes-unstable
+kbuild-current	git	git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild.git#fixes
+arc-current	git	git://git.kernel.org/pub/scm/linux/kernel/git/vgupta/arc.git#for-curr
+arm-current	git	git://git.armlinux.org.uk/~rmk/linux-arm.git#fixes
+arm64-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux#for-next/fixes
+arm-soc-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/soc/soc.git#arm/fixes
+davinci-current	git	git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git#davinci/for-current
+drivers-memory-fixes	git	https://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-mem-ctrl.git#fixes
+sophgo-fixes	git	https://github.com/sophgo/linux.git#fixes
+tee-fixes	git	https://git.linaro.org/people/jens.wiklander/linux-tee.git#fixes
+m68k-current	git	git://git.kernel.org/pub/scm/linux/kernel/git/geert/linux-m68k.git#for-linus
+powerpc-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git#fixes
+s390-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git#fixes
+fscrypt-current	git	git://git.kernel.org/pub/scm/fs/fscrypt/linux.git#for-current
+fsverity-current	git	git://git.kernel.org/pub/scm/fs/fsverity/linux.git#for-current
+net		git	git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git#main
+bpf		git	git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git#master
+ipsec		git	git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec.git#master
+netfilter	git	git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf.git#main
+ipvs		git	git://git.kernel.org/pub/scm/linux/kernel/git/horms/ipvs.git#main
+wireless	git	git://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless.git#for-next
+wpan		git	git://git.kernel.org/pub/scm/linux/kernel/git/wpan/wpan.git#master
+rdma-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git#for-rc
+sound-current	git	git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/sound.git#for-linus
+sound-asoc-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound.git#for-linus
+regmap-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regmap.git#for-linus
+regulator-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regulator.git#for-linus
+spi-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/broonie/spi.git#for-linus
+pci-current	git	git://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git#for-linus
+driver-core.current	git	git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/driver-core.git#driver-core-linus
+tty.current	git	git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/tty.git#tty-linus
+usb.current	git	git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git#usb-linus
+usb-serial-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/johan/usb-serial.git#usb-linus
+phy		git	git://git.kernel.org/pub/scm/linux/kernel/git/phy/linux-phy.git#fixes
+staging.current	git	git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/staging.git#staging-linus
+iio-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/jic23/iio.git#fixes-togreg
+counter-current	git	git://git.kernel.org/pub/scm/linux/kernel/git/wbg/counter.git#counter-current
+char-misc.current	git	git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git#char-misc-linus
+soundwire-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/vkoul/soundwire.git#fixes
+thunderbolt-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/westeri/thunderbolt.git#fixes
+input-current	git	git://git.kernel.org/pub/scm/linux/kernel/git/dtor/input.git#for-linus
+crypto-current	git	git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6.git#master
+vfio-fixes	git	git://github.com/awilliam/linux-vfio.git#for-linus
+kselftest-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git#fixes
+modules-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/mcgrof/linux.git#modules-linus
+dmaengine-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/vkoul/dmaengine.git#fixes
+backlight-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/lee/backlight.git#for-backlight-fixes
+mtd-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git#mtd/fixes
+mfd-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/lee/mfd.git#for-mfd-fixes
+v4l-dvb-fixes	git	https://git.linuxtv.org/media_stage.git#fixes
+reset-fixes	git	https://git.pengutronix.de/git/pza/linux#reset/fixes
+mips-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/mips/linux.git#mips-fixes
+at91-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/at91/linux.git#at91-fixes
+omap-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/tmlind/linux-omap.git#fixes
+kvm-fixes	git	git://git.kernel.org/pub/scm/virt/kvm/kvm.git#master
+kvms390-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git#master
+hwmon-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/groeck/linux-staging.git#hwmon
+nvdimm-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm.git#libnvdimm-fixes
+cxl-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl.git#fixes
+btrfs-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux.git#next-fixes
+vfs-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git#fixes
+dma-mapping-fixes	git	git://git.infradead.org/users/hch/dma-mapping.git#for-linus
+drivers-x86-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/pdx86/platform-drivers-x86.git#fixes
+samsung-krzk-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux.git#fixes
+pinctrl-samsung-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/pinctrl/samsung.git#fixes
+devicetree-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/robh/linux.git#dt/linus
+dt-krzk-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-dt.git#fixes
+scsi-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/mkp/scsi.git#fixes
+drm-fixes	git	git://git.freedesktop.org/git/drm/drm.git#drm-fixes
+drm-intel-fixes	git	git://anongit.freedesktop.org/drm-intel#for-linux-next-fixes
+mmc-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/ulfh/mmc.git#fixes
+rtc-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/abelloni/linux.git#rtc-fixes
+gnss-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/johan/gnss.git#gnss-linus
+hyperv-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux.git#hyperv-fixes
+soc-fsl-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/leo/linux.git#fix
+risc-v-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux.git#fixes
+riscv-dt-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git#riscv-dt-fixes
+riscv-soc-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git#riscv-soc-fixes
+fpga-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/fpga/linux-fpga.git#fixes
+spdx		git	git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/spdx.git#spdx-linus
+gpio-brgl-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git#gpio/for-current
+gpio-intel-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/andy/linux-gpio-intel.git#fixes
+pinctrl-intel-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/pinctrl/intel.git#fixes
+erofs-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git#fixes
+kunit-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git#kunit-fixes
+ubifs-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/rw/ubifs.git#fixes
+memblock-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/rppt/memblock.git#fixes
+nfsd-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux#nfsd-fixes
+renesas-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-devel.git#fixes
+perf-current	git	git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools#perf-tools
+efi-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/efi/efi.git#urgent
+zstd-fixes	git	https://github.com/terrelln/linux.git#zstd-linus
+battery-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/sre/linux-power-supply.git#fixes
+uml-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/uml/linux.git#fixes
+iommufd-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd.git#for-rc
+rust-fixes	git	https://github.com/Rust-for-Linux/linux.git#rust-fixes
+v9fs-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/ericvh/v9fs.git#fixes/next
+w1-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-w1.git#fixes
+pmdomain-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/ulfh/linux-pm.git#fixes
+overlayfs-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/overlayfs/vfs.git#ovl-fixes
+i2c-host-fixes	git	git://git.kernel.org/pub/scm/linux/kernel/git/andi.shyti/linux.git#i2c/i2c-host-fixes
+drm-misc-fixes	git	git://anongit.freedesktop.org/drm/drm-misc#for-linux-next-fixes
+mm-stable	git	git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm#mm-stable
+mm-nonmm-stable	git	git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm#mm-nonmm-stable
+mm		git	git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm#mm-everything
+kbuild		git	git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild.git#for-next
+clang-format	git	https://github.com/ojeda/linux.git#clang-format
+perf		git	git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git#perf-tools-next
+compiler-attributes	git	https://github.com/ojeda/linux.git#compiler-attributes
+dma-mapping	git	git://git.infradead.org/users/hch/dma-mapping.git#for-next
+asm-generic	git	git://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic.git#master
+arc		git	git://git.kernel.org/pub/scm/linux/kernel/git/vgupta/arc.git#for-next
+arm		git	git://git.armlinux.org.uk/~rmk/linux-arm.git#for-next
+arm64		git	git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux#for-next/core
+arm-perf	git	git://git.kernel.org/pub/scm/linux/kernel/git/will/linux.git#for-next/perf
+arm-soc		git	git://git.kernel.org/pub/scm/linux/kernel/git/soc/soc.git#for-next
+amlogic		git	git://git.kernel.org/pub/scm/linux/kernel/git/amlogic/linux.git#for-next
+asahi-soc	git	https://github.com/AsahiLinux/linux.git#asahi-soc/for-next
+aspeed		git	git://git.kernel.org/pub/scm/linux/kernel/git/joel/bmc.git#for-next
+at91		git	git://git.kernel.org/pub/scm/linux/kernel/git/at91/linux.git#at91-next
+broadcom	git	https://github.com/Broadcom/stblinux.git#next
+davinci		git	git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git#davinci/for-next
+drivers-memory	git	https://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-mem-ctrl.git#for-next
+imx-mxs		git	git://git.kernel.org/pub/scm/linux/kernel/git/shawnguo/linux.git#for-next
+mediatek	git	git://git.kernel.org/pub/scm/linux/kernel/git/matthias.bgg/linux.git#for-next
+mvebu		git	git://git.kernel.org/pub/scm/linux/kernel/git/gclement/mvebu.git#for-next
+omap		git	git://git.kernel.org/pub/scm/linux/kernel/git/tmlind/linux-omap.git#for-next
+qcom		git	git://git.kernel.org/pub/scm/linux/kernel/git/qcom/linux.git#for-next
+renesas		git	git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-devel.git#next
+reset		git	https://git.pengutronix.de/git/pza/linux#reset/next
+rockchip	git	git://git.kernel.org/pub/scm/linux/kernel/git/mmind/linux-rockchip.git#for-next
+samsung-krzk	git	git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux.git#for-next
+scmi		git	git://git.kernel.org/pub/scm/linux/kernel/git/sudeep.holla/linux.git#for-linux-next
+sophgo		git	https://github.com/sophgo/linux.git#for-next
+stm32		git	git://git.kernel.org/pub/scm/linux/kernel/git/atorgue/stm32.git#stm32-next
+sunxi		git	git://git.kernel.org/pub/scm/linux/kernel/git/sunxi/linux.git#sunxi/for-next
+tee		git	https://git.linaro.org/people/jens.wiklander/linux-tee.git#next
+tegra		git	git://git.kernel.org/pub/scm/linux/kernel/git/tegra/linux.git#for-next
+ti		git	git://git.kernel.org/pub/scm/linux/kernel/git/ti/linux.git#ti-next
+xilinx		git	git://github.com/Xilinx/linux-xlnx.git#for-next
+clk		git	git://git.kernel.org/pub/scm/linux/kernel/git/clk/linux.git#clk-next
+clk-imx		git	git://git.kernel.org/pub/scm/linux/kernel/git/abelvesa/linux.git#for-next
+clk-renesas	git	git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-drivers.git#renesas-clk
+csky		git	git://github.com/c-sky/csky-linux.git#linux-next
+loongarch	git	git://git.kernel.org/pub/scm/linux/kernel/git/chenhuacai/linux-loongson.git#loongarch-next
+m68k		git	git://git.kernel.org/pub/scm/linux/kernel/git/geert/linux-m68k.git#for-next
+m68knommu	git	git://git.kernel.org/pub/scm/linux/kernel/git/gerg/m68knommu.git#for-next
+microblaze	git	git://git.monstr.eu/linux-2.6-microblaze.git#next
+mips		git	git://git.kernel.org/pub/scm/linux/kernel/git/mips/linux.git#mips-next
+openrisc	git	git://github.com/openrisc/linux.git#for-next
+parisc-hd	git	git://git.kernel.org/pub/scm/linux/kernel/git/deller/parisc-linux.git#for-next
+powerpc		git	git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git#next
+soc-fsl		git	git://git.kernel.org/pub/scm/linux/kernel/git/leo/linux.git#next
+risc-v		git	git://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux.git#for-next
+riscv-dt	git	git://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git#riscv-dt-for-next
+riscv-soc	git	git://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git#riscv-soc-for-next
+s390		git	git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git#for-next
+sh		git	git:git.kernel.org/pub/scm/linux/kernel/git/glaubitz/sh-linux.git#for-next
+uml		git	git://git.kernel.org/pub/scm/linux/kernel/git/uml/linux.git#next
+xtensa		git	git://github.com/jcmvbkbc/linux-xtensa.git#xtensa-for-next
+bcachefs	git	https://evilpiepirate.org/git/bcachefs.git#for-next
+pidfd		git	git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git#for-next
+fscrypt		git	git://git.kernel.org/pub/scm/fs/fscrypt/linux.git#for-next
+afs		git	git://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs.git#afs-next
+btrfs		git	git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux.git#for-next
+ceph		git	git://github.com/ceph/ceph-client.git#master
+cifs		git	git://git.samba.org/sfrench/cifs-2.6.git#for-next
+configfs	git	git://git.infradead.org/users/hch/configfs.git#for-next
+ecryptfs	git	git://git.kernel.org/pub/scm/linux/kernel/git/tyhicks/ecryptfs.git#next
+erofs		git	git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git#dev
+exfat		git	git://git.kernel.org/pub/scm/linux/kernel/git/linkinjeon/exfat.git#dev
+exportfs	git	git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux#exportfs-next
+ext3		git	git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs.git#for_next
+ext4		git	git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git#dev
+f2fs		git	git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs.git#dev
+fsverity	git	git://git.kernel.org/pub/scm/fs/fsverity/linux.git#for-next
+fuse		git	git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse.git#for-next
+gfs2		git	git://git.kernel.org/pub/scm/linux/kernel/git/gfs2/linux-gfs2.git#for-next
+jfs		git	git://github.com/kleikamp/linux-shaggy.git#jfs-next
+ksmbd		git	https://github.com/smfrench/smb3-kernel.git#ksmbd-for-next
+nfs		git	git://git.linux-nfs.org/projects/trondmy/nfs-2.6.git#linux-next
+nfs-anna	git	git://git.linux-nfs.org/projects/anna/linux-nfs.git#linux-next
+nfsd		git	git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux#nfsd-next
+ntfs3		git	https://github.com/Paragon-Software-Group/linux-ntfs3.git#master
+orangefs	git	git://git.kernel.org/pub/scm/linux/kernel/git/hubcap/linux#for-next
+overlayfs	git	git://git.kernel.org/pub/scm/linux/kernel/git/overlayfs/vfs.git#overlayfs-next
+ubifs		git	git://git.kernel.org/pub/scm/linux/kernel/git/rw/ubifs.git#next
+v9fs		git	git://github.com/martinetd/linux#9p-next
+v9fs-ericvh	git	git://git.kernel.org/pub/scm/linux/kernel/git/ericvh/v9fs.git#ericvh/for-next
+xfs		git	git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git#for-next
+zonefs		git	git://git.kernel.org/pub/scm/linux/kernel/git/dlemoal/zonefs.git#for-next
+iomap		git	git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git#iomap-for-next
+djw-vfs		git	git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git#vfs-for-next
+file-locks	git	git://git.kernel.org/pub/scm/linux/kernel/git/jlayton/linux.git#locks-next
+iversion	git	git://git.kernel.org/pub/scm/linux/kernel/git/jlayton/linux.git#iversion-next
+vfs-brauner	git	git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git#vfs.all
+vfs		git	git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git#for-next
+printk		git	git://git.kernel.org/pub/scm/linux/kernel/git/printk/linux.git#for-next
+pci		git	git://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git#next
+pstore		git	git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git#for-next/pstore
+hid		git	git://git.kernel.org/pub/scm/linux/kernel/git/hid/hid.git#for-next
+i2c		git	git://git.kernel.org/pub/scm/linux/kernel/git/wsa/linux.git#i2c/for-next
+i2c-host	git	git://git.kernel.org/pub/scm/linux/kernel/git/andi.shyti/linux.git#i2c/i2c-host
+i3c		git	git://git.kernel.org/pub/scm/linux/kernel/git/i3c/linux.git#i3c/next
+hwmon-staging	git	git://git.kernel.org/pub/scm/linux/kernel/git/groeck/linux-staging.git#hwmon-next
+jc_docs		git	git://git.lwn.net/linux.git#docs-next
+v4l-dvb		git	git://linuxtv.org/media_tree.git#master
+v4l-dvb-next	git	git://linuxtv.org/mchehab/media-next.git#master
+pm		git	git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git#linux-next
+cpufreq-arm	git	git://git.kernel.org/pub/scm/linux/kernel/git/vireshk/pm.git#cpufreq/arm/linux-next
+cpupower	git	git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux.git#cpupower
+devfreq		git	git://git.kernel.org/pub/scm/linux/kernel/git/chanwoo/linux.git#devfreq-next
+pmdomain	git	git://git.kernel.org/pub/scm/linux/kernel/git/ulfh/linux-pm.git#next
+opp		git	git://git.kernel.org/pub/scm/linux/kernel/git/vireshk/pm.git#opp/linux-next
+thermal		git	git://git.kernel.org/pub/scm/linux/kernel/git/thermal/linux.git#thermal/linux-next
+dlm		git	git://git.kernel.org/pub/scm/linux/kernel/git/teigland/linux-dlm.git#next
+rdma		git	git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git#for-next
+net-next	git	git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git#main
+bpf-next	git	git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git#for-next
+ipsec-next	git	git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next.git#master
+mlx5-next	git	git://git.kernel.org/pub/scm/linux/kernel/git/mellanox/linux.git#mlx5-next
+netfilter-next	git	git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next.git#main
+ipvs-next	git	git://git.kernel.org/pub/scm/linux/kernel/git/horms/ipvs-next.git#main
+bluetooth	git	git://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth-next.git#master
+wireless-next	git	git://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless-next.git#for-next
+wpan-next	git	git://git.kernel.org/pub/scm/linux/kernel/git/wpan/wpan-next.git#master
+wpan-staging	git	git://git.kernel.org/pub/scm/linux/kernel/git/wpan/wpan-next.git#staging
+mtd		git	git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git#mtd/next
+nand		git	git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git#nand/next
+spi-nor		git	git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git#spi-nor/next
+crypto		git	git://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git#master
+drm		git	git://git.freedesktop.org/git/drm/drm.git#drm-next
+drm-ci		git	git://git.freedesktop.org/git/drm/drm.git#topic/drm-ci
+drm-exynos	git	git://git.kernel.org/pub/scm/linux/kernel/git/daeinki/drm-exynos.git#for-linux-next
+drm-misc	git	git://anongit.freedesktop.org/drm/drm-misc#for-linux-next
+amdgpu		git	https://gitlab.freedesktop.org/agd5f/linux#drm-next
+drm-intel	git	git://anongit.freedesktop.org/drm-intel#for-linux-next
+drm-tegra	git	https://gitlab.freedesktop.org/drm/tegra.git#for-next
+drm-msm		git	https://gitlab.freedesktop.org/drm/msm.git#msm-next
+drm-msm-lumag	git	https://gitlab.freedesktop.org/lumag/msm.git#msm-next-lumag
+etnaviv		git	https://git.pengutronix.de/git/lst/linux#etnaviv/next
+fbdev		git	git://git.kernel.org/pub/scm/linux/kernel/git/deller/linux-fbdev.git#for-next
+regmap		git	git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regmap.git#for-next
+sound		git	git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/sound.git#for-next
+ieee1394	git	https://git.kernel.org/pub/scm/linux/kernel/git/ieee1394/linux1394.git#for-next
+sound-asoc	git	git://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound.git#for-next
+modules		git	git://git.kernel.org/pub/scm/linux/kernel/git/mcgrof/linux.git#modules-next
+input		git	git://git.kernel.org/pub/scm/linux/kernel/git/dtor/input.git#next
+block		git	git://git.kernel.dk/linux-block.git#for-next
+device-mapper	git	git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm.git#for-next
+libata		git	git://git.kernel.org/pub/scm/linux/kernel/git/libata/linux#for-next
+pcmcia		git	git://git.kernel.org/pub/scm/linux/kernel/git/brodo/linux.git#pcmcia-next
+mmc		git	git://git.kernel.org/pub/scm/linux/kernel/git/ulfh/mmc.git#next
+mfd		git	git://git.kernel.org/pub/scm/linux/kernel/git/lee/mfd.git#for-mfd-next
+backlight	git	git://git.kernel.org/pub/scm/linux/kernel/git/lee/backlight.git#for-backlight-next
+battery		git	git://git.kernel.org/pub/scm/linux/kernel/git/sre/linux-power-supply.git#for-next
+regulator	git	git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regulator.git#for-next
+security	git	git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/lsm.git#next
+apparmor	git	git://git.kernel.org/pub/scm/linux/kernel/git/jj/linux-apparmor#apparmor-next
+integrity	git	git://git.kernel.org/pub/scm/linux/kernel/git/zohar/linux-integrity#next-integrity
+selinux		git	git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/selinux.git#next
+smack		git	git://github.com/cschaufler/smack-next#next
+tomoyo		git	https://scm.osdn.net/gitroot/tomoyo/tomoyo-test1.git#master
+tpmdd		git	git://git.kernel.org/pub/scm/linux/kernel/git/jarkko/linux-tpmdd.git#next
+watchdog	git	git://www.linux-watchdog.org/linux-watchdog-next.git#master
+iommu		git	git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu.git#next
+audit		git	git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/audit.git#next
+devicetree	git	git://git.kernel.org/pub/scm/linux/kernel/git/robh/linux.git#for-next
+dt-krzk		git	git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-dt.git#for-next
+mailbox		git	git://git.kernel.org/pub/scm/linux/kernel/git/jassibrar/mailbox.git#for-next
+spi		git	git://git.kernel.org/pub/scm/linux/kernel/git/broonie/spi.git#for-next
+tip		git	git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git#master
+clockevents	git	git://git.linaro.org/people/daniel.lezcano/linux.git#timers/drivers/next
+edac		git	git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras.git#edac-for-next
+ftrace		git	git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git#for-next
+rcu		git	git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git#rcu/next
+kvm		git	git://git.kernel.org/pub/scm/virt/kvm/kvm.git#next
+kvm-arm		git	git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git#next
+kvms390		git	git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git#next
+kvm-ppc		git	git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git#topic/ppc-kvm
+kvm-riscv	git	https://github.com/kvm-riscv/linux.git#riscv_kvm_next
+kvm-x86		git	https://github.com/kvm-x86/linux.git#next
+xen-tip		git	git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip.git#linux-next
+percpu		git	git://git.kernel.org/pub/scm/linux/kernel/git/dennis/percpu.git#for-next
+workqueues	git	git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git#for-next
+drivers-x86	git	git://git.kernel.org/pub/scm/linux/kernel/git/pdx86/platform-drivers-x86.git#for-next
+chrome-platform	git	git://git.kernel.org/pub/scm/linux/kernel/git/chrome-platform/linux.git#for-next
+chrome-platform-firmware	git	git://git.kernel.org/pub/scm/linux/kernel/git/chrome-platform/linux.git#for-firmware-next
+hsi		git	git://git.kernel.org/pub/scm/linux/kernel/git/sre/linux-hsi.git#for-next
+leds-lj		git	git://git.kernel.org/pub/scm/linux/kernel/git/lee/leds.git#for-leds-next
+ipmi		git	git://github.com/cminyard/linux-ipmi.git#for-next
+driver-core	git	git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/driver-core.git#driver-core-next
+usb		git	git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git#usb-next
+thunderbolt	git	git://git.kernel.org/pub/scm/linux/kernel/git/westeri/thunderbolt.git#next
+usb-serial	git	git://git.kernel.org/pub/scm/linux/kernel/git/johan/usb-serial.git#usb-next
+tty		git	git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/tty.git#tty-next
+char-misc	git	git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git#char-misc-next
+accel		git	git://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux.git#habanalabs-next
+coresight	git	git://git.kernel.org/pub/scm/linux/kernel/git/coresight/linux.git#next
+fastrpc		git	git://git.kernel.org/pub/scm/linux/kernel/git/srini/fastrpc.git#for-next
+fpga		git	git://git.kernel.org/pub/scm/linux/kernel/git/fpga/linux-fpga.git#for-next
+icc		git	git://git.kernel.org/pub/scm/linux/kernel/git/djakov/icc.git#icc-next
+iio		git	git://git.kernel.org/pub/scm/linux/kernel/git/jic23/iio.git#togreg
+phy-next	git	git://git.kernel.org/pub/scm/linux/kernel/git/phy/linux-phy.git#next
+soundwire	git	git://git.kernel.org/pub/scm/linux/kernel/git/vkoul/soundwire.git#next
+extcon		git	git://git.kernel.org/pub/scm/linux/kernel/git/chanwoo/extcon.git#extcon-next
+gnss		git	git://git.kernel.org/pub/scm/linux/kernel/git/johan/gnss.git#gnss-next
+vfio		git	git://github.com/awilliam/linux-vfio.git#next
+w1		git	git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-w1.git#for-next
+spmi		git	git://git.kernel.org/pub/scm/linux/kernel/git/sboyd/spmi.git#spmi-next
+staging		git	git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/staging.git#staging-next
+counter-next	git	git://git.kernel.org/pub/scm/linux/kernel/git/wbg/counter.git#counter-next
+mux		git	https://gitlab.com/peda-linux/mux.git#for-next
+dmaengine	git	git://git.kernel.org/pub/scm/linux/kernel/git/vkoul/dmaengine.git#next
+cgroup		git	git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git#for-next
+scsi		git	git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi.git#for-next
+scsi-mkp	git	git://git.kernel.org/pub/scm/linux/kernel/git/mkp/scsi.git#for-next
+vhost		git	git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git#linux-next
+rpmsg		git	git://git.kernel.org/pub/scm/linux/kernel/git/remoteproc/linux.git#for-next
+gpio		git	git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-gpio.git#for-next
+gpio-brgl	git	git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git#gpio/for-next
+gpio-intel	git	git://git.kernel.org/pub/scm/linux/kernel/git/andy/linux-gpio-intel.git#for-next
+pinctrl		git	git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-pinctrl.git#for-next
+pinctrl-intel	git	git://git.kernel.org/pub/scm/linux/kernel/git/pinctrl/intel.git#for-next
+pinctrl-renesas	git	git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-drivers.git#renesas-pinctrl
+pinctrl-samsung	git	git://git.kernel.org/pub/scm/linux/kernel/git/pinctrl/samsung.git#for-next
+pwm		git	git://git.kernel.org/pub/scm/linux/kernel/git/ukleinek/linux.git#pwm/for-next
+ktest		git	git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-ktest.git#for-next
+kselftest	git	git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git#next
+kunit		git	git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git#test
+kunit-next	git	git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git#kunit
+livepatching	git	git://git.kernel.org/pub/scm/linux/kernel/git/livepatching/livepatching#for-next
+rtc		git	git://git.kernel.org/pub/scm/linux/kernel/git/abelloni/linux.git#rtc-next
+nvdimm		git	git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm.git#libnvdimm-for-next
+at24		git	git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git#at24/for-next
+ntb		git	https://github.com/jonmason/ntb.git#ntb-next
+seccomp		git	git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git#for-next/seccomp
+fsi		git	git://git.kernel.org/pub/scm/linux/kernel/git/joel/fsi.git#next
+slimbus		git	git://git.kernel.org/pub/scm/linux/kernel/git/srini/slimbus.git#for-next
+nvmem		git	git://git.kernel.org/pub/scm/linux/kernel/git/srini/nvmem.git#for-next
+xarray		git	git://git.infradead.org/users/willy/xarray.git#main
+hyperv		git	git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux.git#hyperv-next
+auxdisplay	git	https://github.com/ojeda/linux.git#auxdisplay
+kgdb		git	git://git.kernel.org/pub/scm/linux/kernel/git/danielt/linux.git#kgdb/for-next
+hmm		git	git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git#hmm
+cfi		git	git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git#cfi/next
+mhi		git	git://git.kernel.org/pub/scm/linux/kernel/git/mani/mhi.git#mhi-next
+memblock	git	git://git.kernel.org/pub/scm/linux/kernel/git/rppt/memblock.git#for-next
+cxl		git	git://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl.git#next
+zstd		git	https://github.com/terrelln/linux.git#zstd-next
+efi		git	git://git.kernel.org/pub/scm/linux/kernel/git/efi/efi.git#next
+unicode		git	git://git.kernel.org/pub/scm/linux/kernel/git/krisman/unicode.git#for-next
+slab		git	git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab.git#slab/for-next
+random		git	git://git.kernel.org/pub/scm/linux/kernel/git/crng/random.git#master
+landlock	git	git://git.kernel.org/pub/scm/linux/kernel/git/mic/linux.git#next
+rust		git	https://github.com/Rust-for-Linux/linux.git#rust-next
+sysctl		git	git://git.kernel.org/pub/scm/linux/kernel/git/sysctl/sysctl.git#sysctl-next
+execve		git	git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git#for-next/execve
+bitmap		git	https://github.com/norov/linux.git#bitmap-for-next
+hte		git	git://git.kernel.org/pub/scm/linux/kernel/git/pateldipen1984/linux.git#for-next
+kspp		git	git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git#for-next/kspp
+kspp-gustavo	git	git://git.kernel.org/pub/scm/linux/kernel/git/gustavoars/linux.git#for-next/kspp
+nolibc		git	git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git#nolibc
+tsm		git	git://git.kernel.org/pub/scm/linux/kernel/git/djbw/linux#tsm-next
+iommufd		git	git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd.git#for-next
+header_cleanup	git	https://evilpiepirate.org/git/bcachefs.git#header_cleanup
diff --git a/Next/merge.log b/Next/merge.log
new file mode 100644
index 00000000000000..32af96d5094a9c
--- /dev/null
+++ b/Next/merge.log
@@ -0,0 +1,5227 @@
+$ date -R
+Thu, 01 Feb 2024 08:11:51 +1100
+$ git checkout master
+Already on 'master'
+$ git reset --hard stable
+HEAD is now at 861c0981648f Merge tag 'jfs-6.8-rc3' of github.com:kleikamp/linux-shaggy
+Merging origin/master (6764c317b6bb Merge tag 'scsi-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi)
+$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git origin/master
+Updating 861c0981648f..6764c317b6bb
+Fast-forward (no commit created; -m option ignored)
+ Documentation/dev-tools/kunit/usage.rst            |  19 +++-
+ MAINTAINERS                                        |   3 +-
+ drivers/scsi/initio.c                              |   3 +-
+ drivers/scsi/isci/request.c                        |   2 +-
+ drivers/scsi/scsi_error.c                          |   8 +-
+ drivers/scsi/scsi_lib.c                            |   2 +-
+ drivers/scsi/scsi_priv.h                           |   2 +-
+ drivers/scsi/storvsc_drv.c                         |  12 ++-
+ drivers/scsi/virtio_scsi.c                         |   2 -
+ drivers/soc/apple/mailbox.c                        |   6 +-
+ fs/erofs/compress.h                                |   5 +-
+ fs/erofs/decompressor.c                            |   5 +-
+ fs/erofs/decompressor_deflate.c                    |  19 ++--
+ fs/erofs/decompressor_lzma.c                       |  17 +++-
+ fs/erofs/fscache.c                                 |   2 +-
+ fs/erofs/inode.c                                   |   2 +-
+ fs/erofs/utils.c                                   |   2 +-
+ fs/erofs/zdata.c                                   |  98 ++++++++++---------
+ lib/kunit/device.c                                 |   4 +-
+ lib/kunit/executor.c                               |   4 +
+ lib/kunit/kunit-test.c                             |   2 +-
+ lib/kunit/test.c                                   |  14 ++-
+ tools/testing/selftests/livepatch/functions.sh     |  37 ++++----
+ .../testing/selftests/rseq/basic_percpu_ops_test.c |  14 ++-
+ tools/testing/selftests/rseq/param_test.c          |  22 +++--
+ .../testing/selftests/seccomp/seccomp_benchmark.c  | 104 +++++++++++++--------
+ 26 files changed, 251 insertions(+), 159 deletions(-)
+Merging fixes/fixes (2dde18cd1d8f Linux 6.5)
+$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/sfr/next-fixes.git fixes/fixes
+Already up to date.
+Merging mm-hotfixes/mm-hotfixes-unstable (bbac2bacc158 mm/zswap: don't return LRU_SKIP if we have dropped lru lock)
+$ git merge -m Merge branch 'mm-hotfixes-unstable' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-hotfixes/mm-hotfixes-unstable
+Merge made by the 'ort' strategy.
+ .mailmap                                        |  1 +
+ arch/arm/mm/fault.c                             |  2 +
+ fs/hugetlbfs/inode.c                            | 19 +++++--
+ fs/nilfs2/recovery.c                            |  7 +--
+ fs/proc/array.c                                 | 66 ++++++++++++++-----------
+ fs/xfs/scrub/xfile.c                            |  5 ++
+ include/linux/pagemap.h                         | 14 ++++++
+ kernel/exit.c                                   | 10 ++--
+ kernel/sys.c                                    | 54 ++++++++++++--------
+ mm/madvise.c                                    |  1 +
+ mm/memcontrol.c                                 | 49 +++++++++++-------
+ mm/memory-failure.c                             |  3 ++
+ mm/userfaultfd.c                                | 14 +++---
+ mm/zswap.c                                      | 12 ++---
+ tools/testing/selftests/core/close_range_test.c |  1 +
+ 15 files changed, 165 insertions(+), 93 deletions(-)
+Merging kbuild-current/fixes (bfef491df670 kconfig: initialize sym->curr.tri to 'no' for all symbol types again)
+$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild.git kbuild-current/fixes
+Auto-merging Makefile
+Merge made by the 'ort' strategy.
+ Makefile                    | 14 +++++++-------
+ arch/m68k/Makefile          |  4 ++--
+ arch/parisc/Makefile        |  4 ++--
+ arch/um/Makefile            |  4 +++-
+ arch/x86/Makefile           | 10 +++++-----
+ scripts/Makefile.defconf    |  8 ++++----
+ scripts/kconfig/symbol.c    |  4 +++-
+ scripts/mod/modpost.c       | 15 +++------------
+ scripts/mod/modpost.h       |  6 +-----
+ scripts/package/kernel.spec | 22 +++++++++++-----------
+ 10 files changed, 41 insertions(+), 50 deletions(-)
+Merging arc-current/for-curr (861deac3b092 Linux 6.7-rc7)
+$ git merge -m Merge branch 'for-curr' of git://git.kernel.org/pub/scm/linux/kernel/git/vgupta/arc.git arc-current/for-curr
+Already up to date.
+Merging arm-current/fixes (f54e8634d136 ARM: 9330/1: davinci: also select PINCTRL)
+$ git merge -m Merge branch 'fixes' of git://git.armlinux.org.uk/~rmk/linux-arm.git arm-current/fixes
+Already up to date.
+Merging arm64-fixes/for-next/fixes (c7767f5c43df arm64: vdso32: Remove unused vdso32-offsets.h)
+$ git merge -m Merge branch 'for-next/fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux arm64-fixes/for-next/fixes
+Merge made by the 'ort' strategy.
+ arch/arm64/Makefile               | 2 +-
+ arch/arm64/include/asm/vdso.h     | 3 ---
+ arch/arm64/kernel/Makefile        | 6 +++---
+ arch/arm64/kernel/vdso32/Makefile | 9 ---------
+ 4 files changed, 4 insertions(+), 16 deletions(-)
+Merging arm-soc-fixes/arm/fixes (1b5af823d703 soc/tegra: fix build failure on Tegra241)
+$ git merge -m Merge branch 'arm/fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/soc/soc.git arm-soc-fixes/arm/fixes
+Merge made by the 'ort' strategy.
+ drivers/soc/tegra/fuse/fuse-tegra30.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+Merging davinci-current/davinci/for-current (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'davinci/for-current' of git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git davinci-current/davinci/for-current
+Already up to date.
+Merging drivers-memory-fixes/fixes (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'fixes' of https://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-mem-ctrl.git drivers-memory-fixes/fixes
+Already up to date.
+Merging sophgo-fixes/fixes (41bccc98fb79 Linux 6.8-rc2)
+$ git merge -m Merge branch 'fixes' of https://github.com/sophgo/linux.git sophgo-fixes/fixes
+Already up to date.
+Merging tee-fixes/fixes (ceaa837f96ad Linux 6.2-rc8)
+$ git merge -m Merge branch 'fixes' of https://git.linaro.org/people/jens.wiklander/linux-tee.git tee-fixes/fixes
+Already up to date.
+Merging m68k-current/for-linus (6b9c045b0602 m68k: defconfig: Update defconfigs for v6.7-rc1)
+$ git merge -m Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/geert/linux-m68k.git m68k-current/for-linus
+Already up to date.
+Merging powerpc-fixes/fixes (18f14afe2816 powerpc/64s: Increase default stack size to 32KB)
+$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git powerpc-fixes/fixes
+Already up to date.
+Merging s390-fixes/fixes (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git s390-fixes/fixes
+Already up to date.
+Merging fscrypt-current/for-current (4bcf6f827a79 fscrypt: check for NULL keyring in fscrypt_put_master_key_activeref())
+$ git merge -m Merge branch 'for-current' of git://git.kernel.org/pub/scm/fs/fscrypt/linux.git fscrypt-current/for-current
+Already up to date.
+Merging fsverity-current/for-current (a075bacde257 fsverity: don't drop pagecache at end of FS_IOC_ENABLE_VERITY)
+$ git merge -m Merge branch 'for-current' of git://git.kernel.org/pub/scm/fs/fsverity/linux.git fsverity-current/for-current
+Already up to date.
+Merging net/main (c9ec85153fea selftests: net: add missing config for GENEVE)
+$ git merge -m Merge branch 'main' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git net/main
+Auto-merging MAINTAINERS
+Merge made by the 'ort' strategy.
+ MAINTAINERS                                        |   2 +
+ drivers/net/dsa/mt7530.c                           |   3 +-
+ drivers/net/dsa/qca/qca8k-8xxx.c                   |   3 +-
+ drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c      |   2 +-
+ drivers/net/ethernet/google/gve/gve_rx.c           |   8 +-
+ drivers/net/ethernet/intel/e1000e/e1000.h          |  20 +++
+ drivers/net/ethernet/intel/e1000e/ptp.c            |  22 ++-
+ drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c      |   3 +-
+ drivers/net/ethernet/mediatek/mtk_eth_soc.c        |   5 +-
+ .../net/ethernet/microchip/lan966x/lan966x_port.c  |   5 +-
+ .../net/ethernet/netronome/nfp/flower/conntrack.c  |  46 ++++++-
+ drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c    |   4 +
+ drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  |   3 +
+ drivers/net/phy/mediatek-ge-soc.c                  | 147 ++++++++++++---------
+ include/net/ip.h                                   |   2 +-
+ net/bridge/br_multicast.c                          |  20 ++-
+ net/bridge/br_private.h                            |   4 +-
+ net/devlink/port.c                                 |   2 +-
+ net/hsr/hsr_device.c                               |   4 +-
+ net/ipv4/ip_sockglue.c                             |   6 +-
+ net/ipv4/ipmr.c                                    |   2 +-
+ net/ipv4/raw.c                                     |   2 +-
+ net/ipv4/tcp.c                                     |  12 +-
+ net/ipv4/udp.c                                     |   2 +-
+ net/ipv6/addrconf_core.c                           |  21 ++-
+ net/ipv6/ip6_tunnel.c                              |  21 ++-
+ net/llc/af_llc.c                                   |   2 +
+ net/nfc/nci/core.c                                 |   4 +
+ net/smc/smc_core.c                                 |  12 +-
+ tools/testing/selftests/net/Makefile               |   6 +-
+ tools/testing/selftests/net/config                 |   9 ++
+ tools/testing/selftests/net/lib.sh                 |   5 +-
+ tools/testing/selftests/net/setup_veth.sh          |   2 +-
+ tools/testing/selftests/net/tcp_ao/config          |  10 ++
+ tools/testing/selftests/net/tcp_ao/settings        |   1 +
+ tools/testing/selftests/net/udpgro.sh              |   4 +-
+ tools/testing/selftests/net/udpgro_bench.sh        |   4 +-
+ tools/testing/selftests/net/udpgro_frglist.sh      |   6 +-
+ tools/testing/selftests/net/udpgro_fwd.sh          |   8 +-
+ tools/testing/selftests/net/veth.sh                |   4 +-
+ tools/testing/selftests/net/xdp_dummy.c            |  13 ++
+ 41 files changed, 326 insertions(+), 135 deletions(-)
+ create mode 100644 tools/testing/selftests/net/tcp_ao/config
+ create mode 100644 tools/testing/selftests/net/tcp_ao/settings
+ create mode 100644 tools/testing/selftests/net/xdp_dummy.c
+Merging bpf/master (577e4432f3ac tcp: add sanity checks to rx zerocopy)
+$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git bpf/master
+Already up to date.
+Merging ipsec/master (983a73da1f99 xfrm: Pass UDP encapsulation in TX packet offload)
+$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec.git ipsec/master
+Merge made by the 'ort' strategy.
+ net/xfrm/xfrm_device.c | 2 +-
+ net/xfrm/xfrm_policy.c | 2 +-
+ 2 files changed, 2 insertions(+), 2 deletions(-)
+Merging netfilter/main (a2933a8759a6 selftests: bonding: do not test arp/ns target with mode balance-alb/tlb)
+$ git merge -m Merge branch 'main' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf.git netfilter/main
+Already up to date.
+Merging ipvs/main (a2933a8759a6 selftests: bonding: do not test arp/ns target with mode balance-alb/tlb)
+$ git merge -m Merge branch 'main' of git://git.kernel.org/pub/scm/linux/kernel/git/horms/ipvs.git ipvs/main
+Already up to date.
+Merging wireless/for-next (f3f8f0503168 wifi: fill in MODULE_DESCRIPTION()s for mt76 drivers)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless.git wireless/for-next
+Auto-merging MAINTAINERS
+Merge made by the 'ort' strategy.
+ MAINTAINERS                                                   | 9 ++++-----
+ drivers/net/wireless/ath/ar5523/ar5523.c                      | 1 +
+ drivers/net/wireless/ath/wcn36xx/main.c                       | 1 +
+ drivers/net/wireless/broadcom/brcm80211/brcmfmac/bca/module.c | 1 +
+ drivers/net/wireless/broadcom/brcm80211/brcmfmac/cyw/module.c | 1 +
+ drivers/net/wireless/broadcom/brcm80211/brcmfmac/wcc/module.c | 1 +
+ drivers/net/wireless/intel/iwlwifi/iwl-drv.c                  | 1 +
+ drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c             | 6 ++++--
+ drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c         | 3 ++-
+ drivers/net/wireless/intersil/p54/p54spi.c                    | 1 +
+ drivers/net/wireless/mediatek/mt76/mt7603/main.c              | 1 +
+ drivers/net/wireless/mediatek/mt76/mt7615/main.c              | 1 +
+ drivers/net/wireless/mediatek/mt76/mt7615/mmio.c              | 1 +
+ drivers/net/wireless/mediatek/mt76/mt7615/sdio.c              | 1 +
+ drivers/net/wireless/mediatek/mt76/mt7615/usb.c               | 1 +
+ drivers/net/wireless/mediatek/mt76/mt7615/usb_sdio.c          | 1 +
+ drivers/net/wireless/mediatek/mt76/mt76_connac_mcu.c          | 1 +
+ drivers/net/wireless/mediatek/mt76/mt76x0/eeprom.c            | 1 +
+ drivers/net/wireless/mediatek/mt76/mt76x0/pci.c               | 1 +
+ drivers/net/wireless/mediatek/mt76/mt76x0/usb.c               | 1 +
+ drivers/net/wireless/mediatek/mt76/mt76x02_usb_mcu.c          | 1 +
+ drivers/net/wireless/mediatek/mt76/mt76x02_util.c             | 1 +
+ drivers/net/wireless/mediatek/mt76/mt76x2/eeprom.c            | 1 +
+ drivers/net/wireless/mediatek/mt76/mt76x2/pci.c               | 1 +
+ drivers/net/wireless/mediatek/mt76/mt76x2/usb.c               | 1 +
+ drivers/net/wireless/mediatek/mt76/mt7915/mmio.c              | 1 +
+ drivers/net/wireless/mediatek/mt76/mt7921/main.c              | 1 +
+ drivers/net/wireless/mediatek/mt76/mt7921/pci.c               | 1 +
+ drivers/net/wireless/mediatek/mt76/mt7921/sdio.c              | 1 +
+ drivers/net/wireless/mediatek/mt76/mt7921/usb.c               | 1 +
+ drivers/net/wireless/mediatek/mt76/mt7925/main.c              | 1 +
+ drivers/net/wireless/mediatek/mt76/mt7925/pci.c               | 1 +
+ drivers/net/wireless/mediatek/mt76/mt7925/usb.c               | 1 +
+ drivers/net/wireless/mediatek/mt76/mt792x_core.c              | 1 +
+ drivers/net/wireless/mediatek/mt76/mt792x_usb.c               | 1 +
+ drivers/net/wireless/mediatek/mt76/mt7996/mmio.c              | 1 +
+ drivers/net/wireless/mediatek/mt76/sdio.c                     | 1 +
+ drivers/net/wireless/mediatek/mt76/usb.c                      | 1 +
+ drivers/net/wireless/mediatek/mt76/util.c                     | 1 +
+ drivers/net/wireless/microchip/wilc1000/netdev.c              | 1 +
+ drivers/net/wireless/microchip/wilc1000/sdio.c                | 1 +
+ drivers/net/wireless/microchip/wilc1000/spi.c                 | 1 +
+ drivers/net/wireless/ti/wl1251/sdio.c                         | 1 +
+ drivers/net/wireless/ti/wl1251/spi.c                          | 1 +
+ drivers/net/wireless/ti/wl12xx/main.c                         | 1 +
+ drivers/net/wireless/ti/wl18xx/main.c                         | 1 +
+ drivers/net/wireless/ti/wlcore/main.c                         | 1 +
+ drivers/net/wireless/ti/wlcore/sdio.c                         | 1 +
+ drivers/net/wireless/ti/wlcore/spi.c                          | 1 +
+ net/mac80211/wbrf.c                                           | 2 --
+ net/wireless/core.c                                           | 3 ++-
+ 51 files changed, 58 insertions(+), 11 deletions(-)
+Merging wpan/master (b85ea95d0864 Linux 6.7-rc1)
+$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/wpan/wpan.git wpan/master
+Already up to date.
+Merging rdma-fixes/for-rc (80dde187f734 RDMA/bnxt_re: Add a missing check in bnxt_qplib_query_srq)
+$ git merge -m Merge branch 'for-rc' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git rdma-fixes/for-rc
+Merge made by the 'ort' strategy.
+ drivers/infiniband/hw/bnxt_re/ib_verbs.c | 43 +++++++++++++++++++++-----------
+ drivers/infiniband/hw/bnxt_re/main.c     |  3 ---
+ drivers/infiniband/hw/bnxt_re/qplib_fp.c |  3 ++-
+ drivers/infiniband/hw/hfi1/pio.c         |  6 ++++-
+ 4 files changed, 35 insertions(+), 20 deletions(-)
+Merging sound-current/for-linus (2468e8922d2f ALSA: hda/realtek: Apply headset jack quirk for non-bass alc287 thinkpads)
+$ git merge -m Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/sound.git sound-current/for-linus
+Merge made by the 'ort' strategy.
+ sound/core/pcm.c                     |  4 ++++
+ sound/pci/hda/cs35l41_hda_property.c |  4 ++++
+ sound/pci/hda/hda_intel.c            |  6 ++++--
+ sound/pci/hda/patch_cs8409.c         |  1 +
+ sound/pci/hda/patch_realtek.c        | 15 +++++++++++---
+ sound/usb/clock.c                    | 24 ++++++++++++++++++++++-
+ sound/usb/format.c                   | 20 +++++++++++++++++++
+ sound/usb/midi2.c                    |  2 +-
+ sound/usb/quirks.c                   | 38 +++++++++++++++++++++---------------
+ sound/virtio/virtio_card.c           |  2 --
+ sound/virtio/virtio_ctl_msg.c        |  2 --
+ sound/virtio/virtio_pcm_msg.c        |  2 --
+ 12 files changed, 91 insertions(+), 29 deletions(-)
+Merging sound-asoc-fixes/for-linus (eeab239d6a24 ASoC: wcd934x: fix an incorrect use of kstrndup())
+$ git merge -m Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound.git sound-asoc-fixes/for-linus
+Merge made by the 'ort' strategy.
+ .../bindings/sound/allwinner,sun4i-a10-spdif.yaml  |   5 +-
+ sound/soc/amd/acp/acp-mach-common.c                |  16 +-
+ sound/soc/amd/acp/acp-sof-mach.c                   |   4 +
+ sound/soc/amd/acp/acp3x-es83xx/acp3x-es83xx.c      |   8 +
+ sound/soc/amd/yc/acp6x-mach.c                      |   7 +
+ sound/soc/codecs/es8326.c                          | 186 +++++++++++++++------
+ sound/soc/codecs/es8326.h                          |   3 +-
+ sound/soc/codecs/lpass-wsa-macro.c                 |   7 -
+ sound/soc/codecs/wcd9335.c                         |   4 -
+ sound/soc/codecs/wcd934x.c                         |   3 +-
+ sound/soc/codecs/wcd938x.c                         |   8 +-
+ sound/soc/codecs/wsa883x.c                         |   6 +-
+ sound/soc/qcom/sc8280xp.c                          |  12 +-
+ sound/soc/soc-core.c                               |   5 +-
+ sound/soc/sunxi/sun4i-spdif.c                      |   5 +
+ 15 files changed, 203 insertions(+), 76 deletions(-)
+ mode change 100755 => 100644 sound/soc/codecs/es8326.c
+Merging regmap-fixes/for-linus (8b921545ddc6 Merge remote-tracking branch 'regmap/for-6.7' into regmap-linus)
+$ git merge -m Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regmap.git regmap-fixes/for-linus
+Merge made by the 'ort' strategy.
+Merging regulator-fixes/for-linus (a3fa9838e814 regulator (max5970): Fix IRQ handler)
+$ git merge -m Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regulator.git regulator-fixes/for-linus
+Merge made by the 'ort' strategy.
+ drivers/regulator/max5970-regulator.c |  2 +-
+ drivers/regulator/pwm-regulator.c     | 43 +++++++++++++++++++++++++++++++++++
+ drivers/regulator/ti-abb-regulator.c  | 22 +++++++++++++++---
+ 3 files changed, 63 insertions(+), 4 deletions(-)
+Merging spi-fixes/for-linus (6500ad28fd5d spi: sh-msiof: avoid integer overflow in constants)
+$ git merge -m Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/spi.git spi-fixes/for-linus
+Merge made by the 'ort' strategy.
+ drivers/spi/spi-sh-msiof.c | 16 ++++++++--------
+ 1 file changed, 8 insertions(+), 8 deletions(-)
+Merging pci-current/for-linus (925bd5e08106 MAINTAINERS: Add Manivannan Sadhasivam as PCI Endpoint maintainer)
+$ git merge -m Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git pci-current/for-linus
+Auto-merging MAINTAINERS
+Merge made by the 'ort' strategy.
+ MAINTAINERS                            |  3 +-
+ drivers/pci/bus.c                      | 49 +++++++++++++--------
+ drivers/pci/controller/dwc/pcie-qcom.c |  2 +-
+ drivers/pci/pci.c                      | 78 ++++++++++++++++++++++------------
+ drivers/pci/pci.h                      |  4 +-
+ drivers/pci/pcie/aspm.c                | 13 ++++--
+ include/linux/pci.h                    |  5 +++
+ 7 files changed, 102 insertions(+), 52 deletions(-)
+Merging driver-core.current/driver-core-linus (98323e9d7017 topology: Set capacity_freq_ref in all cases)
+$ git merge -m Merge branch 'driver-core-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/driver-core.git driver-core.current/driver-core-linus
+Merge made by the 'ort' strategy.
+ drivers/base/arch_topology.c | 13 ++++++-------
+ 1 file changed, 6 insertions(+), 7 deletions(-)
+Merging tty.current/tty-linus (b35f8dbbce81 serial: max310x: prevent infinite while() loop in port startup)
+$ git merge -m Merge branch 'tty-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/tty.git tty.current/tty-linus
+Merge made by the 'ort' strategy.
+ drivers/tty/serial/8250/8250_pci1xxxx.c |  4 +--
+ drivers/tty/serial/max310x.c            | 53 ++++++++++++++++++++++++++-------
+ drivers/tty/serial/serial_core.c        |  2 +-
+ include/uapi/linux/serial.h             | 13 ++++----
+ 4 files changed, 53 insertions(+), 19 deletions(-)
+Merging usb.current/usb-linus (f2e5d3de7e1f usb: typec: tcpm: fix the PD disabled case)
+$ git merge -m Merge branch 'usb-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git usb.current/usb-linus
+Merge made by the 'ort' strategy.
+ Documentation/usb/gadget-testing.rst         | 22 +++----
+ drivers/usb/chipidea/ci.h                    |  2 +
+ drivers/usb/chipidea/core.c                  | 44 +++++++-------
+ drivers/usb/common/ulpi.c                    |  2 +-
+ drivers/usb/core/hub.c                       | 46 ++++++++++-----
+ drivers/usb/dwc3/dwc3-pci.c                  |  4 ++
+ drivers/usb/dwc3/gadget.c                    |  6 +-
+ drivers/usb/dwc3/host.c                      |  4 +-
+ drivers/usb/gadget/function/f_mass_storage.c | 20 ++++++-
+ drivers/usb/gadget/function/f_ncm.c          |  8 +--
+ drivers/usb/gadget/udc/pch_udc.c             |  1 -
+ drivers/usb/host/xhci-mem.c                  | 16 ++---
+ drivers/usb/host/xhci-plat.c                 |  3 +
+ drivers/usb/host/xhci-ring.c                 | 80 ++++++++++++++++++++-----
+ drivers/usb/host/xhci.h                      |  1 +
+ drivers/usb/typec/tcpm/tcpm.c                |  6 +-
+ drivers/usb/typec/ucsi/ucsi.c                |  2 +
+ drivers/usb/typec/ucsi/ucsi_acpi.c           | 88 +++++++++++++++++++++++++---
+ 18 files changed, 264 insertions(+), 91 deletions(-)
+Merging usb-serial-fixes/usb-linus (b4a1f4eaf1d7 USB: serial: option: add Fibocom FM101-GL variant)
+$ git merge -m Merge branch 'usb-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/johan/usb-serial.git usb-serial-fixes/usb-linus
+Merge made by the 'ort' strategy.
+ drivers/usb/serial/cp210x.c   | 1 +
+ drivers/usb/serial/option.c   | 1 +
+ drivers/usb/serial/qcserial.c | 2 ++
+ 3 files changed, 4 insertions(+)
+Merging phy/fixes (7104ba0f1958 phy: ti: phy-omap-usb2: Fix NULL pointer dereference for SRP)
+$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/phy/linux-phy.git phy/fixes
+Merge made by the 'ort' strategy.
+ drivers/phy/microchip/lan966x_serdes.c   |  2 ++
+ drivers/phy/qualcomm/phy-qcom-qmp-usb.c  | 30 ++++++++++++++++++++++++++++--
+ drivers/phy/renesas/phy-rcar-gen3-usb2.c |  4 ----
+ drivers/phy/ti/phy-omap-usb2.c           |  4 ++--
+ 4 files changed, 32 insertions(+), 8 deletions(-)
+Merging staging.current/staging-linus (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'staging-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/staging.git staging.current/staging-linus
+Already up to date.
+Merging iio-fixes/fixes-togreg (6f6c72acddf4 iio: move LIGHT_UVA and LIGHT_UVB to the end of iio_modifier)
+$ git merge -m Merge branch 'fixes-togreg' of git://git.kernel.org/pub/scm/linux/kernel/git/jic23/iio.git iio-fixes/fixes-togreg
+Auto-merging MAINTAINERS
+Merge made by the 'ort' strategy.
+ MAINTAINERS                                     |  8 ++++++++
+ drivers/iio/adc/ad7091r8.c                      |  2 +-
+ drivers/iio/humidity/Kconfig                    | 12 ++++++++++++
+ drivers/iio/humidity/Makefile                   |  1 +
+ drivers/iio/humidity/hdc3020.c                  |  2 +-
+ drivers/iio/imu/bno055/Kconfig                  |  1 +
+ drivers/iio/industrialio-core.c                 |  5 ++++-
+ drivers/iio/magnetometer/rm3100-core.c          | 10 ++++++++--
+ drivers/iio/pressure/bmp280-spi.c               |  1 +
+ drivers/staging/iio/impedance-analyzer/ad5933.c |  2 +-
+ include/linux/iio/adc/ad_sigma_delta.h          |  4 +++-
+ include/linux/iio/imu/adis.h                    |  3 ++-
+ include/uapi/linux/iio/types.h                  |  4 ++--
+ 13 files changed, 45 insertions(+), 10 deletions(-)
+Merging counter-current/counter-current (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'counter-current' of git://git.kernel.org/pub/scm/linux/kernel/git/wbg/counter.git counter-current/counter-current
+Already up to date.
+Merging char-misc.current/char-misc-linus (ac9762a74c7c misc: open-dice: Fix spurious lockdep warning)
+$ git merge -m Merge branch 'char-misc-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git char-misc.current/char-misc-linus
+Merge made by the 'ort' strategy.
+ drivers/misc/fastrpc.c   | 2 +-
+ drivers/misc/open-dice.c | 2 +-
+ 2 files changed, 2 insertions(+), 2 deletions(-)
+Merging soundwire-fixes/fixes (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vkoul/soundwire.git soundwire-fixes/fixes
+Already up to date.
+Merging thunderbolt-fixes/fixes (ec4d82f855ce thunderbolt: Fix setting the CNS bit in ROUTER_CS_5)
+$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/westeri/thunderbolt.git thunderbolt-fixes/fixes
+Merge made by the 'ort' strategy.
+ drivers/thunderbolt/tb_regs.h | 2 +-
+ drivers/thunderbolt/usb4.c    | 2 +-
+ 2 files changed, 2 insertions(+), 2 deletions(-)
+Merging input-current/for-linus (2b9c3eb32a69 Input: bcm5974 - check endpoint type before starting traffic)
+$ git merge -m Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/dtor/input.git input-current/for-linus
+Merge made by the 'ort' strategy.
+ drivers/input/joystick/xpad.c      |  2 ++
+ drivers/input/mouse/bcm5974.c      | 20 ++++++++++++++++++++
+ drivers/input/touchscreen/goodix.c |  3 ++-
+ 3 files changed, 24 insertions(+), 1 deletion(-)
+Merging crypto-current/master (c5a2f74db71a crypto: caam - fix asynchronous hash)
+$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6.git crypto-current/master
+Merge made by the 'ort' strategy.
+ drivers/crypto/caam/caamalg_qi2.c                    | 7 +++++--
+ drivers/crypto/caam/caamhash.c                       | 7 +++++--
+ drivers/crypto/intel/qat/qat_4xxx/adf_4xxx_hw_data.c | 1 +
+ 3 files changed, 11 insertions(+), 4 deletions(-)
+Merging vfio-fixes/for-linus (4ea95c04fa6b vfio: Drop vfio_file_iommu_group() stub to fudge around a KVM wart)
+$ git merge -m Merge branch 'for-linus' of git://github.com/awilliam/linux-vfio.git vfio-fixes/for-linus
+Already up to date.
+Merging kselftest-fixes/fixes (b54761f6e977 kselftest/seccomp: Report each expectation we assert as a KTAP test)
+$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git kselftest-fixes/fixes
+Already up to date.
+Merging modules-fixes/modules-linus (f412eef03938 Documentation: livepatch: module-elf-format: Remove local klp_modinfo definition)
+$ git merge -m Merge branch 'modules-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mcgrof/linux.git modules-fixes/modules-linus
+Already up to date.
+Merging dmaengine-fixes/fixes (a22fe1d6dec7 dmaengine: fix is_slave_direction() return false when DMA_DEV_TO_DEV)
+$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vkoul/dmaengine.git dmaengine-fixes/fixes
+Merge made by the 'ort' strategy.
+ drivers/dma/at_hdmac.c                  | 21 +++++++++++----------
+ drivers/dma/fsl-dpaa2-qdma/dpaa2-qdma.c | 10 ++++++----
+ drivers/dma/fsl-qdma.c                  | 33 ++++++++++++---------------------
+ drivers/dma/ti/edma.c                   | 10 ++++++++++
+ drivers/dma/ti/k3-udma.c                | 10 ++++++++--
+ include/linux/dmaengine.h               |  3 ++-
+ 6 files changed, 49 insertions(+), 38 deletions(-)
+Merging backlight-fixes/for-backlight-fixes (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'for-backlight-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/lee/backlight.git backlight-fixes/for-backlight-fixes
+Already up to date.
+Merging mtd-fixes/mtd/fixes (7c1b1906229d mtd: spinand: gigadevice: Fix the get ecc status issue)
+$ git merge -m Merge branch 'mtd/fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git mtd-fixes/mtd/fixes
+Merge made by the 'ort' strategy.
+ drivers/mtd/nand/spi/gigadevice.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+Merging mfd-fixes/for-mfd-fixes (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'for-mfd-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/lee/mfd.git mfd-fixes/for-mfd-fixes
+Already up to date.
+Merging v4l-dvb-fixes/fixes (b32431b75321 media: vb2: refactor setting flags and caps, fix missing cap)
+$ git merge -m Merge branch 'fixes' of https://git.linuxtv.org/media_stage.git v4l-dvb-fixes/fixes
+Already up to date.
+Merging reset-fixes/reset/fixes (4a6756f56bcf reset: Fix crash when freeing non-existent optional resets)
+$ git merge -m Merge branch 'reset/fixes' of https://git.pengutronix.de/git/pza/linux reset-fixes/reset/fixes
+Already up to date.
+Merging mips-fixes/mips-fixes (59be5c358501 mips: Call lose_fpu(0) before initializing fcr31 in mips_set_personality_nan)
+$ git merge -m Merge branch 'mips-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/mips/linux.git mips-fixes/mips-fixes
+Already up to date.
+Merging at91-fixes/at91-fixes (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'at91-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/at91/linux.git at91-fixes/at91-fixes
+Already up to date.
+Merging omap-fixes/fixes (9b6a51aab5f5 ARM: dts: Fix occasional boot hang for am3 usb)
+$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tmlind/linux-omap.git omap-fixes/fixes
+Already up to date.
+Merging kvm-fixes/master (0dd3ee311255 Linux 6.7)
+$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/virt/kvm/kvm.git kvm-fixes/master
+Already up to date.
+Merging kvms390-fixes/master (83303a4c776c KVM: s390: fix cc for successful PQAP)
+$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git kvms390-fixes/master
+Auto-merging arch/s390/kvm/vsie.c
+Merge made by the 'ort' strategy.
+ arch/s390/kvm/priv.c | 8 ++++++--
+ arch/s390/kvm/vsie.c | 1 -
+ arch/s390/mm/gmap.c  | 1 +
+ 3 files changed, 7 insertions(+), 3 deletions(-)
+Merging hwmon-fixes/hwmon (915644189c22 hwmon: (pmbus/mp2975) Correct comment inside 'mp2975_read_byte_data')
+$ git merge -m Merge branch 'hwmon' of git://git.kernel.org/pub/scm/linux/kernel/git/groeck/linux-staging.git hwmon-fixes/hwmon
+Merge made by the 'ort' strategy.
+ drivers/hwmon/gigabyte_waterforce.c |  2 +-
+ drivers/hwmon/pmbus/mp2975.c        | 16 ++++++++++++++++
+ 2 files changed, 17 insertions(+), 1 deletion(-)
+Merging nvdimm-fixes/libnvdimm-fixes (33908660e814 ACPI: NFIT: Fix incorrect calculation of idt size)
+$ git merge -m Merge branch 'libnvdimm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm.git nvdimm-fixes/libnvdimm-fixes
+Already up to date.
+Merging cxl-fixes/fixes (6be99530c92c x86/numa: Fix the sort compare func used in numa_fill_memblks())
+$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl.git cxl-fixes/fixes
+Merge made by the 'ort' strategy.
+ arch/x86/mm/numa.c       | 21 ++++++++-------------
+ drivers/cxl/core/pci.c   | 43 +++++++++++++++++++++++++++++++------------
+ include/linux/memblock.h |  2 ++
+ mm/memblock.c            |  5 +++--
+ 4 files changed, 44 insertions(+), 27 deletions(-)
+Merging btrfs-fixes/next-fixes (c94bd41cb0b6 Merge branch 'misc-6.8' into next-fixes)
+$ git merge -m Merge branch 'next-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux.git btrfs-fixes/next-fixes
+Merge made by the 'ort' strategy.
+Merging vfs-fixes/fixes (485053bb81c8 fix ufs_get_locked_folio() breakage)
+$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git vfs-fixes/fixes
+Already up to date.
+Merging dma-mapping-fixes/for-linus (d5090484b021 swiotlb: do not try to allocate a TLB bigger than MAX_ORDER pages)
+$ git merge -m Merge branch 'for-linus' of git://git.infradead.org/users/hch/dma-mapping.git dma-mapping-fixes/for-linus
+Already up to date.
+Merging drivers-x86-fixes/fixes (1abdf288b0ef platform/x86: touchscreen_dmi: Add info for the TECLAST X16 Plus tablet)
+$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/pdx86/platform-drivers-x86.git drivers-x86-fixes/fixes
+Already up to date.
+Merging samsung-krzk-fixes/fixes (eab4f56d3e75 ARM: dts: exynos4212-tab3: add samsung,invert-vclk flag to fimd)
+$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux.git samsung-krzk-fixes/fixes
+Already up to date.
+Merging pinctrl-samsung-fixes/fixes (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/pinctrl/samsung.git pinctrl-samsung-fixes/fixes
+Already up to date.
+Merging devicetree-fixes/dt/linus (8f7e91790738 of: property: fix typo in io-channels)
+$ git merge -m Merge branch 'dt/linus' of git://git.kernel.org/pub/scm/linux/kernel/git/robh/linux.git devicetree-fixes/dt/linus
+Auto-merging Documentation/devicetree/bindings/gpio/xlnx,zynqmp-gpio-modepin.yaml
+Auto-merging Documentation/devicetree/bindings/usb/microchip,usb5744.yaml
+Merge made by the 'ort' strategy.
+ Documentation/devicetree/bindings/Makefile                  |  5 ++++-
+ Documentation/devicetree/bindings/ata/ceva,ahci-1v84.yaml   |  3 ++-
+ .../devicetree/bindings/display/bridge/nxp,tda998x.yaml     |  7 +++++--
+ .../devicetree/bindings/gpio/xlnx,zynqmp-gpio-modepin.yaml  |  3 ++-
+ .../devicetree/bindings/reset/xlnx,zynqmp-reset.yaml        |  3 ++-
+ Documentation/devicetree/bindings/tpm/tpm-common.yaml       |  2 +-
+ Documentation/devicetree/bindings/usb/dwc3-xilinx.yaml      |  3 ++-
+ .../devicetree/bindings/usb/microchip,usb5744.yaml          |  3 ++-
+ Documentation/devicetree/bindings/usb/xlnx,usb2.yaml        |  3 ++-
+ drivers/of/property.c                                       |  2 +-
+ tools/testing/selftests/dt/test_unprobed_devices.sh         | 13 +++++++------
+ 11 files changed, 30 insertions(+), 17 deletions(-)
+Merging dt-krzk-fixes/fixes (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-dt.git dt-krzk-fixes/fixes
+Already up to date.
+Merging scsi-fixes/fixes (f4469f385835 scsi: storvsc: Fix ring buffer size calculation)
+$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/mkp/scsi.git scsi-fixes/fixes
+Already up to date.
+Merging drm-fixes/drm-fixes (41bccc98fb79 Linux 6.8-rc2)
+$ git merge -m Merge branch 'drm-fixes' of git://git.freedesktop.org/git/drm/drm.git drm-fixes/drm-fixes
+Already up to date.
+Merging drm-intel-fixes/for-linux-next-fixes (41bccc98fb79 Linux 6.8-rc2)
+$ git merge -m Merge branch 'for-linux-next-fixes' of git://anongit.freedesktop.org/drm-intel drm-intel-fixes/for-linux-next-fixes
+Already up to date.
+Merging mmc-fixes/fixes (41bccc98fb79 Linux 6.8-rc2)
+$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/ulfh/mmc.git mmc-fixes/fixes
+Already up to date.
+Merging rtc-fixes/rtc-fixes (08279468a294 rtc: sunplus: fix format string for printing resource)
+$ git merge -m Merge branch 'rtc-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/abelloni/linux.git rtc-fixes/rtc-fixes
+Already up to date.
+Merging gnss-fixes/gnss-linus (41bccc98fb79 Linux 6.8-rc2)
+$ git merge -m Merge branch 'gnss-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/johan/gnss.git gnss-fixes/gnss-linus
+Already up to date.
+Merging hyperv-fixes/hyperv-fixes (564eac2860bd hv_utils: Allow implicit ICTIMESYNCFLAG_SYNC)
+$ git merge -m Merge branch 'hyperv-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux.git hyperv-fixes/hyperv-fixes
+Merge made by the 'ort' strategy.
+ drivers/hv/hv_util.c | 31 ++++++++++++++++++++++++++++++-
+ 1 file changed, 30 insertions(+), 1 deletion(-)
+Merging soc-fsl-fixes/fix (06c2afb862f9 Linux 6.5-rc1)
+$ git merge -m Merge branch 'fix' of git://git.kernel.org/pub/scm/linux/kernel/git/leo/linux.git soc-fsl-fixes/fix
+Already up to date.
+Merging risc-v-fixes/fixes (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux.git risc-v-fixes/fixes
+Already up to date.
+Merging riscv-dt-fixes/riscv-dt-fixes (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'riscv-dt-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git riscv-dt-fixes/riscv-dt-fixes
+Already up to date.
+Merging riscv-soc-fixes/riscv-soc-fixes (a9d022ae8c4f Merge branch 'riscv-soc-drivers-fixes' into riscv-soc-fixes)
+$ git merge -m Merge branch 'riscv-soc-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git riscv-soc-fixes/riscv-soc-fixes
+Merge made by the 'ort' strategy.
+ drivers/soc/microchip/Kconfig | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+Merging fpga-fixes/fixes (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/fpga/linux-fpga.git fpga-fixes/fixes
+Already up to date.
+Merging spdx/spdx-linus (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'spdx-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/spdx.git spdx/spdx-linus
+Already up to date.
+Merging gpio-brgl-fixes/gpio/for-current (41bccc98fb79 Linux 6.8-rc2)
+$ git merge -m Merge branch 'gpio/for-current' of git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git gpio-brgl-fixes/gpio/for-current
+Already up to date.
+Merging gpio-intel-fixes/fixes (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/andy/linux-gpio-intel.git gpio-intel-fixes/fixes
+Already up to date.
+Merging pinctrl-intel-fixes/fixes (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/pinctrl/intel.git pinctrl-intel-fixes/fixes
+Already up to date.
+Merging erofs-fixes/fixes (d9281660ff3f erofs: relaxed temporary buffers allocation on readahead)
+$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git erofs-fixes/fixes
+Already up to date.
+Merging kunit-fixes/kunit-fixes (1a9f2c776d14 Documentation: KUnit: Update the instructions on how to test static functions)
+$ git merge -m Merge branch 'kunit-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git kunit-fixes/kunit-fixes
+Already up to date.
+Merging ubifs-fixes/fixes (2241ab53cbb5 Linux 6.2-rc5)
+$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/rw/ubifs.git ubifs-fixes/fixes
+Already up to date.
+Merging memblock-fixes/fixes (6a9531c3a880 memblock: fix crash when reserved memory is not added to memory)
+$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/rppt/memblock.git memblock-fixes/fixes
+Already up to date.
+Merging nfsd-fixes/nfsd-fixes (ccbca118ef1a NFSv4.1: Assign the right value for initval and retries for rpc timeout)
+$ git merge -m Merge branch 'nfsd-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux nfsd-fixes/nfsd-fixes
+Merge made by the 'ort' strategy.
+ net/sunrpc/svc.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+Merging renesas-fixes/fixes (9eab43facdad soc: renesas: ARCH_R9A07G043 depends on !RISCV_ISA_ZICBOM)
+$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-devel.git renesas-fixes/fixes
+Already up to date.
+Merging perf-current/perf-tools (fdd0ae72b34e perf tools headers: update the asm-generic/unaligned.h copy with the kernel sources)
+$ git merge -m Merge branch 'perf-tools' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools perf-current/perf-tools
+Merge made by the 'ort' strategy.
+ tools/arch/x86/include/asm/cpufeatures.h           |   8 +-
+ tools/arch/x86/include/asm/msr-index.h             |   8 +
+ tools/arch/x86/include/uapi/asm/kvm.h              |   3 +
+ tools/arch/x86/lib/memcpy_64.S                     |   4 +-
+ tools/arch/x86/lib/memset_64.S                     |   4 +-
+ tools/include/asm-generic/unaligned.h              |  24 +-
+ tools/include/uapi/asm-generic/unistd.h            |  15 +-
+ tools/include/uapi/drm/drm.h                       |  72 +++++-
+ tools/include/uapi/drm/i915_drm.h                  |  12 +-
+ tools/include/uapi/linux/fcntl.h                   |   3 +
+ tools/include/uapi/linux/kvm.h                     | 140 ++++--------
+ tools/include/uapi/linux/mount.h                   |  70 ++++++
+ tools/include/uapi/linux/stat.h                    |   1 +
+ tools/perf/Documentation/perf-list.txt             |   4 +
+ tools/perf/Makefile.perf                           |  10 +
+ tools/perf/builtin-list.c                          | 211 ++++++++++-------
+ tools/perf/builtin-record.c                        |   4 +-
+ tools/perf/builtin-top.c                           |   2 +-
+ .../pmu-events/arch/x86/alderlake/adl-metrics.json | 254 ++++++++++-----------
+ .../arch/x86/alderlaken/adln-metrics.json          |   4 -
+ .../arch/x86/sapphirerapids/spr-metrics.json       |  25 +-
+ tools/perf/tests/shell/daemon.sh                   |  34 ++-
+ tools/perf/tests/shell/list.sh                     |  21 +-
+ tools/perf/tests/shell/script.sh                   |  12 +-
+ tools/perf/trace/beauty/statx.c                    |   1 +
+ tools/perf/util/evlist.c                           |   9 +-
+ tools/perf/util/hist.c                             |   4 +-
+ tools/perf/util/include/linux/linkage.h            |   4 +
+ tools/perf/util/metricgroup.c                      |   2 +-
+ tools/perf/util/print-events.c                     |   2 +-
+ tools/perf/util/synthetic-events.c                 |   4 +-
+ 31 files changed, 588 insertions(+), 383 deletions(-)
+Merging efi-fixes/urgent (aa0e784dea7c efi/libstub: Add one kernel-doc comment)
+$ git merge -m Merge branch 'urgent' of git://git.kernel.org/pub/scm/linux/kernel/git/efi/efi.git efi-fixes/urgent
+Merge made by the 'ort' strategy.
+ drivers/firmware/efi/libstub/Makefile      |  4 ++--
+ drivers/firmware/efi/libstub/alignedmem.c  |  1 +
+ drivers/firmware/efi/libstub/efistub.h     |  3 ++-
+ drivers/firmware/efi/libstub/kaslr.c       |  2 +-
+ drivers/firmware/efi/libstub/randomalloc.c | 12 +++++++-----
+ drivers/firmware/efi/libstub/x86-stub.c    | 25 +++++++++++++++----------
+ drivers/firmware/efi/libstub/x86-stub.h    |  4 ++--
+ drivers/firmware/efi/libstub/zboot.c       |  2 +-
+ 8 files changed, 31 insertions(+), 22 deletions(-)
+Merging zstd-fixes/zstd-linus (77618db34645 zstd: Fix array-index-out-of-bounds UBSAN warning)
+$ git merge -m Merge branch 'zstd-linus' of https://github.com/terrelln/linux.git zstd-fixes/zstd-linus
+Already up to date.
+Merging battery-fixes/fixes (d0266d7ab161 Revert "power: supply: qcom_battmgr: Register the power supplies after PDR is up")
+$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/sre/linux-power-supply.git battery-fixes/fixes
+Merge made by the 'ort' strategy.
+ drivers/power/supply/qcom_battmgr.c | 109 ++++++++++++++++--------------------
+ 1 file changed, 49 insertions(+), 60 deletions(-)
+Merging uml-fixes/fixes (73a23d771033 um: harddog: fix modular build)
+$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/uml/linux.git uml-fixes/fixes
+Already up to date.
+Merging iommufd-fixes/for-rc (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'for-rc' of git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd.git iommufd-fixes/for-rc
+Already up to date.
+Merging rust-fixes/rust-fixes (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'rust-fixes' of https://github.com/Rust-for-Linux/linux.git rust-fixes/rust-fixes
+Already up to date.
+Merging v9fs-fixes/fixes/next (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'fixes/next' of git://git.kernel.org/pub/scm/linux/kernel/git/ericvh/v9fs.git v9fs-fixes/fixes/next
+Already up to date.
+Merging w1-fixes/fixes (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-w1.git w1-fixes/fixes
+Already up to date.
+Merging pmdomain-fixes/fixes (c41336f4d690 pmdomain: mediatek: fix race conditions with genpd)
+$ git merge -m Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/ulfh/linux-pm.git pmdomain-fixes/fixes
+Merge made by the 'ort' strategy.
+ drivers/pmdomain/core.c                    |  2 +-
+ drivers/pmdomain/mediatek/mtk-pm-domains.c | 15 +++++++--------
+ drivers/pmdomain/renesas/r8a77980-sysc.c   |  3 ++-
+ 3 files changed, 10 insertions(+), 10 deletions(-)
+Merging overlayfs-fixes/ovl-fixes (420332b94119 ovl: mark xwhiteouts directory with overlay.opaque='x')
+$ git merge -m Merge branch 'ovl-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/overlayfs/vfs.git overlayfs-fixes/ovl-fixes
+Already up to date.
+Merging i2c-host-fixes/i2c/i2c-host-fixes (9189526c46f2 MAINTAINERS: Update i2c host drivers repository)
+$ git merge -m Merge branch 'i2c/i2c-host-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/andi.shyti/linux.git i2c-host-fixes/i2c/i2c-host-fixes
+Auto-merging MAINTAINERS
+Merge made by the 'ort' strategy.
+ MAINTAINERS | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+Merging drm-misc-fixes/for-linux-next-fixes (1c1914d6e8c6 dma-buf: heaps: Don't track CMA dma-buf pages under RssFile)
+$ git merge -m Merge branch 'for-linux-next-fixes' of git://anongit.freedesktop.org/drm/drm-misc drm-misc-fixes/for-linux-next-fixes
+Merge made by the 'ort' strategy.
+ drivers/dma-buf/heaps/cma_heap.c     | 7 +++----
+ drivers/gpu/drm/virtio/virtgpu_drv.c | 1 +
+ 2 files changed, 4 insertions(+), 4 deletions(-)
+Merging mm-stable/mm-stable (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'mm-stable' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-stable/mm-stable
+Already up to date.
+Merging mm-nonmm-stable/mm-nonmm-stable (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'mm-nonmm-stable' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-nonmm-stable/mm-nonmm-stable
+Already up to date.
+Merging mm/mm-everything (ec2dacea82ce Merge branch 'mm-nonmm-unstable' into mm-everything)
+$ git merge -m Merge branch 'mm-everything' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm/mm-everything
+Auto-merging Makefile
+Auto-merging arch/arm64/kernel/Makefile
+Auto-merging arch/x86/Makefile
+Auto-merging arch/x86/kernel/alternative.c
+Auto-merging drivers/firmware/efi/libstub/Makefile
+Auto-merging include/linux/sched.h
+Auto-merging net/bridge/br_multicast.c
+Merge made by the 'ort' strategy.
+ Documentation/ABI/testing/sysfs-bus-dax            |  153 ++
+ .../ABI/testing/sysfs-kernel-mm-mempolicy          |    4 +
+ .../sysfs-kernel-mm-mempolicy-weighted-interleave  |   25 +
+ Documentation/admin-guide/cgroup-v2.rst            |   18 +-
+ Documentation/admin-guide/kdump/vmcoreinfo.rst     |    8 +-
+ Documentation/admin-guide/mm/damon/usage.rst       |   42 +-
+ .../admin-guide/mm/numa_memory_policy.rst          |    9 +
+ Documentation/admin-guide/sysctl/kernel.rst        |   14 +-
+ Documentation/process/changes.rst                  |    2 +-
+ .../zh_CN/admin-guide/mm/damon/usage.rst           |   20 +-
+ .../zh_TW/admin-guide/mm/damon/usage.rst           |   20 +-
+ Makefile                                           |    8 -
+ arch/Kconfig                                       |    2 +-
+ arch/arm/Kconfig.debug                             |    2 +-
+ arch/arm/configs/aspeed_g4_defconfig               |    2 +-
+ arch/arm/configs/aspeed_g5_defconfig               |    2 +-
+ arch/arm/include/asm/current.h                     |    8 +-
+ arch/arm/include/asm/ptdump.h                      |    6 +-
+ arch/arm/kernel/setup.c                            |    4 +-
+ arch/arm/mm/init.c                                 |    2 +-
+ arch/arm64/Kconfig                                 |   12 +-
+ .../include/asm/{crash_core.h => crash_reserve.h}  |    4 +-
+ arch/arm64/include/asm/esr.h                       |    4 +
+ arch/arm64/include/asm/kexec.h                     |    2 +-
+ arch/arm64/include/asm/pgtable.h                   |    8 +
+ arch/arm64/include/asm/ptdump.h                    |    7 -
+ arch/arm64/include/asm/tlbflush.h                  |   16 +
+ arch/arm64/kernel/Makefile                         |    2 +-
+ arch/arm64/kernel/machine_kexec.c                  |    2 +-
+ arch/arm64/kernel/machine_kexec_file.c             |   10 +-
+ arch/arm64/kernel/{crash_core.c => vmcore_info.c}  |    3 +-
+ arch/arm64/mm/fault.c                              |   78 +-
+ arch/arm64/mm/init.c                               |    2 +-
+ arch/arm64/mm/mmu.c                                |   30 +-
+ arch/arm64/mm/ptdump.c                             |   11 +-
+ arch/loongarch/kernel/setup.c                      |    2 +-
+ arch/mips/kernel/setup.c                           |   17 +-
+ arch/powerpc/Kconfig                               |   11 +-
+ arch/powerpc/Makefile                              |    4 +-
+ arch/powerpc/kernel/setup-common.c                 |    2 +-
+ arch/powerpc/kvm/book3s_hv_nested.c                |    2 +-
+ arch/powerpc/mm/hugetlbpage.c                      |    2 +-
+ arch/powerpc/mm/mmu_decl.h                         |    6 -
+ arch/powerpc/mm/nohash/kaslr_booke.c               |    4 +-
+ arch/powerpc/mm/pgtable_32.c                       |    4 -
+ arch/powerpc/mm/pgtable_64.c                       |    3 -
+ arch/powerpc/mm/ptdump/ptdump.c                    |   21 +-
+ arch/powerpc/platforms/powernv/opal-core.c         |    2 +-
+ arch/riscv/Kconfig                                 |    6 +-
+ .../include/asm/{crash_core.h => crash_reserve.h}  |    4 +-
+ arch/riscv/include/asm/ftrace.h                    |   14 +-
+ arch/riscv/include/asm/ptdump.h                    |   22 -
+ arch/riscv/kernel/Makefile                         |    2 +-
+ arch/riscv/kernel/elf_kexec.c                      |    9 +-
+ arch/riscv/kernel/mcount.S                         |   10 +-
+ arch/riscv/kernel/{crash_core.c => vmcore_info.c}  |    3 +-
+ arch/riscv/mm/init.c                               |    5 +-
+ arch/riscv/mm/ptdump.c                             |   12 +-
+ arch/s390/Kconfig                                  |    1 +
+ arch/s390/include/asm/ftrace.h                     |    2 +-
+ arch/s390/include/asm/ptdump.h                     |   14 -
+ arch/s390/kernel/kexec_elf.c                       |    2 +
+ arch/s390/kernel/kexec_image.c                     |    2 +
+ arch/s390/kernel/machine_kexec_file.c              |   10 +
+ arch/s390/mm/dump_pagetables.c                     |   21 +-
+ arch/s390/mm/init.c                                |    5 -
+ arch/s390/mm/pgtable.c                             |    4 +-
+ arch/s390/mm/vmem.c                                |   62 +-
+ arch/sh/kernel/machine_kexec.c                     |    3 +
+ arch/sh/kernel/setup.c                             |    2 +-
+ arch/x86/Kconfig                                   |    2 +-
+ arch/x86/Makefile                                  |    6 -
+ .../include/asm/{crash_core.h => crash_reserve.h}  |    6 +-
+ arch/x86/include/asm/mmu.h                         |    2 +-
+ arch/x86/include/asm/pgtable.h                     |    5 +-
+ arch/x86/kernel/Makefile                           |    6 +-
+ arch/x86/kernel/alternative.c                      |    2 +-
+ arch/x86/kernel/cpu/mshyperv.c                     |    4 +
+ arch/x86/kernel/kexec-bzimage64.c                  |    4 +
+ arch/x86/kernel/kvm.c                              |    4 +-
+ arch/x86/kernel/machine_kexec_64.c                 |    3 +
+ arch/x86/kernel/reboot.c                           |    2 +-
+ arch/x86/kernel/setup.c                            |    2 +-
+ arch/x86/kernel/smp.c                              |    2 +-
+ .../kernel/{crash_core_32.c => vmcore_info_32.c}   |    2 +-
+ .../kernel/{crash_core_64.c => vmcore_info_64.c}   |    2 +-
+ arch/x86/mm/dump_pagetables.c                      |   24 +-
+ arch/x86/mm/init_32.c                              |    2 -
+ arch/x86/mm/init_64.c                              |    2 -
+ arch/x86/mm/tlb.c                                  |   37 +-
+ arch/x86/power/Makefile                            |    2 +-
+ arch/x86/xen/enlighten_hvm.c                       |    4 +
+ arch/x86/xen/mmu_pv.c                              |    2 +-
+ crypto/blake2b_generic.c                           |    2 +-
+ drivers/android/binder.c                           |    4 +-
+ drivers/base/cacheinfo.c                           |   50 +-
+ drivers/base/cpu.c                                 |    6 +-
+ drivers/base/memory.c                              |   23 +-
+ drivers/cpuidle/cpuidle.c                          |    2 +-
+ drivers/dax/bus.c                                  |  295 +++-
+ drivers/firmware/efi/libstub/Makefile              |    2 +-
+ drivers/firmware/qemu_fw_cfg.c                     |   14 +-
+ drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c           |    2 +-
+ drivers/md/bcache/sysfs.c                          |    8 +-
+ drivers/media/test-drivers/vicodec/codec-fwht.c    |    2 +-
+ drivers/regulator/Kconfig                          |    2 +-
+ drivers/s390/char/sclp_cmd.c                       |   44 +-
+ fs/Kconfig                                         |    1 +
+ fs/nilfs2/alloc.c                                  |   89 +-
+ fs/nilfs2/bmap.c                                   |    3 -
+ fs/nilfs2/cpfile.c                                 |  323 ++--
+ fs/nilfs2/cpfile.h                                 |   10 +-
+ fs/nilfs2/dat.c                                    |   38 +-
+ fs/nilfs2/ifile.c                                  |   21 +-
+ fs/nilfs2/ifile.h                                  |   10 +-
+ fs/nilfs2/inode.c                                  |   44 +-
+ fs/nilfs2/mdt.c                                    |    4 +-
+ fs/nilfs2/nilfs.h                                  |    3 +-
+ fs/nilfs2/page.c                                   |    8 +-
+ fs/nilfs2/segbuf.c                                 |    4 +-
+ fs/nilfs2/segment.c                                |  121 +-
+ fs/nilfs2/sufile.c                                 |   86 +-
+ fs/nilfs2/super.c                                  |   31 +-
+ fs/ocfs2/dlmglue.c                                 |    2 +-
+ fs/proc/Kconfig                                    |    2 +-
+ fs/proc/kcore.c                                    |    2 +-
+ fs/proc/task_mmu.c                                 |   17 +-
+ fs/userfaultfd.c                                   |    2 +-
+ include/asm-generic/vmlinux.lds.h                  |    2 +-
+ include/linux/buildid.h                            |    2 +-
+ include/linux/compiler-clang.h                     |   10 +-
+ include/linux/crash_core.h                         |  158 +-
+ include/linux/crash_reserve.h                      |   48 +
+ include/linux/flex_proportions.h                   |   32 -
+ include/linux/gfp.h                                |    2 +-
+ include/linux/highmem.h                            |   14 +
+ include/linux/hugetlb.h                            |    2 +-
+ include/linux/kexec.h                              |   47 +-
+ include/linux/list.h                               |   15 +
+ include/linux/list_lru.h                           |   18 -
+ include/linux/memory.h                             |    9 +
+ include/linux/memory_hotplug.h                     |   24 +-
+ include/linux/memremap.h                           |    1 +
+ include/linux/min_heap.h                           |   42 +-
+ include/linux/mm.h                                 |   12 +-
+ include/linux/mmu_context.h                        |    2 +-
+ include/linux/moduleloader.h                       |    8 +
+ include/linux/padata.h                             |    2 +
+ include/linux/poison.h                             |    3 +
+ include/linux/ptdump.h                             |   10 +
+ include/linux/sched.h                              |    1 +
+ include/linux/start_kernel.h                       |    2 -
+ include/linux/swap.h                               |    5 +-
+ include/linux/swapops.h                            |   13 +
+ include/linux/vmalloc.h                            |    1 -
+ include/linux/vmcore_info.h                        |   81 +
+ include/linux/win_minmax.h                         |    4 +-
+ include/linux/zswap.h                              |    7 +-
+ include/trace/events/oom.h                         |   19 +-
+ include/uapi/linux/mempolicy.h                     |    1 +
+ init/initramfs.c                                   |    2 +-
+ init/main.c                                        |   16 +-
+ ipc/ipc_sysctl.c                                   |   37 +-
+ ipc/mq_sysctl.c                                    |   36 +
+ kernel/Kconfig.kexec                               |   12 +-
+ kernel/Makefile                                    |    5 +-
+ kernel/bounds.c                                    |    2 +-
+ kernel/crash_core.c                                |  816 +++-------
+ kernel/crash_reserve.c                             |  464 ++++++
+ kernel/{crash_dump.c => elfcorehdr.c}              |    0
+ kernel/events/uprobes.c                            |    2 +-
+ kernel/kallsyms_selftest.c                         |    1 -
+ kernel/kexec.c                                     |   11 +-
+ kernel/kexec_core.c                                |  250 +--
+ kernel/kexec_file.c                                |   13 +-
+ kernel/kexec_internal.h                            |    2 +
+ kernel/kprobes.c                                   |    4 +-
+ kernel/ksysfs.c                                    |   10 +-
+ kernel/module/main.c                               |    5 +
+ kernel/padata.c                                    |   14 +-
+ kernel/panic.c                                     |    5 +
+ kernel/printk/printk.c                             |    4 +-
+ kernel/ptrace.c                                    |   13 +-
+ kernel/user_namespace.c                            |    2 +-
+ kernel/vmcore_info.c                               |  230 +++
+ lib/Kconfig.debug                                  |    4 +-
+ lib/Kconfig.kasan                                  |    2 +-
+ lib/buildid.c                                      |    2 +-
+ lib/dhry_1.c                                       |    2 +-
+ lib/dhry_run.c                                     |    1 -
+ lib/flex_proportions.c                             |   77 -
+ lib/iov_iter.c                                     |    5 +-
+ lib/maple_tree.c                                   |    6 +-
+ lib/raid6/Makefile                                 |    2 +-
+ lib/sort.c                                         |   20 +-
+ lib/stackdepot.c                                   |  254 +--
+ lib/stackinit_kunit.c                              |    2 +-
+ mm/cma.c                                           |    8 +-
+ mm/compaction.c                                    |  222 ++-
+ mm/damon/Kconfig                                   |    7 +-
+ mm/damon/dbgfs.c                                   |   27 +-
+ mm/filemap.c                                       |    4 +-
+ mm/huge_memory.c                                   |   23 +-
+ mm/hugetlb.c                                       |  234 ++-
+ mm/hugetlb_vmemmap.c                               |   55 +-
+ mm/internal.h                                      |   18 +-
+ mm/kasan/common.c                                  |    8 +-
+ mm/kasan/generic.c                                 |   68 +-
+ mm/kasan/kasan.h                                   |   10 -
+ mm/kasan/quarantine.c                              |    5 +-
+ mm/khugepaged.c                                    |   22 +-
+ mm/kmsan/hooks.c                                   |   50 +-
+ mm/list_lru.c                                      |   17 +-
+ mm/memcontrol.c                                    |  150 +-
+ mm/memory.c                                        |   76 +-
+ mm/memory_hotplug.c                                |   34 +-
+ mm/mempolicy.c                                     |  484 +++++-
+ mm/mm_init.c                                       |    1 +
+ mm/mmap.c                                          |   83 +-
+ mm/mprotect.c                                      |    4 +-
+ mm/nommu.c                                         |    2 -
+ mm/oom_kill.c                                      |    6 +-
+ mm/page_alloc.c                                    |   89 +-
+ mm/ptdump.c                                        |   22 +
+ mm/readahead.c                                     |    6 +-
+ mm/rmap.c                                          |   10 +-
+ mm/slab_common.c                                   |    2 +-
+ mm/sparse.c                                        |    3 +-
+ mm/swapfile.c                                      |   28 +-
+ mm/userfaultfd.c                                   |    2 +-
+ mm/vmalloc.c                                       | 1086 +++++++++----
+ mm/vmscan.c                                        |   51 +-
+ mm/zswap.c                                         | 1641 ++++++++++----------
+ net/bridge/br_multicast.c                          |    2 +-
+ scripts/min-tool-version.sh                        |    2 +-
+ scripts/recordmcount.pl                            |    2 +-
+ security/Kconfig                                   |    2 -
+ tools/mm/Makefile                                  |    9 +-
+ tools/mm/thpmaps                                   |  675 ++++++++
+ tools/objtool/noreturns.h                          |    1 -
+ tools/testing/selftests/damon/_chk_dependency.sh   |   11 +-
+ tools/testing/selftests/damon/_debugfs_common.sh   |    7 +
+ .../selftests/damon/debugfs_empty_targets.sh       |   12 +-
+ .../selftests/filesystems/eventfd/.gitignore       |    2 +
+ .../testing/selftests/filesystems/eventfd/Makefile |    7 +
+ .../selftests/filesystems/eventfd/eventfd_test.c   |  186 +++
+ tools/testing/selftests/memfd/memfd_test.c         |   10 -
+ tools/testing/selftests/mm/.gitignore              |    1 +
+ tools/testing/selftests/mm/Makefile                |    6 +
+ .../selftests/mm/charge_reserved_hugetlb.sh        |    4 +
+ tools/testing/selftests/mm/config                  |    3 +
+ tools/testing/selftests/mm/hugepage-shm.c          |   49 +-
+ tools/testing/selftests/mm/hugepage-vmemmap.c      |   39 +-
+ tools/testing/selftests/mm/hugetlb-madvise.c       |  207 +--
+ tools/testing/selftests/mm/hugetlb-read-hwpoison.c |  116 +-
+ tools/testing/selftests/mm/hugetlb_madv_vs_map.c   |  124 ++
+ .../selftests/mm/hugetlb_reparenting_test.sh       |    9 +-
+ tools/testing/selftests/mm/khugepaged.c            |  385 ++---
+ tools/testing/selftests/mm/ksm_functional_tests.c  |    4 +-
+ tools/testing/selftests/mm/on-fault-limit.c        |   38 +-
+ tools/testing/selftests/mm/protection_keys.c       |   34 +
+ tools/testing/selftests/mm/run_vmtests.sh          |   19 +-
+ 262 files changed, 7348 insertions(+), 4347 deletions(-)
+ create mode 100644 Documentation/ABI/testing/sysfs-bus-dax
+ create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-mempolicy
+ create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave
+ rename arch/arm64/include/asm/{crash_core.h => crash_reserve.h} (81%)
+ rename arch/arm64/kernel/{crash_core.c => vmcore_info.c} (92%)
+ rename arch/riscv/include/asm/{crash_core.h => crash_reserve.h} (78%)
+ delete mode 100644 arch/riscv/include/asm/ptdump.h
+ rename arch/riscv/kernel/{crash_core.c => vmcore_info.c} (88%)
+ delete mode 100644 arch/s390/include/asm/ptdump.h
+ rename arch/x86/include/asm/{crash_core.h => crash_reserve.h} (92%)
+ rename arch/x86/kernel/{crash_core_32.c => vmcore_info_32.c} (90%)
+ rename arch/x86/kernel/{crash_core_64.c => vmcore_info_64.c} (94%)
+ create mode 100644 include/linux/crash_reserve.h
+ create mode 100644 include/linux/vmcore_info.h
+ create mode 100644 kernel/crash_reserve.c
+ rename kernel/{crash_dump.c => elfcorehdr.c} (100%)
+ create mode 100644 kernel/vmcore_info.c
+ create mode 100644 tools/mm/thpmaps
+ create mode 100644 tools/testing/selftests/filesystems/eventfd/.gitignore
+ create mode 100644 tools/testing/selftests/filesystems/eventfd/Makefile
+ create mode 100644 tools/testing/selftests/filesystems/eventfd/eventfd_test.c
+ create mode 100644 tools/testing/selftests/mm/hugetlb_madv_vs_map.c
+Merging kbuild/for-next (bd768db42ef6 kbuild: deb-pkg: call more misc debhelper commands)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild.git kbuild/for-next
+Merge made by the 'ort' strategy.
+ Documentation/kbuild/kconfig.rst | 363 ++++++++++++++++++---------------------
+ scripts/kconfig/lexer.l          |  38 ++--
+ scripts/package/builddeb         |  48 ++----
+ scripts/package/debian/rules     |  63 ++++++-
+ 4 files changed, 249 insertions(+), 263 deletions(-)
+Merging clang-format/clang-format (5a205c6a9f79 clang-format: Update with v6.7-rc4's `for_each` macro list)
+$ git merge -m Merge branch 'clang-format' of https://github.com/ojeda/linux.git clang-format/clang-format
+Already up to date.
+Merging perf/perf-tools-next (7727d59de44e perf tools: Add -H short option for --hierarchy)
+  63f209b6fa4d ("perf evlist: Fix evlist__new_default() for > 1 core PMU")
+$ git merge -m Merge branch 'perf-tools-next' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git perf/perf-tools-next
+Auto-merging tools/perf/builtin-record.c
+Auto-merging tools/perf/builtin-top.c
+Merge made by the 'ort' strategy.
+ tools/perf/Documentation/perf-report.txt        |  29 +++-
+ tools/perf/Documentation/perf-top.txt           |  32 +++-
+ tools/perf/Makefile.config                      |   6 +
+ tools/perf/arch/arm/util/pmu.c                  |   3 +
+ tools/perf/arch/arm64/util/mem-events.c         |  39 +----
+ tools/perf/arch/arm64/util/mem-events.h         |   7 +
+ tools/perf/arch/powerpc/util/Build              |   1 +
+ tools/perf/arch/powerpc/util/mem-events.c       |  16 +-
+ tools/perf/arch/powerpc/util/mem-events.h       |   7 +
+ tools/perf/arch/powerpc/util/pmu.c              |  12 ++
+ tools/perf/arch/x86/util/mem-events.c           |  99 ++---------
+ tools/perf/arch/x86/util/mem-events.h           |  10 ++
+ tools/perf/arch/x86/util/pmu.c                  |  19 +-
+ tools/perf/builtin-c2c.c                        |  45 ++---
+ tools/perf/builtin-mem.c                        |  48 ++---
+ tools/perf/builtin-record.c                     |  14 +-
+ tools/perf/builtin-report.c                     |   2 +-
+ tools/perf/builtin-sched.c                      |  57 ++----
+ tools/perf/builtin-top.c                        |   2 +-
+ tools/perf/builtin-version.c                    |   1 +
+ tools/perf/tests/shell/stat_bpf_counters.sh     |  12 +-
+ tools/perf/tests/shell/test_arm_callgraph_fp.sh |   6 +
+ tools/perf/util/annotate-data.c                 | 119 +++++++++++--
+ tools/perf/util/annotate-data.h                 |   8 +-
+ tools/perf/util/annotate.c                      | 153 ++++++++++++++--
+ tools/perf/util/annotate.h                      |  12 +-
+ tools/perf/util/data.c                          |  10 +-
+ tools/perf/util/data.h                          |   6 +-
+ tools/perf/util/dwarf-aux.c                     | 187 +++++++++++++++++---
+ tools/perf/util/dwarf-aux.h                     |  18 ++
+ tools/perf/util/evsel.c                         | 146 ++++++++++++++++
+ tools/perf/util/evsel.h                         |   1 +
+ tools/perf/util/mem-events.c                    | 221 ++++++++++++++----------
+ tools/perf/util/mem-events.h                    |  19 +-
+ tools/perf/util/pmu.c                           |  16 +-
+ tools/perf/util/pmu.h                           |   7 +
+ tools/perf/util/pmus.c                          |   6 -
+ tools/perf/util/pmus.h                          |   1 -
+ 38 files changed, 985 insertions(+), 412 deletions(-)
+ create mode 100644 tools/perf/arch/arm64/util/mem-events.h
+ create mode 100644 tools/perf/arch/powerpc/util/mem-events.h
+ create mode 100644 tools/perf/arch/powerpc/util/pmu.c
+ create mode 100644 tools/perf/arch/x86/util/mem-events.h
+Merging compiler-attributes/compiler-attributes (2993eb7a8d34 Compiler Attributes: counted_by: fixup clang URL)
+$ git merge -m Merge branch 'compiler-attributes' of https://github.com/ojeda/linux.git compiler-attributes/compiler-attributes
+Merge made by the 'ort' strategy.
+ include/linux/compiler_attributes.h | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+Merging dma-mapping/for-next (7c65aa3cc072 dma-debug: fix kernel-doc warnings)
+$ git merge -m Merge branch 'for-next' of git://git.infradead.org/users/hch/dma-mapping.git dma-mapping/for-next
+Already up to date.
+Merging asm-generic/master (34b2321cc648 MAINTAINERS: Add Andreas Larsson as co-maintainer for arch/sparc)
+$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic.git asm-generic/master
+Auto-merging MAINTAINERS
+Merge made by the 'ort' strategy.
+Merging arc/for-next (0bb80ecc33a8 Linux 6.6-rc1)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/vgupta/arc.git arc/for-next
+Already up to date.
+Merging arm/for-next (8790fade1a19 Merge branches 'misc' and 'fixes' into for-next)
+$ git merge -m Merge branch 'for-next' of git://git.armlinux.org.uk/~rmk/linux-arm.git arm/for-next
+Already up to date.
+Merging arm64/for-next/core (1b20d0486a60 arm64: Fix silcon-errata.rst formatting)
+$ git merge -m Merge branch 'for-next/core' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux arm64/for-next/core
+Already up to date.
+Merging arm-perf/for-next/perf (bb339db4d363 arm: perf: Fix ARCH=arm build with GCC)
+$ git merge -m Merge branch 'for-next/perf' of git://git.kernel.org/pub/scm/linux/kernel/git/will/linux.git arm-perf/for-next/perf
+Already up to date.
+Merging arm-soc/for-next (0d1d824a4ac1 Merge tag 'samsung-fixes-6.8' of https://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux into arm/fixes)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/soc/soc.git arm-soc/for-next
+Already up to date.
+Merging amlogic/for-next (0dd3ee311255 Linux 6.7)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/amlogic/linux.git amlogic/for-next
+Already up to date.
+Merging asahi-soc/asahi-soc/for-next (ffc253263a13 Linux 6.6)
+$ git merge -m Merge branch 'asahi-soc/for-next' of https://github.com/AsahiLinux/linux.git asahi-soc/asahi-soc/for-next
+Already up to date.
+Merging aspeed/for-next (e60f7a99d378 ARM: dts: aspeed: minerva: add sgpio line name)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/joel/bmc.git aspeed/for-next
+Merge made by the 'ort' strategy.
+ .../devicetree/bindings/arm/aspeed/aspeed.yaml     |   4 +
+ arch/arm/boot/dts/aspeed/Makefile                  |   6 +-
+ .../dts/aspeed/aspeed-bmc-asrock-e3c256d4i.dts     | 322 ++++++++++++
+ .../dts/aspeed/aspeed-bmc-asrock-spc621d8hm3.dts   | 324 ++++++++++++
+ .../boot/dts/aspeed/aspeed-bmc-asrock-x570d4u.dts  | 377 +++++++++++++
+ .../boot/dts/aspeed/aspeed-bmc-facebook-harma.dts  | 585 +++++++++++++++++++++
+ .../dts/aspeed/aspeed-bmc-facebook-minerva-cmc.dts | 265 ----------
+ .../dts/aspeed/aspeed-bmc-facebook-minerva.dts     | 543 +++++++++++++++++++
+ 8 files changed, 2160 insertions(+), 266 deletions(-)
+ create mode 100644 arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-e3c256d4i.dts
+ create mode 100644 arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-spc621d8hm3.dts
+ create mode 100644 arch/arm/boot/dts/aspeed/aspeed-bmc-asrock-x570d4u.dts
+ create mode 100644 arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-harma.dts
+ delete mode 100644 arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva-cmc.dts
+ create mode 100644 arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-minerva.dts
+Merging at91/at91-next (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'at91-next' of git://git.kernel.org/pub/scm/linux/kernel/git/at91/linux.git at91/at91-next
+Already up to date.
+Merging broadcom/next (bbf6d7dc2d94 Merge branch 'soc/next' into next)
+$ git merge -m Merge branch 'next' of https://github.com/Broadcom/stblinux.git broadcom/next
+Merge made by the 'ort' strategy.
+ Documentation/devicetree/bindings/bus/brcm,gisb-arb.yaml  |  1 +
+ arch/arm/include/debug/brcmstb.S                          |  8 +++++---
+ .../boot/dts/broadcom/bcmbca/bcm4908-asus-gt-ac5300.dts   | 13 +++++++------
+ arch/arm64/boot/dts/broadcom/bcmbca/bcm4908.dtsi          |  3 ---
+ drivers/bus/brcmstb_gisb.c                                | 15 +++++++++++++++
+ 5 files changed, 28 insertions(+), 12 deletions(-)
+Merging davinci/davinci/for-next (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'davinci/for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git davinci/davinci/for-next
+Already up to date.
+Merging drivers-memory/for-next (2f542c937c48 dt-bindings: memory-controllers: narrow regex for unit address to hex numbers)
+$ git merge -m Merge branch 'for-next' of https://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-mem-ctrl.git drivers-memory/for-next
+Merge made by the 'ort' strategy.
+ .../memory-controllers/nvidia,tegra20-emc.yaml     |  2 +-
+ drivers/memory/emif.c                              | 65 ++++++++--------------
+ 2 files changed, 25 insertions(+), 42 deletions(-)
+Merging imx-mxs/for-next (4db02d61a81e Merge branch 'imx/dt64' into for-next)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shawnguo/linux.git imx-mxs/for-next
+Merge made by the 'ort' strategy.
+Merging mediatek/for-next (9802b60bd6d8 Merge branch 'v6.6-next/soc' into for-next)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/matthias.bgg/linux.git mediatek/for-next
+Merge made by the 'ort' strategy.
+Merging mvebu/for-next (476887312c60 Merge branch 'mvebu/drivers' into mvebu/for-next)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/gclement/mvebu.git mvebu/for-next
+Merge made by the 'ort' strategy.
+Merging omap/for-next (0012c1958460 Merge branches 'sgx-for-v6.9' and 'omap-for-v6.9/soc' into for-next)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/tmlind/linux-omap.git omap/for-next
+Auto-merging MAINTAINERS
+Merge made by the 'ort' strategy.
+ .../{img,powervr.yaml => img,powervr-rogue.yaml}   |   4 +-
+ .../devicetree/bindings/gpu/img,powervr-sgx.yaml   | 138 +++++++++++++++++++++
+ MAINTAINERS                                        |   3 +-
+ arch/arm/boot/dts/ti/omap/am33xx.dtsi              |   9 +-
+ arch/arm/boot/dts/ti/omap/am3517.dtsi              |  11 +-
+ arch/arm/boot/dts/ti/omap/am4372.dtsi              |   6 +
+ arch/arm/boot/dts/ti/omap/dra7.dtsi                |   9 +-
+ arch/arm/boot/dts/ti/omap/omap34xx.dtsi            |  11 +-
+ arch/arm/boot/dts/ti/omap/omap36xx.dtsi            |   9 +-
+ arch/arm/boot/dts/ti/omap/omap4.dtsi               |   9 +-
+ arch/arm/boot/dts/ti/omap/omap5.dtsi               |   9 +-
+ arch/arm/mach-omap2/am33xx-restart.c               |   2 +-
+ arch/arm/mach-omap2/clkt2xxx_virt_prcm_set.c       |   2 +-
+ arch/arm/mach-omap2/clockdomain.c                  |   4 +-
+ arch/arm/mach-omap2/cm33xx.c                       |   2 +-
+ arch/arm/mach-omap2/cminst44xx.c                   |   2 +-
+ arch/arm/mach-omap2/omap-secure.c                  |   4 +-
+ arch/arm/mach-omap2/omap_hwmod.c                   |   9 +-
+ arch/arm/mach-omap2/omap_hwmod_common_data.c       |   6 +-
+ arch/arm/mach-omap2/pmic-cpcap.c                   |  24 ++--
+ arch/arm/mach-omap2/powerdomain.c                  |   2 +-
+ arch/arm/mach-omap2/prm44xx.c                      |   2 +-
+ arch/arm/mach-omap2/prm_common.c                   |   4 +-
+ arch/arm/mach-omap2/wd_timer.c                     |   4 +-
+ arch/arm64/boot/dts/ti/k3-am65-main.dtsi           |   7 ++
+ 25 files changed, 231 insertions(+), 61 deletions(-)
+ rename Documentation/devicetree/bindings/gpu/{img,powervr.yaml => img,powervr-rogue.yaml} (91%)
+ create mode 100644 Documentation/devicetree/bindings/gpu/img,powervr-sgx.yaml
+Merging qcom/for-next (f70a1e7dd74f Merge branches 'arm32-for-6.9', 'arm64-fixes-for-6.8', 'arm64-for-6.9', 'clk-for-6.9' and 'drivers-for-6.9' into for-next)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/qcom/linux.git qcom/for-next
+Merge made by the 'ort' strategy.
+ Documentation/devicetree/bindings/arm/qcom.yaml    |    5 +
+ .../bindings/clock/qcom,gcc-sc8180x.yaml           |    7 +
+ .../bindings/soc/qcom/qcom,rpm-master-stats.yaml   |    2 +
+ .../dts/qcom/qcom-apq8026-samsung-matisse-wifi.dts |    7 +
+ arch/arm/boot/dts/qcom/qcom-apq8064.dtsi           |   38 +-
+ arch/arm/boot/dts/qcom/qcom-ipq4019-ap.dk01.1.dtsi |  146 ++-
+ arch/arm/boot/dts/qcom/qcom-ipq4019.dtsi           |   10 +-
+ arch/arm/boot/dts/qcom/qcom-ipq8064.dtsi           |    2 -
+ arch/arm/boot/dts/qcom/qcom-msm8660.dtsi           |   17 +-
+ arch/arm/boot/dts/qcom/qcom-msm8926-htc-memul.dts  |   15 +-
+ arch/arm/boot/dts/qcom/qcom-msm8974.dtsi           |   18 +-
+ arch/arm/boot/dts/qcom/qcom-sdx55.dtsi             |   32 +-
+ arch/arm/boot/dts/qcom/qcom-sdx65.dtsi             |   48 +-
+ arch/arm64/boot/dts/qcom/Makefile                  |    1 +
+ .../dts/qcom/apq8016-sbc-d3-camera-mezzanine.dts   |    8 +-
+ arch/arm64/boot/dts/qcom/ipq5332.dtsi              |    8 +-
+ arch/arm64/boot/dts/qcom/ipq6018.dtsi              |   13 +
+ arch/arm64/boot/dts/qcom/ipq8074.dtsi              |   14 +
+ arch/arm64/boot/dts/qcom/msm8916.dtsi              |    9 +
+ arch/arm64/boot/dts/qcom/msm8939.dtsi              |    9 +
+ arch/arm64/boot/dts/qcom/msm8953.dtsi              |    7 +-
+ arch/arm64/boot/dts/qcom/msm8996.dtsi              |    8 +-
+ arch/arm64/boot/dts/qcom/msm8998.dtsi              |    7 +-
+ .../boot/dts/qcom/{pm2250.dtsi => pm4125.dtsi}     |    8 +-
+ arch/arm64/boot/dts/qcom/qcm6490-fairphone-fp5.dts |   56 +-
+ arch/arm64/boot/dts/qcom/qcs404.dtsi               |   16 +
+ arch/arm64/boot/dts/qcom/qrb2210-rb1.dts           |   78 +-
+ arch/arm64/boot/dts/qcom/sa8775p.dtsi              |   16 +-
+ arch/arm64/boot/dts/qcom/sc7180.dtsi               |   14 +-
+ arch/arm64/boot/dts/qcom/sc7280-chrome-common.dtsi |   11 +
+ arch/arm64/boot/dts/qcom/sc7280.dtsi               |   54 +-
+ arch/arm64/boot/dts/qcom/sc8180x.dtsi              |   63 +-
+ arch/arm64/boot/dts/qcom/sdm630.dtsi               |   26 +-
+ arch/arm64/boot/dts/qcom/sdm670.dtsi               |   14 +-
+ .../arm64/boot/dts/qcom/sdm845-oneplus-common.dtsi |    8 +-
+ arch/arm64/boot/dts/qcom/sdm845.dtsi               |   47 +-
+ arch/arm64/boot/dts/qcom/sm6115.dtsi               |   22 +-
+ arch/arm64/boot/dts/qcom/sm6125.dtsi               |    9 +-
+ arch/arm64/boot/dts/qcom/sm6350.dtsi               |   13 +-
+ arch/arm64/boot/dts/qcom/sm6375.dtsi               |   12 +-
+ arch/arm64/boot/dts/qcom/sm7225-fairphone-fp4.dts  |    8 +-
+ arch/arm64/boot/dts/qcom/sm8150.dtsi               |   95 +-
+ arch/arm64/boot/dts/qcom/sm8250.dtsi               |   93 +-
+ arch/arm64/boot/dts/qcom/sm8350.dtsi               |   75 +-
+ arch/arm64/boot/dts/qcom/sm8450-hdk.dts            |    4 +-
+ arch/arm64/boot/dts/qcom/sm8450.dtsi               |   79 +-
+ arch/arm64/boot/dts/qcom/sm8550-hdk.dts            | 1293 ++++++++++++++++++++
+ arch/arm64/boot/dts/qcom/sm8550.dtsi               |  107 +-
+ arch/arm64/boot/dts/qcom/sm8650-mtp.dts            |    2 +-
+ arch/arm64/boot/dts/qcom/sm8650-qrd.dts            |    2 +-
+ arch/arm64/boot/dts/qcom/sm8650.dtsi               |   36 +-
+ arch/arm64/boot/dts/qcom/x1e80100.dtsi             |   10 +-
+ drivers/clk/qcom/gcc-ipq6018.c                     |   17 +
+ drivers/clk/qcom/gcc-sdm845.c                      |    1 +
+ drivers/clk/qcom/gcc-sm8150.c                      |  352 +++---
+ drivers/soc/qcom/Makefile                          |    1 +
+ drivers/soc/qcom/qcom_aoss.c                       |  103 +-
+ drivers/soc/qcom/smem.c                            |   11 -
+ drivers/soc/qcom/smp2p.c                           |    6 +-
+ drivers/soc/qcom/socinfo.c                         |    4 +-
+ drivers/soc/qcom/trace-aoss.h                      |   48 +
+ include/dt-bindings/arm/qcom,ids.h                 |    2 +
+ include/dt-bindings/clock/qcom,gcc-sm8150.h        |    3 +
+ include/soc/qcom/qcom-spmi-pmic.h                  |    2 +-
+ 64 files changed, 2714 insertions(+), 538 deletions(-)
+ rename arch/arm64/boot/dts/qcom/{pm2250.dtsi => pm4125.dtsi} (91%)
+ create mode 100644 arch/arm64/boot/dts/qcom/sm8550-hdk.dts
+ create mode 100644 drivers/soc/qcom/trace-aoss.h
+Merging renesas/next (6fc5bb9da080 Merge branches 'renesas-arm-defconfig-for-v6.9', 'renesas-drivers-for-v6.9', 'renesas-dt-bindings-for-v6.9' and 'renesas-dts-for-v6.9' into renesas-next)
+$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-devel.git renesas/next
+Merge made by the 'ort' strategy.
+ .../bindings/clock/renesas,cpg-mssr.yaml           |   1 +
+ .../bindings/power/renesas,rcar-sysc.yaml          |   1 +
+ .../devicetree/bindings/reset/renesas,rst.yaml     |   1 +
+ .../devicetree/bindings/soc/renesas/renesas.yaml   |  13 +
+ arch/arm/boot/dts/renesas/r8a73a4-ape6evm.dts      |  12 +
+ arch/arm/boot/dts/renesas/r8a73a4.dtsi             |  23 +-
+ arch/arm/configs/multi_v7_defconfig                |   1 -
+ arch/arm/configs/shmobile_defconfig                |   2 -
+ arch/arm64/boot/dts/renesas/Makefile               |   5 +
+ .../boot/dts/renesas/r8a779g0-white-hawk-cpu.dts   |  13 +
+ .../boot/dts/renesas/r8a779g0-white-hawk-cpu.dtsi  | 368 +-------------------
+ .../arm64/boot/dts/renesas/r8a779g0-white-hawk.dts |  58 +---
+ arch/arm64/boot/dts/renesas/r8a779g0.dtsi          |  84 ++---
+ .../dts/renesas/r8a779g2-white-hawk-single.dts     |  26 ++
+ arch/arm64/boot/dts/renesas/r8a779g2.dtsi          |  12 +
+ .../boot/dts/renesas/r8a779h0-gray-hawk-single.dts |  52 +++
+ arch/arm64/boot/dts/renesas/r8a779h0.dtsi          | 121 +++++++
+ arch/arm64/boot/dts/renesas/r9a07g043u.dtsi        |  69 ++++
+ arch/arm64/boot/dts/renesas/r9a08g045.dtsi         |  14 +
+ arch/arm64/boot/dts/renesas/rzg3s-smarc-som.dtsi   |   5 +
+ arch/arm64/boot/dts/renesas/rzg3s-smarc.dtsi       |  53 +++
+ arch/arm64/boot/dts/renesas/ulcb-kf.dtsi           |  42 +--
+ arch/arm64/boot/dts/renesas/white-hawk-common.dtsi |  65 ++++
+ .../boot/dts/renesas/white-hawk-cpu-common.dtsi    | 375 +++++++++++++++++++++
+ ...e-hawk-csi-dsi.dtsi => white-hawk-csi-dsi.dtsi} |   2 +-
+ ...hawk-ethernet.dtsi => white-hawk-ethernet.dtsi} |   2 +-
+ arch/arm64/configs/defconfig                       |   1 +
+ drivers/soc/renesas/Kconfig                        |  17 +-
+ drivers/soc/renesas/rcar-rst.c                     |   1 +
+ drivers/soc/renesas/renesas-soc.c                  |   8 +
+ .../dt-bindings/clock/renesas,r8a779h0-cpg-mssr.h  |  96 ++++++
+ include/dt-bindings/power/renesas,r8a779h0-sysc.h  |  49 +++
+ 32 files changed, 1088 insertions(+), 504 deletions(-)
+ create mode 100644 arch/arm64/boot/dts/renesas/r8a779g0-white-hawk-cpu.dts
+ create mode 100644 arch/arm64/boot/dts/renesas/r8a779g2-white-hawk-single.dts
+ create mode 100644 arch/arm64/boot/dts/renesas/r8a779g2.dtsi
+ create mode 100644 arch/arm64/boot/dts/renesas/r8a779h0-gray-hawk-single.dts
+ create mode 100644 arch/arm64/boot/dts/renesas/r8a779h0.dtsi
+ create mode 100644 arch/arm64/boot/dts/renesas/white-hawk-common.dtsi
+ create mode 100644 arch/arm64/boot/dts/renesas/white-hawk-cpu-common.dtsi
+ rename arch/arm64/boot/dts/renesas/{r8a779g0-white-hawk-csi-dsi.dtsi => white-hawk-csi-dsi.dtsi} (97%)
+ rename arch/arm64/boot/dts/renesas/{r8a779g0-white-hawk-ethernet.dtsi => white-hawk-ethernet.dtsi} (76%)
+ create mode 100644 include/dt-bindings/clock/renesas,r8a779h0-cpg-mssr.h
+ create mode 100644 include/dt-bindings/power/renesas,r8a779h0-sysc.h
+Merging reset/reset/next (c3c46acd5be9 dt-bindings: reset: hisilicon,hi3660-reset: Drop providers and consumers from example)
+$ git merge -m Merge branch 'reset/next' of https://git.pengutronix.de/git/pza/linux reset/reset/next
+Already up to date.
+Merging rockchip/for-next (a3c323226362 Merge branch 'v6.9-clk/next' into for-next)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/mmind/linux-rockchip.git rockchip/for-next
+Merge made by the 'ort' strategy.
+ .../devicetree/bindings/arm/rockchip.yaml          |  38 +-
+ .../devicetree/bindings/soc/rockchip/grf.yaml      |   1 +
+ arch/arm/boot/dts/rockchip/rk3128-xpi-3128.dts     |  29 +
+ arch/arm/boot/dts/rockchip/rk3128.dtsi             |  60 ++
+ arch/arm64/boot/dts/rockchip/Makefile              |   5 +
+ arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi    |   6 +
+ .../boot/dts/rockchip/rk3399-kobol-helios64.dts    |   3 -
+ arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4a.dts |   2 +-
+ arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4b.dts |   2 +-
+ arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4c.dts |   2 +-
+ arch/arm64/boot/dts/rockchip/rk3399.dtsi           |  70 +-
+ .../boot/dts/rockchip/rk3566-anbernic-rg-arc-d.dts |  42 ++
+ .../boot/dts/rockchip/rk3566-anbernic-rg-arc-s.dts |  19 +
+ .../boot/dts/rockchip/rk3566-anbernic-rg-arc.dtsi  | 237 +++++++
+ .../boot/dts/rockchip/rk3566-anbernic-rg353x.dtsi  |  74 ++
+ .../boot/dts/rockchip/rk3566-anbernic-rg503.dts    |  74 ++
+ .../boot/dts/rockchip/rk3566-anbernic-rgxx3.dtsi   |  74 --
+ .../dts/rockchip/rk3588-edgeble-neu6a-common.dtsi  | 466 +++++++++++++
+ .../boot/dts/rockchip/rk3588-edgeble-neu6a-io.dts  |  10 +-
+ .../boot/dts/rockchip/rk3588-edgeble-neu6a-io.dtsi | 232 +++++++
+ .../dts/rockchip/rk3588-edgeble-neu6a-wifi.dtso    |  56 ++
+ .../boot/dts/rockchip/rk3588-edgeble-neu6a.dtsi    |  25 +-
+ .../boot/dts/rockchip/rk3588-edgeble-neu6b-io.dts  |  76 +-
+ .../boot/dts/rockchip/rk3588-edgeble-neu6b.dtsi    | 383 +----------
+ arch/arm64/boot/dts/rockchip/rk3588-evb1-v10.dts   |   1 +
+ arch/arm64/boot/dts/rockchip/rk3588-nanopc-t6.dts  |  31 +-
+ arch/arm64/boot/dts/rockchip/rk3588-rock-5b.dts    |   7 +
+ .../arm64/boot/dts/rockchip/rk3588s-nanopi-r6c.dts |  14 +
+ .../arm64/boot/dts/rockchip/rk3588s-nanopi-r6s.dts | 764 +++++++++++++++++++++
+ drivers/clk/rockchip/clk-rk3568.c                  |   1 +
+ 30 files changed, 2211 insertions(+), 593 deletions(-)
+ create mode 100644 arch/arm64/boot/dts/rockchip/rk3566-anbernic-rg-arc-d.dts
+ create mode 100644 arch/arm64/boot/dts/rockchip/rk3566-anbernic-rg-arc-s.dts
+ create mode 100644 arch/arm64/boot/dts/rockchip/rk3566-anbernic-rg-arc.dtsi
+ create mode 100644 arch/arm64/boot/dts/rockchip/rk3588-edgeble-neu6a-common.dtsi
+ create mode 100644 arch/arm64/boot/dts/rockchip/rk3588-edgeble-neu6a-io.dtsi
+ create mode 100644 arch/arm64/boot/dts/rockchip/rk3588-edgeble-neu6a-wifi.dtso
+ create mode 100644 arch/arm64/boot/dts/rockchip/rk3588s-nanopi-r6c.dts
+ create mode 100644 arch/arm64/boot/dts/rockchip/rk3588s-nanopi-r6s.dts
+Merging samsung-krzk/for-next (819ce8ab3d99 Merge branch 'next/drivers' into for-next)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux.git samsung-krzk/for-next
+Auto-merging arch/arm/configs/multi_v7_defconfig
+Merge made by the 'ort' strategy.
+ .../bindings/clock/google,gs101-clock.yaml         |  29 +-
+ .../devicetree/bindings/clock/tesla,fsd-clock.yaml |   2 +-
+ .../devicetree/bindings/i2c/i2c-exynos5.yaml       |   1 +
+ .../soc/samsung/samsung,exynos-sysreg.yaml         |   1 +
+ arch/arm/boot/dts/samsung/exynos4412-p4note.dtsi   |  51 ++
+ arch/arm/boot/dts/samsung/exynos5420-peach-pit.dts |   1 +
+ .../dts/samsung/exynos5422-odroidxu3-common.dtsi   |  16 +-
+ arch/arm/boot/dts/samsung/exynos5800-peach-pi.dts  |   1 +
+ arch/arm/configs/exynos_defconfig                  |   3 +
+ arch/arm/configs/multi_v7_defconfig                |   3 +
+ arch/arm/mach-s5pv210/pm.c                         |   2 +-
+ arch/arm64/boot/dts/exynos/google/gs101-oriole.dts |  14 +
+ arch/arm64/boot/dts/exynos/google/gs101.dtsi       |  75 ++-
+ drivers/clk/samsung/clk-exynos850.c                |  10 +-
+ drivers/clk/samsung/clk-gs101.c                    | 595 ++++++++++++++++++++-
+ include/dt-bindings/clock/exynos850.h              |   2 +
+ include/dt-bindings/clock/google,gs101.h           |  81 +++
+ 17 files changed, 855 insertions(+), 32 deletions(-)
+Merging scmi/for-linux-next (99f798bdfb75 Merge tags 'scmi-fixes-6.8' and 'ffa-fixes-6.8' of git://git.kernel.org/pub/scm/linux/kernel/git/sudeep.holla/linux into for-linux-next)
+$ git merge -m Merge branch 'for-linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/sudeep.holla/linux.git scmi/for-linux-next
+Merge made by the 'ort' strategy.
+Merging sophgo/for-next (41bccc98fb79 Linux 6.8-rc2)
+$ git merge -m Merge branch 'for-next' of https://github.com/sophgo/linux.git sophgo/for-next
+Already up to date.
+Merging stm32/stm32-next (bda732fda193 ARM: dts: stm32: fix DSI peripheral clock on stm32mp15 boards)
+$ git merge -m Merge branch 'stm32-next' of git://git.kernel.org/pub/scm/linux/kernel/git/atorgue/stm32.git stm32/stm32-next
+Merge made by the 'ort' strategy.
+ arch/arm/boot/dts/st/stm32mp157.dtsi              | 2 +-
+ arch/arm/boot/dts/st/stm32mp157a-dk1-scmi.dts     | 2 +-
+ arch/arm/boot/dts/st/stm32mp157c-dk2-scmi.dts     | 2 +-
+ arch/arm/boot/dts/st/stm32mp157c-ed1-scmi.dts     | 2 +-
+ arch/arm/boot/dts/st/stm32mp157c-ev1-scmi.dts     | 2 +-
+ arch/arm/boot/dts/st/stm32mp157c-lxa-tac-gen2.dts | 2 +-
+ arch/arm/boot/dts/st/stm32mp15xc-lxa-tac.dtsi     | 2 +-
+ 7 files changed, 7 insertions(+), 7 deletions(-)
+Merging sunxi/sunxi/for-next (38ed19495066 Merge branch 'sunxi/dt-for-6.9' into sunxi/for-next)
+$ git merge -m Merge branch 'sunxi/for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/sunxi/linux.git sunxi/sunxi/for-next
+Merge made by the 'ort' strategy.
+ .../sram/allwinner,sun4i-a10-system-control.yaml   |  2 +-
+ .../boot/dts/allwinner/sun50i-h6-beelink-gs1.dts   |  2 +
+ arch/arm64/boot/dts/allwinner/sun50i-h6-tanix.dtsi |  2 +
+ arch/arm64/boot/dts/allwinner/sun50i-h6.dtsi       |  7 ++-
+ arch/arm64/boot/dts/allwinner/sun50i-h616.dtsi     | 61 ++++++++++++++++++++++
+ drivers/clk/sunxi/clk-a20-gmac.c                   | 21 ++++----
+ drivers/clk/sunxi/clk-sun9i-cpus.c                 |  7 +--
+ drivers/clk/sunxi/clk-usb.c                        |  9 ++--
+ 8 files changed, 90 insertions(+), 21 deletions(-)
+Merging tee/next (84ec4fd88831 Merge branch 'tee_iov_iter_for_v6.8' into next)
+$ git merge -m Merge branch 'next' of https://git.linaro.org/people/jens.wiklander/linux-tee.git tee/next
+Merge made by the 'ort' strategy.
+Merging tegra/for-next (5e6333ef8ea5 Merge branch for-6.8/arm/dt into for-next)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/tegra/linux.git tegra/for-next
+Auto-merging drivers/soc/tegra/fuse/fuse-tegra30.c
+Auto-merging include/linux/string.h
+Auto-merging mm/util.c
+Merge made by the 'ort' strategy.
+ arch/arm/boot/dts/nvidia/tegra30-ouya.dts |   4 +-
+ drivers/soc/tegra/Kconfig                 |   5 ++
+ drivers/soc/tegra/fuse/fuse-tegra.c       | 115 ++++++++++++++++++++++--------
+ drivers/soc/tegra/fuse/fuse-tegra30.c     |  20 ++++++
+ drivers/soc/tegra/fuse/fuse.h             |   8 ++-
+ drivers/soc/tegra/fuse/tegra-apbmisc.c    | 110 +++++++++++++++++++++++-----
+ drivers/soc/tegra/pmc.c                   |  24 -------
+ include/linux/string.h                    |   1 +
+ include/soc/tegra/fuse.h                  |   1 +
+ include/soc/tegra/pmc.h                   |  18 -----
+ mm/util.c                                 |  17 +++++
+ 11 files changed, 233 insertions(+), 90 deletions(-)
+Merging ti/ti-next (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'ti-next' of git://git.kernel.org/pub/scm/linux/kernel/git/ti/linux.git ti/ti-next
+Already up to date.
+Merging xilinx/for-next (0ee74e0d7b97 Merge remote-tracking branch 'git/zynqmp/dt' into for-next)
+$ git merge -m Merge branch 'for-next' of git://github.com/Xilinx/linux-xlnx.git xilinx/for-next
+Auto-merging MAINTAINERS
+Merge made by the 'ort' strategy.
+ .../firmware/xilinx/xlnx,zynqmp-firmware.yaml      | 78 +++++++++++++++++---
+ .../devicetree/bindings/fpga/xlnx,versal-fpga.yaml |  2 +-
+ .../devicetree/bindings/soc/xilinx/xilinx.yaml     | 70 +++++++++++++++---
+ MAINTAINERS                                        |  2 +-
+ arch/arm/mach-zynq/slcr.c                          |  5 +-
+ arch/arm64/boot/dts/xilinx/zynqmp-clk-ccf.dtsi     | 16 +++-
+ .../boot/dts/xilinx/zynqmp-sck-kv-g-revA.dtso      | 36 ++++++++-
+ .../boot/dts/xilinx/zynqmp-sck-kv-g-revB.dtso      | 37 +++++++++-
+ .../boot/dts/xilinx/zynqmp-zc1751-xm015-dc1.dts    |  2 +-
+ .../boot/dts/xilinx/zynqmp-zc1751-xm016-dc2.dts    |  2 +-
+ .../boot/dts/xilinx/zynqmp-zc1751-xm019-dc5.dts    |  4 +-
+ arch/arm64/boot/dts/xilinx/zynqmp-zcu100-revC.dts  |  2 +-
+ arch/arm64/boot/dts/xilinx/zynqmp-zcu102-revA.dts  |  6 +-
+ arch/arm64/boot/dts/xilinx/zynqmp-zcu104-revA.dts  |  2 +-
+ arch/arm64/boot/dts/xilinx/zynqmp-zcu104-revC.dts  |  2 +-
+ arch/arm64/boot/dts/xilinx/zynqmp-zcu106-revA.dts  |  6 +-
+ arch/arm64/boot/dts/xilinx/zynqmp-zcu111-revA.dts  |  4 +-
+ arch/arm64/boot/dts/xilinx/zynqmp-zcu1275-revA.dts |  2 +-
+ arch/arm64/boot/dts/xilinx/zynqmp.dtsi             | 85 +++++++++++++---------
+ 19 files changed, 280 insertions(+), 83 deletions(-)
+Merging clk/clk-next (efe5a1b888ab Merge branch 'clk-fixes' into clk-next)
+$ git merge -m Merge branch 'clk-next' of git://git.kernel.org/pub/scm/linux/kernel/git/clk/linux.git clk/clk-next
+Merge made by the 'ort' strategy.
+Merging clk-imx/for-next (f52f00069888 clk: imx: pll14xx: change naming of fvco to fout)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/abelvesa/linux.git clk-imx/for-next
+Already up to date.
+Merging clk-renesas/renesas-clk (096311157d2a clk: renesas: r8a779g0: Fix PCIe clock name)
+$ git merge -m Merge branch 'renesas-clk' of git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-drivers.git clk-renesas/renesas-clk
+Merge made by the 'ort' strategy.
+ drivers/clk/renesas/Kconfig             |   5 +
+ drivers/clk/renesas/Makefile            |   1 +
+ drivers/clk/renesas/clk-mstp.c          |  16 +--
+ drivers/clk/renesas/r8a779g0-cpg-mssr.c |   2 +-
+ drivers/clk/renesas/r8a779h0-cpg-mssr.c | 241 ++++++++++++++++++++++++++++++++
+ drivers/clk/renesas/r9a07g043-cpg.c     |  31 ++++
+ drivers/clk/renesas/r9a08g045-cpg.c     |   3 +
+ drivers/clk/renesas/rcar-gen4-cpg.c     |  10 +-
+ drivers/clk/renesas/renesas-cpg-mssr.c  | 117 +++++++++++++++-
+ drivers/clk/renesas/renesas-cpg-mssr.h  |   1 +
+ drivers/of/base.c                       | 123 +++++++++++-----
+ include/linux/of.h                      |  11 ++
+ 12 files changed, 502 insertions(+), 59 deletions(-)
+ create mode 100644 drivers/clk/renesas/r8a779h0-cpg-mssr.c
+Merging csky/linux-next (2c40c1c6adab Merge tag 'usb-6.7-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb)
+$ git merge -m Merge branch 'linux-next' of git://github.com/c-sky/csky-linux.git csky/linux-next
+Already up to date.
+Merging loongarch/loongarch-next (48ef9e87b407 LoongArch: KVM: Add returns to SIMD stubs)
+$ git merge -m Merge branch 'loongarch-next' of git://git.kernel.org/pub/scm/linux/kernel/git/chenhuacai/linux-loongson.git loongarch/loongarch-next
+Already up to date.
+Merging m68k/for-next (6b9c045b0602 m68k: defconfig: Update defconfigs for v6.7-rc1)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/geert/linux-m68k.git m68k/for-next
+Already up to date.
+Merging m68knommu/for-next (41bccc98fb79 Linux 6.8-rc2)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/gerg/m68knommu.git m68knommu/for-next
+Already up to date.
+Merging microblaze/next (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'next' of git://git.monstr.eu/linux-2.6-microblaze.git microblaze/next
+Already up to date.
+Merging mips/mips-next (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'mips-next' of git://git.kernel.org/pub/scm/linux/kernel/git/mips/linux.git mips/mips-next
+Already up to date.
+Merging openrisc/for-next (c289330331eb openrisc: Remove kernel-doc marker from ioremap comment)
+$ git merge -m Merge branch 'for-next' of git://github.com/openrisc/linux.git openrisc/for-next
+Already up to date.
+Merging parisc-hd/for-next (913b9d443a01 parisc: BTLB: Fix crash when setting up BTLB at CPU bringup)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/deller/parisc-linux.git parisc-hd/for-next
+Merge made by the 'ort' strategy.
+ arch/parisc/Kconfig                     |  1 -
+ arch/parisc/include/asm/assembly.h      |  1 +
+ arch/parisc/include/asm/extable.h       | 64 +++++++++++++++++++++++++++++++++
+ arch/parisc/include/asm/special_insns.h |  6 ++--
+ arch/parisc/include/asm/uaccess.h       | 48 ++++---------------------
+ arch/parisc/kernel/cache.c              | 10 ++++--
+ arch/parisc/kernel/drivers.c            |  5 ++-
+ arch/parisc/kernel/unaligned.c          | 44 +++++++++++------------
+ arch/parisc/kernel/vmlinux.lds.S        |  2 +-
+ arch/parisc/mm/fault.c                  | 11 ++++--
+ 10 files changed, 118 insertions(+), 74 deletions(-)
+ create mode 100644 arch/parisc/include/asm/extable.h
+Merging powerpc/next (44a1aad2fe6c Merge branch 'topic/ppc-kvm' into next)
+$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git powerpc/next
+Already up to date.
+Merging soc-fsl/next (fb9c384625dd bus: fsl-mc: fsl-mc-allocator: Drop a write-only variable)
+$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/leo/linux.git soc-fsl/next
+Already up to date.
+Merging risc-v/for-next (cb4ede926134 riscv: Avoid code duplication with generic bitops implementation)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux.git risc-v/for-next
+Auto-merging arch/riscv/Kconfig
+Auto-merging arch/riscv/mm/init.c
+Auto-merging include/linux/mm.h
+Auto-merging mm/mmap.c
+Merge made by the 'ort' strategy.
+ arch/riscv/Kbuild                                  |   1 +
+ arch/riscv/Kconfig                                 |  18 +-
+ arch/riscv/Makefile                                |   5 +
+ arch/riscv/crypto/Kconfig                          |  93 ++++
+ arch/riscv/crypto/Makefile                         |  23 +
+ arch/riscv/crypto/aes-macros.S                     | 156 ++++++
+ arch/riscv/crypto/aes-riscv64-glue.c               | 550 +++++++++++++++++++++
+ arch/riscv/crypto/aes-riscv64-zvkned-zvbb-zvkg.S   | 312 ++++++++++++
+ arch/riscv/crypto/aes-riscv64-zvkned-zvkb.S        | 146 ++++++
+ arch/riscv/crypto/aes-riscv64-zvkned.S             | 180 +++++++
+ arch/riscv/crypto/chacha-riscv64-glue.c            | 101 ++++
+ arch/riscv/crypto/chacha-riscv64-zvkb.S            | 294 +++++++++++
+ arch/riscv/crypto/ghash-riscv64-glue.c             | 168 +++++++
+ arch/riscv/crypto/ghash-riscv64-zvkg.S             |  72 +++
+ arch/riscv/crypto/sha256-riscv64-glue.c            | 137 +++++
+ .../crypto/sha256-riscv64-zvknha_or_zvknhb-zvkb.S  | 225 +++++++++
+ arch/riscv/crypto/sha512-riscv64-glue.c            | 133 +++++
+ arch/riscv/crypto/sha512-riscv64-zvknhb-zvkb.S     | 203 ++++++++
+ arch/riscv/crypto/sm3-riscv64-glue.c               | 112 +++++
+ arch/riscv/crypto/sm3-riscv64-zvksh-zvkb.S         | 123 +++++
+ arch/riscv/crypto/sm4-riscv64-glue.c               | 107 ++++
+ arch/riscv/crypto/sm4-riscv64-zvksed-zvkb.S        | 117 +++++
+ arch/riscv/include/asm/asm.h                       |  10 +
+ arch/riscv/include/asm/bitops.h                    | 138 +-----
+ arch/riscv/include/asm/pgalloc.h                   |  53 +-
+ arch/riscv/include/asm/pgtable.h                   |   6 +
+ arch/riscv/include/asm/tlb.h                       |  18 +
+ arch/riscv/include/asm/vector.h                    |  11 +
+ arch/riscv/kernel/entry.S                          |   3 +
+ arch/riscv/kernel/pi/Makefile                      |   3 +
+ arch/riscv/kernel/smpboot.c                        |   1 -
+ arch/riscv/kernel/traps.c                          |  17 +-
+ arch/riscv/lib/uaccess_vector.S                    |   1 -
+ arch/riscv/mm/init.c                               |   6 +
+ crypto/Kconfig                                     |   3 +
+ drivers/clocksource/timer-clint.c                  |   2 +-
+ drivers/clocksource/timer-riscv.c                  |   2 +-
+ include/asm-generic/bitops/__ffs.h                 |   8 +-
+ include/asm-generic/bitops/__fls.h                 |   8 +-
+ include/asm-generic/bitops/ffs.h                   |   8 +-
+ include/asm-generic/bitops/fls.h                   |   8 +-
+ include/linux/mm.h                                 |   2 +-
+ mm/mmap.c                                          |   2 +-
+ 43 files changed, 3445 insertions(+), 141 deletions(-)
+ create mode 100644 arch/riscv/crypto/Kconfig
+ create mode 100644 arch/riscv/crypto/Makefile
+ create mode 100644 arch/riscv/crypto/aes-macros.S
+ create mode 100644 arch/riscv/crypto/aes-riscv64-glue.c
+ create mode 100644 arch/riscv/crypto/aes-riscv64-zvkned-zvbb-zvkg.S
+ create mode 100644 arch/riscv/crypto/aes-riscv64-zvkned-zvkb.S
+ create mode 100644 arch/riscv/crypto/aes-riscv64-zvkned.S
+ create mode 100644 arch/riscv/crypto/chacha-riscv64-glue.c
+ create mode 100644 arch/riscv/crypto/chacha-riscv64-zvkb.S
+ create mode 100644 arch/riscv/crypto/ghash-riscv64-glue.c
+ create mode 100644 arch/riscv/crypto/ghash-riscv64-zvkg.S
+ create mode 100644 arch/riscv/crypto/sha256-riscv64-glue.c
+ create mode 100644 arch/riscv/crypto/sha256-riscv64-zvknha_or_zvknhb-zvkb.S
+ create mode 100644 arch/riscv/crypto/sha512-riscv64-glue.c
+ create mode 100644 arch/riscv/crypto/sha512-riscv64-zvknhb-zvkb.S
+ create mode 100644 arch/riscv/crypto/sm3-riscv64-glue.c
+ create mode 100644 arch/riscv/crypto/sm3-riscv64-zvksh-zvkb.S
+ create mode 100644 arch/riscv/crypto/sm4-riscv64-glue.c
+ create mode 100644 arch/riscv/crypto/sm4-riscv64-zvksed-zvkb.S
+Merging riscv-dt/riscv-dt-for-next (2db68ddbf33a riscv: dts: starfive: beaglev-starlight: Setup phy reset gpio)
+$ git merge -m Merge branch 'riscv-dt-for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git riscv-dt/riscv-dt-for-next
+Merge made by the 'ort' strategy.
+ .../devicetree/bindings/pwm/opencores,pwm.yaml     |  55 +++++++++++
+ .../boot/dts/starfive/jh7100-beaglev-starlight.dts |  11 +++
+ arch/riscv/boot/dts/starfive/jh7100-common.dtsi    | 108 +++++++++++++++++++++
+ .../dts/starfive/jh7100-starfive-visionfive-v1.dts |  22 ++++-
+ arch/riscv/boot/dts/starfive/jh7100.dtsi           |  45 +++++++++
+ .../dts/starfive/jh7110-starfive-visionfive-2.dtsi |  22 +++++
+ arch/riscv/boot/dts/starfive/jh7110.dtsi           |   9 ++
+ 7 files changed, 271 insertions(+), 1 deletion(-)
+ create mode 100644 Documentation/devicetree/bindings/pwm/opencores,pwm.yaml
+Merging riscv-soc/riscv-soc-for-next (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'riscv-soc-for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git riscv-soc/riscv-soc-for-next
+Already up to date.
+Merging s390/for-next (8eb3db95a8c8 Merge branch 'features' into for-next)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git s390/for-next
+Merge made by the 'ort' strategy.
+Merging sh/for-next (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'for-next' of git:git.kernel.org/pub/scm/linux/kernel/git/glaubitz/sh-linux.git sh/for-next
+Already up to date.
+Merging uml/next (83aec96c631e um: Mark 32bit syscall helpers as clobbering memory)
+$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/uml/linux.git uml/next
+Already up to date.
+Merging xtensa/xtensa-for-next (a03cd7602a09 xtensa: don't produce FDPIC output with fdpic toolchain)
+$ git merge -m Merge branch 'xtensa-for-next' of git://github.com/jcmvbkbc/linux-xtensa.git xtensa/xtensa-for-next
+Already up to date.
+Merging bcachefs/for-next (6bb3f7f4c3f4 bcachefs: unlock parent dir if entry is not found in subvolume deletion)
+$ git merge -m Merge branch 'for-next' of https://evilpiepirate.org/git/bcachefs.git bcachefs/for-next
+Merge made by the 'ort' strategy.
+ fs/bcachefs/fs-ioctl.c          | 4 ++--
+ fs/bcachefs/mean_and_variance.h | 2 +-
+ 2 files changed, 3 insertions(+), 3 deletions(-)
+Merging pidfd/for-next (a901a3568fd2 Merge tag 'iomap-6.5-merge-1' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git pidfd/for-next
+Already up to date.
+Merging fscrypt/for-next (c919330dd578 f2fs: fix double free of f2fs_sb_info)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/fs/fscrypt/linux.git fscrypt/for-next
+Already up to date.
+Merging afs/afs-next (abcbd3bfbbfe afs: trace: Log afs_make_call(), including server address)
+$ git merge -m Merge branch 'afs-next' of git://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs.git afs/afs-next
+Already up to date.
+Merging btrfs/for-next (932ab07c383e Merge branch 'for-next-next-v6.8-20240108' into for-next-20240108)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux.git btrfs/for-next
+Auto-merging fs/btrfs/extent-tree.c
+Auto-merging fs/btrfs/extent_io.c
+CONFLICT (content): Merge conflict in fs/btrfs/extent_io.c
+Auto-merging fs/btrfs/inode.c
+Auto-merging fs/btrfs/ioctl.c
+Auto-merging fs/btrfs/zoned.c
+Resolved 'fs/btrfs/extent_io.c' using previous resolution.
+Automatic merge failed; fix conflicts and then commit the result.
+$ git commit --no-edit -v -a
+[master 3b458c21387e] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux.git
+$ git diff -M --stat --summary HEAD^..
+ fs/btrfs/accessors.c         | 12 +++++-----
+ fs/btrfs/btrfs_inode.h       |  3 +--
+ fs/btrfs/ctree.c             |  2 +-
+ fs/btrfs/disk-io.c           |  2 +-
+ fs/btrfs/extent_io.c         | 57 +++++++++++++++++++-------------------------
+ fs/btrfs/extent_io.h         | 16 ++++++++++---
+ fs/btrfs/file.c              | 11 ++++-----
+ fs/btrfs/inode.c             | 16 +++++--------
+ fs/btrfs/tests/inode-tests.c | 40 +++++++++++++++----------------
+ 9 files changed, 78 insertions(+), 81 deletions(-)
+Merging ceph/master (ded080c86b3f rbd: don't move requests to the running list on errors)
+$ git merge -m Merge branch 'master' of git://github.com/ceph/ceph-client.git ceph/master
+Already up to date.
+Merging cifs/for-next (2417900a8dce smb: client: parse uid, gid, mode and dev from WSL reparse points)
+$ git merge -m Merge branch 'for-next' of git://git.samba.org/sfrench/cifs-2.6.git cifs/for-next
+Merge made by the 'ort' strategy.
+ fs/smb/client/Makefile     |   2 +-
+ fs/smb/client/cifsglob.h   |  41 ++--
+ fs/smb/client/cifsproto.h  |   4 -
+ fs/smb/client/connect.c    |   2 +
+ fs/smb/client/fs_context.c |  35 +++
+ fs/smb/client/fs_context.h |   9 +
+ fs/smb/client/inode.c      |  84 +-------
+ fs/smb/client/readdir.c    |  20 +-
+ fs/smb/client/reparse.c    | 526 +++++++++++++++++++++++++++++++++++++++++++++
+ fs/smb/client/reparse.h    | 113 ++++++++++
+ fs/smb/client/smb2glob.h   |   3 +-
+ fs/smb/client/smb2inode.c  | 399 ++++++++++++++++++++++++----------
+ fs/smb/client/smb2ops.c    | 250 +--------------------
+ fs/smb/client/smb2pdu.c    |  29 ++-
+ fs/smb/client/smb2pdu.h    |  36 +++-
+ fs/smb/client/smb2proto.h  |   9 +-
+ fs/smb/client/trace.h      |   2 +
+ fs/smb/client/transport.c  |   4 +-
+ fs/smb/common/smbfsctl.h   |   6 -
+ 19 files changed, 1081 insertions(+), 493 deletions(-)
+ create mode 100644 fs/smb/client/reparse.c
+ create mode 100644 fs/smb/client/reparse.h
+Merging configfs/for-next (4425c1d9b44d configfs: improve item creation performance)
+$ git merge -m Merge branch 'for-next' of git://git.infradead.org/users/hch/configfs.git configfs/for-next
+Auto-merging fs/configfs/inode.c
+Merge made by the 'ort' strategy.
+ fs/configfs/configfs_internal.h |  4 ++--
+ fs/configfs/dir.c               | 42 +++++++++++++++++++++++++++++++----------
+ fs/configfs/inode.c             | 24 -----------------------
+ 3 files changed, 34 insertions(+), 36 deletions(-)
+Merging ecryptfs/next (a3d78fe3e1ae fs: ecryptfs: comment typo fix)
+$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/tyhicks/ecryptfs.git ecryptfs/next
+Auto-merging fs/ecryptfs/crypto.c
+Auto-merging fs/ecryptfs/read_write.c
+Merge made by the 'ort' strategy.
+ fs/ecryptfs/crypto.c   | 2 +-
+ fs/ecryptfs/keystore.c | 2 +-
+ 2 files changed, 2 insertions(+), 2 deletions(-)
+Merging erofs/dev (aa12a790d31b erofs: make erofs_{err,info}() support NULL sb parameter)
+$ git merge -m Merge branch 'dev' of git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git erofs/dev
+Already up to date.
+Merging exfat/dev (8b29fa18400c exfat: ratelimit error msg in exfat_file_mmap())
+$ git merge -m Merge branch 'dev' of git://git.kernel.org/pub/scm/linux/kernel/git/linkinjeon/exfat.git exfat/dev
+Merge made by the 'ort' strategy.
+ fs/exfat/exfat_fs.h | 5 +++++
+ fs/exfat/file.c     | 6 +++++-
+ fs/exfat/inode.c    | 7 +++----
+ 3 files changed, 13 insertions(+), 5 deletions(-)
+Merging exportfs/exportfs-next (42c3732fa807 fs: Create a generic is_dot_dotdot() utility)
+$ git merge -m Merge branch 'exportfs-next' of git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux exportfs/exportfs-next
+Auto-merging fs/ecryptfs/crypto.c
+Auto-merging fs/f2fs/f2fs.h
+Auto-merging fs/namei.c
+Auto-merging include/linux/fs.h
+Merge made by the 'ort' strategy.
+ fs/crypto/fname.c    |  8 +-------
+ fs/ecryptfs/crypto.c | 10 ----------
+ fs/exportfs/expfs.c  |  2 +-
+ fs/f2fs/f2fs.h       | 11 -----------
+ fs/namei.c           |  6 ++----
+ include/linux/fs.h   | 11 +++++++++++
+ 6 files changed, 15 insertions(+), 33 deletions(-)
+Merging ext3/for_next (cd04011c5859 Merge fsnotify optimization & cleanup from Amir.)
+$ git merge -m Merge branch 'for_next' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs.git ext3/for_next
+Merge made by the 'ort' strategy.
+ fs/ext2/balloc.c         |  2 +-
+ fs/ext2/inode.c          |  2 +-
+ fs/ext2/xattr.c          |  2 +-
+ fs/notify/fsnotify.c     | 28 +++++++++++++++++-----------
+ fs/ocfs2/quota_global.c  | 12 ++++++++++++
+ fs/ocfs2/quota_local.c   |  3 +++
+ fs/quota/dquot.c         | 13 +++++--------
+ fs/quota/quota_tree.c    | 24 ++++++++++++------------
+ fs/quota/quota_v1.c      |  6 ++++++
+ fs/quota/quota_v2.c      | 20 +++++++++++++++++++-
+ fs/udf/dir.c             |  2 +-
+ fs/udf/inode.c           |  2 +-
+ fs/udf/namei.c           | 23 +++++++++++++----------
+ fs/udf/super.c           |  2 +-
+ include/linux/fsnotify.h | 12 +++++++++---
+ 15 files changed, 102 insertions(+), 51 deletions(-)
+Merging ext4/dev (68da4c44b994 ext4: fix inconsistent between segment fstrim and full fstrim)
+$ git merge -m Merge branch 'dev' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git ext4/dev
+Already up to date.
+Merging f2fs/dev (f31438c16879 f2fs: fix to avoid potential panic during recovery)
+$ git merge -m Merge branch 'dev' of git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs.git f2fs/dev
+Auto-merging fs/f2fs/f2fs.h
+Merge made by the 'ort' strategy.
+ Documentation/ABI/testing/sysfs-fs-f2fs | 47 ++++++++++----------
+ Documentation/filesystems/f2fs.rst      | 47 ++++++++++----------
+ fs/f2fs/checkpoint.c                    | 19 ++++++--
+ fs/f2fs/compress.c                      | 45 +++++++++++--------
+ fs/f2fs/data.c                          | 36 ++++++++-------
+ fs/f2fs/dir.c                           |  5 +--
+ fs/f2fs/f2fs.h                          | 78 ++++++++++++++++++++++-----------
+ fs/f2fs/file.c                          | 51 ++++++++++++++-------
+ fs/f2fs/namei.c                         | 11 ++---
+ fs/f2fs/node.c                          |  2 +-
+ fs/f2fs/recovery.c                      | 33 +++++++-------
+ fs/f2fs/segment.c                       |  4 +-
+ fs/f2fs/super.c                         | 57 ++++++++++++++----------
+ 13 files changed, 254 insertions(+), 181 deletions(-)
+Merging fsverity/for-next (919dc320956e fsverity: skip PKCS#7 parser when keyring is empty)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/fs/fsverity/linux.git fsverity/for-next
+Already up to date.
+Merging fuse/for-next (3f29f1c336c0 fuse: disable FOPEN_PARALLEL_DIRECT_WRITES with FUSE_DIRECT_IO_ALLOW_MMAP)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse.git fuse/for-next
+Already up to date.
+Merging gfs2/for-next (acd2d246f4b2 gfs2: Fix LOOKUP_RCU support in gfs2_drevalidate)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/gfs2/linux-gfs2.git gfs2/for-next
+Merge made by the 'ort' strategy.
+ fs/gfs2/aops.c       |   4 +-
+ fs/gfs2/bmap.c       |  21 +++++-----
+ fs/gfs2/dentry.c     |  32 ++++++++++-----
+ fs/gfs2/dir.c        |  52 +++++++++++++----------
+ fs/gfs2/dir.h        |   2 +-
+ fs/gfs2/file.c       |   4 +-
+ fs/gfs2/glops.c      |   2 +-
+ fs/gfs2/incore.h     |   1 -
+ fs/gfs2/inode.c      |  10 ++---
+ fs/gfs2/meta_io.c    | 114 ++++++++++++++++++++++++++++++++++-----------------
+ fs/gfs2/meta_io.h    |  15 ++++---
+ fs/gfs2/ops_fstype.c |   4 +-
+ fs/gfs2/quota.c      |   2 +-
+ fs/gfs2/recovery.c   |   2 +-
+ fs/gfs2/rgrp.c       |   5 ++-
+ fs/gfs2/super.c      |   6 +--
+ fs/gfs2/xattr.c      |  17 ++++----
+ 17 files changed, 176 insertions(+), 117 deletions(-)
+Merging jfs/jfs-next (e42e29cc4423 Revert "jfs: fix shift-out-of-bounds in dbJoin")
+$ git merge -m Merge branch 'jfs-next' of git://github.com/kleikamp/linux-shaggy.git jfs/jfs-next
+Already up to date.
+Merging ksmbd/ksmbd-for-next (41bccc98fb79 Linux 6.8-rc2)
+$ git merge -m Merge branch 'ksmbd-for-next' of https://github.com/smfrench/smb3-kernel.git ksmbd/ksmbd-for-next
+Already up to date.
+Merging nfs/linux-next (052d534373b7 Merge tag 'exfat-for-6.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/linkinjeon/exfat)
+$ git merge -m Merge branch 'linux-next' of git://git.linux-nfs.org/projects/trondmy/nfs-2.6.git nfs/linux-next
+Already up to date.
+Merging nfs-anna/linux-next (57331a59ac0d NFSv4.1: Use the nfs_client's rpc timeouts for backchannel)
+$ git merge -m Merge branch 'linux-next' of git://git.linux-nfs.org/projects/anna/linux-nfs.git nfs-anna/linux-next
+Already up to date.
+Merging nfsd/nfsd-next (6c1c91f97746 nfsd: Simplify the allocation of slab caches in nfsd_file_cache_init)
+$ git merge -m Merge branch 'nfsd-next' of git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux nfsd/nfsd-next
+Auto-merging net/sunrpc/svc.c
+Merge made by the 'ort' strategy.
+ fs/lockd/svc.c                        |   3 -
+ fs/nfs/callback.c                     |   3 -
+ fs/nfsd/blocklayout.c                 |   4 +-
+ fs/nfsd/cache.h                       |   2 -
+ fs/nfsd/filecache.c                   |  76 ++--
+ fs/nfsd/filecache.h                   |   1 +
+ fs/nfsd/netns.h                       |  29 +-
+ fs/nfsd/nfs4callback.c                |  96 +++--
+ fs/nfsd/nfs4layouts.c                 |  63 ++--
+ fs/nfsd/nfs4proc.c                    |   6 +-
+ fs/nfsd/nfs4state.c                   | 642 ++++++++++++++++++++++++----------
+ fs/nfsd/nfs4xdr.c                     |  19 +-
+ fs/nfsd/nfscache.c                    |  40 +--
+ fs/nfsd/nfsctl.c                      |  17 +-
+ fs/nfsd/nfsd.h                        |   2 +
+ fs/nfsd/nfsfh.c                       |   3 +-
+ fs/nfsd/nfssvc.c                      |  16 +-
+ fs/nfsd/pnfs.h                        |   8 +-
+ fs/nfsd/state.h                       |  52 ++-
+ fs/nfsd/stats.c                       |  52 ++-
+ fs/nfsd/stats.h                       |  70 ++--
+ fs/nfsd/trace.h                       | 194 +++++++++-
+ fs/nfsd/vfs.c                         |  48 ++-
+ fs/nfsd/vfs.h                         |   2 +
+ include/linux/sunrpc/svc.h            |   5 +-
+ include/trace/misc/nfs.h              |  34 ++
+ net/sunrpc/auth_gss/gss_krb5_crypto.c |  14 +-
+ net/sunrpc/auth_gss/gss_krb5_mech.c   |  11 +-
+ net/sunrpc/auth_gss/gss_rpc_xdr.c     |  27 +-
+ net/sunrpc/stats.c                    |   2 +-
+ net/sunrpc/svc.c                      |  40 ++-
+ net/sunrpc/xprtsock.c                 |   9 -
+ 32 files changed, 1080 insertions(+), 510 deletions(-)
+Merging ntfs3/master (622cd3daa8ea fs/ntfs3: Slightly simplify ntfs_inode_printk())
+$ git merge -m Merge branch 'master' of https://github.com/Paragon-Software-Group/linux-ntfs3.git ntfs3/master
+Merge made by the 'ort' strategy.
+ fs/ntfs3/attrib.c   |  45 +++++++----
+ fs/ntfs3/attrlist.c |  12 +--
+ fs/ntfs3/bitmap.c   |   4 +-
+ fs/ntfs3/dir.c      |  48 ++++++++---
+ fs/ntfs3/file.c     |  76 ++++++++++++++----
+ fs/ntfs3/frecord.c  |  19 +++--
+ fs/ntfs3/fslog.c    | 228 ++++++++++++++++++++++++----------------------------
+ fs/ntfs3/fsntfs.c   |  29 ++++++-
+ fs/ntfs3/index.c    |   8 +-
+ fs/ntfs3/inode.c    |  32 ++++++--
+ fs/ntfs3/namei.c    |  12 +++
+ fs/ntfs3/ntfs.h     |   4 +-
+ fs/ntfs3/ntfs_fs.h  |  29 +++----
+ fs/ntfs3/record.c   |  18 ++++-
+ fs/ntfs3/super.c    |  54 ++++++++-----
+ fs/ntfs3/xattr.c    |   6 ++
+ 16 files changed, 379 insertions(+), 245 deletions(-)
+Merging orangefs/for-next (31720a2b109b orangefs: Fix kmemleak in orangefs_{kernel,client}_debug_init())
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/hubcap/linux orangefs/for-next
+Already up to date.
+Merging overlayfs/overlayfs-next (d17bb4620f90 overlayfs.rst: fix ReST formatting)
+$ git merge -m Merge branch 'overlayfs-next' of git://git.kernel.org/pub/scm/linux/kernel/git/overlayfs/vfs.git overlayfs/overlayfs-next
+Already up to date.
+Merging ubifs/next (adbf4c4954e3 ubi: block: fix memleak in ubiblock_create())
+$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/rw/ubifs.git ubifs/next
+Already up to date.
+Merging v9fs/9p-next (ff49bf186757 net: 9p: avoid freeing uninit memory in p9pdu_vreadf)
+$ git merge -m Merge branch '9p-next' of git://github.com/martinetd/linux v9fs/9p-next
+Already up to date.
+Merging v9fs-ericvh/ericvh/for-next (be57855f5050 fs/9p: fix dups even in uncached mode)
+$ git merge -m Merge branch 'ericvh/for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/ericvh/v9fs.git v9fs-ericvh/ericvh/for-next
+Merge made by the 'ort' strategy.
+ fs/9p/v9fs.h           |  31 ++------
+ fs/9p/v9fs_vfs.h       |  11 ++-
+ fs/9p/vfs_dir.c        |   4 +-
+ fs/9p/vfs_inode.c      | 150 ++++++--------------------------------
+ fs/9p/vfs_inode_dotl.c | 194 +++++++++----------------------------------------
+ fs/9p/vfs_super.c      |  45 +-----------
+ 6 files changed, 71 insertions(+), 364 deletions(-)
+Merging xfs/for-next (881f78f47255 xfs: remove conditional building of rt geometry validator functions)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git xfs/for-next
+Merge made by the 'ort' strategy.
+ fs/xfs/libxfs/xfs_attr.c     |  6 +++---
+ fs/xfs/libxfs/xfs_rtbitmap.c | 14 --------------
+ fs/xfs/libxfs/xfs_rtbitmap.h | 16 ----------------
+ fs/xfs/libxfs/xfs_sb.c       | 14 ++++++++++++++
+ fs/xfs/libxfs/xfs_sb.h       |  2 ++
+ fs/xfs/libxfs/xfs_types.h    | 12 ++++++++++++
+ fs/xfs/scrub/rtbitmap.c      |  1 +
+ fs/xfs/scrub/rtsummary.c     |  1 +
+ 8 files changed, 33 insertions(+), 33 deletions(-)
+Merging zonefs/for-next (8812387d0569 zonefs: set FMODE_CAN_ODIRECT instead of a dummy direct_IO method)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/dlemoal/zonefs.git zonefs/for-next
+Already up to date.
+Merging iomap/iomap-for-next (3ac974796e5d iomap: fix short copy in iomap_write_iter())
+$ git merge -m Merge branch 'iomap-for-next' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git iomap/iomap-for-next
+Already up to date.
+Merging djw-vfs/vfs-for-next (ce85a1e04645 xfs: stabilize fs summary counters for online fsck)
+$ git merge -m Merge branch 'vfs-for-next' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git djw-vfs/vfs-for-next
+Already up to date.
+Merging file-locks/locks-next (e0152e7481c6 Merge tag 'riscv-for-linus-6.6-mw1' of git://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux)
+$ git merge -m Merge branch 'locks-next' of git://git.kernel.org/pub/scm/linux/kernel/git/jlayton/linux.git file-locks/locks-next
+Already up to date.
+Merging iversion/iversion-next (e0152e7481c6 Merge tag 'riscv-for-linus-6.6-mw1' of git://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux)
+$ git merge -m Merge branch 'iversion-next' of git://git.kernel.org/pub/scm/linux/kernel/git/jlayton/linux.git iversion/iversion-next
+Already up to date.
+Merging vfs-brauner/vfs.all (de9861b0c277 Merge branch 'vfs.fs' into vfs.all)
+  563bd99dc191 ("iov_iter: Avoid wrap-around instrumentation in copy_compat_iovec_from_user()")
+$ git merge -m Merge branch 'vfs.all' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git vfs-brauner/vfs.all
+Auto-merging CREDITS
+Auto-merging MAINTAINERS
+Auto-merging fs/Kconfig
+Auto-merging fs/ntfs3/namei.c
+Auto-merging init/initramfs.c
+Auto-merging kernel/exit.c
+Auto-merging lib/iov_iter.c
+Auto-merging mm/filemap.c
+Merge made by the 'ort' strategy.
+ CREDITS                                            |    5 +
+ Documentation/filesystems/index.rst                |    1 -
+ Documentation/filesystems/ntfs.rst                 |  466 ---
+ MAINTAINERS                                        |   10 -
+ fs/Kconfig                                         |    1 -
+ fs/Makefile                                        |    1 -
+ fs/attr.c                                          |    2 +-
+ fs/backing-file.c                                  |    4 +-
+ fs/buffer.c                                        |   10 +-
+ fs/eventfd.c                                       |   14 +-
+ fs/exec.c                                          |    6 +-
+ fs/fhandle.c                                       |    2 +-
+ fs/fs-writeback.c                                  |   25 +
+ fs/inode.c                                         |    3 +-
+ fs/netfs/buffered_write.c                          |    3 +
+ fs/netfs/direct_write.c                            |    5 +-
+ fs/netfs/io.c                                      |    2 +
+ fs/ntfs/Kconfig                                    |   81 -
+ fs/ntfs/Makefile                                   |   15 -
+ fs/ntfs/aops.c                                     | 1744 -----------
+ fs/ntfs/aops.h                                     |   88 -
+ fs/ntfs/attrib.c                                   | 2624 ----------------
+ fs/ntfs/attrib.h                                   |  102 -
+ fs/ntfs/bitmap.c                                   |  179 --
+ fs/ntfs/bitmap.h                                   |  104 -
+ fs/ntfs/collate.c                                  |  110 -
+ fs/ntfs/collate.h                                  |   36 -
+ fs/ntfs/compress.c                                 |  950 ------
+ fs/ntfs/debug.c                                    |  159 -
+ fs/ntfs/debug.h                                    |   57 -
+ fs/ntfs/dir.c                                      | 1540 ----------
+ fs/ntfs/dir.h                                      |   34 -
+ fs/ntfs/endian.h                                   |   79 -
+ fs/ntfs/file.c                                     | 1997 ------------
+ fs/ntfs/index.c                                    |  440 ---
+ fs/ntfs/index.h                                    |  134 -
+ fs/ntfs/inode.c                                    | 3102 -------------------
+ fs/ntfs/inode.h                                    |  310 --
+ fs/ntfs/layout.h                                   | 2421 ---------------
+ fs/ntfs/lcnalloc.c                                 | 1000 ------
+ fs/ntfs/lcnalloc.h                                 |  131 -
+ fs/ntfs/logfile.c                                  |  849 ------
+ fs/ntfs/logfile.h                                  |  295 --
+ fs/ntfs/malloc.h                                   |   77 -
+ fs/ntfs/mft.c                                      | 2907 ------------------
+ fs/ntfs/mft.h                                      |  110 -
+ fs/ntfs/mst.c                                      |  189 --
+ fs/ntfs/namei.c                                    |  392 ---
+ fs/ntfs/ntfs.h                                     |  150 -
+ fs/ntfs/quota.c                                    |  103 -
+ fs/ntfs/quota.h                                    |   21 -
+ fs/ntfs/runlist.c                                  | 1893 ------------
+ fs/ntfs/runlist.h                                  |   88 -
+ fs/ntfs/super.c                                    | 3202 --------------------
+ fs/ntfs/sysctl.c                                   |   58 -
+ fs/ntfs/sysctl.h                                   |   27 -
+ fs/ntfs/time.h                                     |   89 -
+ fs/ntfs/types.h                                    |   55 -
+ fs/ntfs/unistr.c                                   |  384 ---
+ fs/ntfs/upcase.c                                   |   73 -
+ fs/ntfs/usnjrnl.c                                  |   70 -
+ fs/ntfs/usnjrnl.h                                  |  191 --
+ fs/ntfs/volume.h                                   |  164 -
+ fs/ntfs3/namei.c                                   |    2 +-
+ fs/pipe.c                                          |   81 +-
+ fs/select.c                                        |   13 +-
+ fs/sysv/itree.c                                    |   10 +-
+ include/asm-generic/barrier.h                      |    2 -
+ include/linux/backing-dev.h                        |    1 -
+ include/linux/fs.h                                 |   18 +-
+ include/linux/pid.h                                |    4 +-
+ include/uapi/linux/fs.h                            |    5 +-
+ include/uapi/linux/pidfd.h                         |    3 +-
+ init/initramfs.c                                   |    2 -
+ kernel/exit.c                                      |    7 +
+ kernel/fork.c                                      |   51 +-
+ kernel/pid.c                                       |   16 +-
+ kernel/signal.c                                    |   12 +-
+ lib/iov_iter.c                                     |   55 +-
+ mm/backing-dev.c                                   |   25 -
+ mm/filemap.c                                       |    9 -
+ .../selftests/filesystems/overlayfs/dev_in_maps.c  |   10 +-
+ .../move_mount_set_group_test.c                    |    4 +-
+ 83 files changed, 225 insertions(+), 29489 deletions(-)
+ delete mode 100644 Documentation/filesystems/ntfs.rst
+ delete mode 100644 fs/ntfs/Kconfig
+ delete mode 100644 fs/ntfs/Makefile
+ delete mode 100644 fs/ntfs/aops.c
+ delete mode 100644 fs/ntfs/aops.h
+ delete mode 100644 fs/ntfs/attrib.c
+ delete mode 100644 fs/ntfs/attrib.h
+ delete mode 100644 fs/ntfs/bitmap.c
+ delete mode 100644 fs/ntfs/bitmap.h
+ delete mode 100644 fs/ntfs/collate.c
+ delete mode 100644 fs/ntfs/collate.h
+ delete mode 100644 fs/ntfs/compress.c
+ delete mode 100644 fs/ntfs/debug.c
+ delete mode 100644 fs/ntfs/debug.h
+ delete mode 100644 fs/ntfs/dir.c
+ delete mode 100644 fs/ntfs/dir.h
+ delete mode 100644 fs/ntfs/endian.h
+ delete mode 100644 fs/ntfs/file.c
+ delete mode 100644 fs/ntfs/index.c
+ delete mode 100644 fs/ntfs/index.h
+ delete mode 100644 fs/ntfs/inode.c
+ delete mode 100644 fs/ntfs/inode.h
+ delete mode 100644 fs/ntfs/layout.h
+ delete mode 100644 fs/ntfs/lcnalloc.c
+ delete mode 100644 fs/ntfs/lcnalloc.h
+ delete mode 100644 fs/ntfs/logfile.c
+ delete mode 100644 fs/ntfs/logfile.h
+ delete mode 100644 fs/ntfs/malloc.h
+ delete mode 100644 fs/ntfs/mft.c
+ delete mode 100644 fs/ntfs/mft.h
+ delete mode 100644 fs/ntfs/mst.c
+ delete mode 100644 fs/ntfs/namei.c
+ delete mode 100644 fs/ntfs/ntfs.h
+ delete mode 100644 fs/ntfs/quota.c
+ delete mode 100644 fs/ntfs/quota.h
+ delete mode 100644 fs/ntfs/runlist.c
+ delete mode 100644 fs/ntfs/runlist.h
+ delete mode 100644 fs/ntfs/super.c
+ delete mode 100644 fs/ntfs/sysctl.c
+ delete mode 100644 fs/ntfs/sysctl.h
+ delete mode 100644 fs/ntfs/time.h
+ delete mode 100644 fs/ntfs/types.h
+ delete mode 100644 fs/ntfs/unistr.c
+ delete mode 100644 fs/ntfs/upcase.c
+ delete mode 100644 fs/ntfs/usnjrnl.c
+ delete mode 100644 fs/ntfs/usnjrnl.h
+ delete mode 100644 fs/ntfs/volume.h
+Merging vfs/for-next (052d534373b7 Merge tag 'exfat-for-6.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/linkinjeon/exfat)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git vfs/for-next
+Already up to date.
+Merging printk/for-next (6c3a34e38436 Merge branch 'for-6.8' into for-next)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/printk/linux.git printk/for-next
+Merge made by the 'ort' strategy.
+Merging pci/next (95bf9132f8b4 Merge branch 'pci/dpc')
+$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git pci/next
+Merge made by the 'ort' strategy.
+ drivers/pci/pcie/dpc.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+Merging pstore/for-next/pstore (24a0b5e196cf pstore: inode: Use cleanup.h for struct pstore_private)
+$ git merge -m Merge branch 'for-next/pstore' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git pstore/for-next/pstore
+Already up to date.
+Merging hid/for-next (a54f72c74c2d Merge branch 'for-6.8/upstream-fixes' into for-next)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/hid/hid.git hid/for-next
+Merge made by the 'ort' strategy.
+ drivers/hid/bpf/hid_bpf_dispatch.c                 | 117 ++++--
+ drivers/hid/bpf/hid_bpf_dispatch.h                 |   4 +-
+ drivers/hid/bpf/hid_bpf_jmp_table.c                |  40 +-
+ drivers/hid/hid-ids.h                              |  10 +
+ drivers/hid/hid-lenovo.c                           |  57 ++-
+ drivers/hid/hid-logitech-hidpp.c                   |   2 +
+ drivers/hid/hid-nintendo.c                         |  10 -
+ drivers/hid/hid-nvidia-shield.c                    |   4 +
+ drivers/hid/hid-samsung.c                          | 437 +++++++++++++++++++--
+ drivers/hid/hid-steam.c                            |  36 +-
+ drivers/hid/hidraw.c                               |   7 +-
+ drivers/hid/i2c-hid/i2c-hid-core.c                 |   6 +-
+ drivers/hid/i2c-hid/i2c-hid-of.c                   |   1 +
+ include/linux/hid_bpf.h                            |  11 -
+ .../selftests/hid/tests/test_wacom_generic.py      |   8 +-
+ 15 files changed, 593 insertions(+), 157 deletions(-)
+Merging i2c/i2c/for-next (41bccc98fb79 Linux 6.8-rc2)
+$ git merge -m Merge branch 'i2c/for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/wsa/linux.git i2c/i2c/for-next
+Already up to date.
+Merging i2c-host/i2c/i2c-host (11f1357336cd i2c: imx: move to generic GPIO recovery)
+$ git merge -m Merge branch 'i2c/i2c-host' of git://git.kernel.org/pub/scm/linux/kernel/git/andi.shyti/linux.git i2c-host/i2c/i2c-host
+Merge made by the 'ort' strategy.
+ .../devicetree/bindings/i2c/i2c-mux-pca954x.yaml   | 30 +++++++++++
+ drivers/i2c/busses/i2c-i801.c                      | 12 ++---
+ drivers/i2c/busses/i2c-imx.c                       | 62 ++--------------------
+ drivers/i2c/muxes/i2c-mux-pca954x.c                | 43 ++++++++++++++-
+ 4 files changed, 82 insertions(+), 65 deletions(-)
+Merging i3c/i3c/next (4fa0888f6f3e i3c: document hotjoin sysfs entry)
+$ git merge -m Merge branch 'i3c/next' of git://git.kernel.org/pub/scm/linux/kernel/git/i3c/linux.git i3c/i3c/next
+Already up to date.
+Merging hwmon-staging/hwmon-next (6120fec68e78 hwmon: ltc4282: add support for the LTC4282 chip)
+$ git merge -m Merge branch 'hwmon-next' of git://git.kernel.org/pub/scm/linux/kernel/git/groeck/linux-staging.git hwmon-staging/hwmon-next
+Auto-merging MAINTAINERS
+Merge made by the 'ort' strategy.
+ Documentation/ABI/testing/sysfs-class-hwmon        |    9 +
+ .../devicetree/bindings/hwmon/adi,ltc4282.yaml     |  159 ++
+ .../devicetree/bindings/hwmon/ti,ina2xx.yaml       |    9 +
+ Documentation/hwmon/emc2305.rst                    |    1 -
+ Documentation/hwmon/index.rst                      |    1 +
+ Documentation/hwmon/ltc4282.rst                    |  133 ++
+ Documentation/hwmon/nct6683.rst                    |    1 +
+ MAINTAINERS                                        |   26 +-
+ drivers/hwmon/Kconfig                              |   11 +
+ drivers/hwmon/Makefile                             |    1 +
+ drivers/hwmon/adm1177.c                            |    1 -
+ drivers/hwmon/adt7410.c                            |    2 -
+ drivers/hwmon/ds1621.c                             |    1 -
+ drivers/hwmon/ds620.c                              |    1 -
+ drivers/hwmon/emc2305.c                            |    5 -
+ drivers/hwmon/hwmon.c                              |    1 +
+ drivers/hwmon/ina209.c                             |    1 -
+ drivers/hwmon/ina238.c                             |    1 -
+ drivers/hwmon/ltc4282.c                            | 1784 ++++++++++++++++++++
+ drivers/hwmon/max127.c                             |    1 -
+ drivers/hwmon/max31760.c                           |    1 -
+ drivers/hwmon/max31790.c                           |    1 -
+ drivers/hwmon/max31827.c                           |    1 -
+ drivers/hwmon/max6621.c                            |    1 -
+ drivers/hwmon/max6697.c                            |    1 -
+ drivers/hwmon/nct6683.c                            |    3 +
+ drivers/hwmon/occ/p8_i2c.c                         |    1 -
+ drivers/hwmon/pmbus/ir36021.c                      |    1 -
+ drivers/hwmon/pmbus/pmbus_core.c                   |    2 +-
+ drivers/hwmon/powr1220.c                           |    1 -
+ drivers/hwmon/sbrmi.c                              |    1 -
+ drivers/hwmon/sbtsi_temp.c                         |    1 -
+ drivers/hwmon/w83773g.c                            |    1 -
+ include/linux/hwmon.h                              |   14 +-
+ 34 files changed, 2129 insertions(+), 50 deletions(-)
+ create mode 100644 Documentation/devicetree/bindings/hwmon/adi,ltc4282.yaml
+ create mode 100644 Documentation/hwmon/ltc4282.rst
+ create mode 100644 drivers/hwmon/ltc4282.c
+Merging jc_docs/docs-next (5c7944ca7b13 coding-style: Add guidance to prefer dev_dbg)
+$ git merge -m Merge branch 'docs-next' of git://git.lwn.net/linux.git jc_docs/docs-next
+Merge made by the 'ort' strategy.
+ Documentation/RCU/torture.rst                      |   2 +-
+ Documentation/doc-guide/kernel-doc.rst             |  45 ++
+ Documentation/doc-guide/maintainer-profile.rst     |   7 +
+ Documentation/driver-api/index.rst                 | 209 +++---
+ Documentation/process/coding-style.rst             |   3 +-
+ Documentation/subsystem-apis.rst                   |   2 +
+ Documentation/translations/it_IT/RCU/index.rst     |  19 +
+ Documentation/translations/it_IT/RCU/torture.rst   | 369 +++++++++
+ .../translations/it_IT/core-api/index.rst          |  12 +
+ Documentation/translations/it_IT/index.rst         |   1 +
+ Documentation/translations/it_IT/locking/index.rst |  20 +
+ .../translations/it_IT/locking/lockdep-design.rst  | 678 +++++++++++++++++
+ .../translations/it_IT/locking/lockstat.rst        | 230 ++++++
+ .../translations/it_IT/locking/locktorture.rst     | 181 +++++
+ .../translations/it_IT/locking/locktypes.rst       | 547 ++++++++++++++
+ Documentation/userspace-api/index.rst              |  47 +-
+ Documentation/userspace-api/perf_ring_buffer.rst   | 830 +++++++++++++++++++++
+ drivers/gpu/drm/drm_gem_vram_helper.c              |  44 +-
+ include/drm/drm_gem_vram_helper.h                  |  16 +-
+ scripts/kernel-doc                                 |   4 +-
+ scripts/sphinx-pre-install                         |   9 +-
+ 21 files changed, 3138 insertions(+), 137 deletions(-)
+ create mode 100644 Documentation/translations/it_IT/RCU/index.rst
+ create mode 100644 Documentation/translations/it_IT/RCU/torture.rst
+ create mode 100644 Documentation/translations/it_IT/locking/index.rst
+ create mode 100644 Documentation/translations/it_IT/locking/lockdep-design.rst
+ create mode 100644 Documentation/translations/it_IT/locking/lockstat.rst
+ create mode 100644 Documentation/translations/it_IT/locking/locktorture.rst
+ create mode 100644 Documentation/translations/it_IT/locking/locktypes.rst
+ create mode 100644 Documentation/userspace-api/perf_ring_buffer.rst
+Merging v4l-dvb/master (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'master' of git://linuxtv.org/media_tree.git v4l-dvb/master
+Already up to date.
+Merging v4l-dvb-next/master (04447d48afd3 media: mediatek: vcodec: drop excess struct members descriptions)
+$ git merge -m Merge branch 'master' of git://linuxtv.org/mchehab/media-next.git v4l-dvb-next/master
+Merge made by the 'ort' strategy.
+ drivers/media/i2c/alvium-csi2.c                    |  2 +-
+ drivers/media/i2c/ar0521.c                         |  6 +-
+ drivers/media/mc/mc-devnode.c                      |  1 -
+ drivers/media/pci/intel/ipu3/ipu3-cio2.c           | 22 ++-----
+ drivers/media/platform/cadence/cdns-csi2rx.c       | 19 +++++-
+ .../mediatek/vcodec/decoder/vdec/vdec_vp8_req_if.c |  1 -
+ .../vcodec/decoder/vdec/vdec_vp9_req_lat_if.c      |  1 -
+ .../platform/samsung/exynos4-is/fimc-capture.c     | 52 +++++++--------
+ .../media/platform/samsung/exynos4-is/fimc-core.c  | 23 +++----
+ .../media/platform/samsung/exynos4-is/fimc-core.h  | 23 ++++---
+ .../platform/samsung/exynos4-is/fimc-isp-video.c   |  2 +-
+ .../platform/samsung/exynos4-is/fimc-lite-reg.c    | 13 ++--
+ .../platform/samsung/exynos4-is/fimc-lite-reg.h    | 12 ++--
+ .../media/platform/samsung/exynos4-is/fimc-lite.c  |  2 +-
+ .../media/platform/samsung/exynos4-is/fimc-m2m.c   | 23 +++----
+ .../media/platform/samsung/exynos4-is/fimc-reg.c   | 38 +++++------
+ .../media/platform/samsung/exynos4-is/fimc-reg.h   | 10 +--
+ drivers/media/platform/samsung/s5p-mfc/s5p_mfc.c   | 76 +++++++++++-----------
+ .../media/platform/samsung/s5p-mfc/s5p_mfc_cmd.c   |  8 +--
+ .../media/platform/samsung/s5p-mfc/s5p_mfc_cmd.h   |  2 +-
+ .../platform/samsung/s5p-mfc/s5p_mfc_cmd_v5.c      |  6 +-
+ .../platform/samsung/s5p-mfc/s5p_mfc_cmd_v5.h      |  2 +-
+ .../platform/samsung/s5p-mfc/s5p_mfc_cmd_v6.c      |  8 +--
+ .../platform/samsung/s5p-mfc/s5p_mfc_cmd_v6.h      |  2 +-
+ .../platform/samsung/s5p-mfc/s5p_mfc_common.h      | 14 ++--
+ .../media/platform/samsung/s5p-mfc/s5p_mfc_ctrl.c  | 26 ++++----
+ .../media/platform/samsung/s5p-mfc/s5p_mfc_dec.c   | 20 +++---
+ .../media/platform/samsung/s5p-mfc/s5p_mfc_dec.h   |  3 +-
+ .../media/platform/samsung/s5p-mfc/s5p_mfc_enc.c   | 12 ++--
+ .../media/platform/samsung/s5p-mfc/s5p_mfc_enc.h   |  3 +-
+ .../media/platform/samsung/s5p-mfc/s5p_mfc_opr.c   |  7 +-
+ .../platform/samsung/s5p-mfc/s5p_mfc_opr_v5.c      | 28 ++++----
+ .../platform/samsung/s5p-mfc/s5p_mfc_opr_v5.h      |  2 +-
+ .../platform/samsung/s5p-mfc/s5p_mfc_opr_v6.c      | 36 +++++-----
+ .../platform/samsung/s5p-mfc/s5p_mfc_opr_v6.h      |  2 +-
+ .../media/platform/samsung/s5p-mfc/s5p_mfc_pm.c    | 51 ++++++---------
+ .../media/platform/samsung/s5p-mfc/s5p_mfc_pm.h    |  8 +--
+ .../media/platform/ti/j721e-csi2rx/j721e-csi2rx.c  | 24 +++++++
+ drivers/media/platform/xilinx/Kconfig              |  4 +-
+ drivers/media/v4l2-core/v4l2-mc.c                  | 23 +++++--
+ .../staging/media/ipu3/include/uapi/intel-ipu3.h   |  3 -
+ drivers/staging/media/ipu3/ipu3-v4l2.c             | 16 ++---
+ include/media/media-entity.h                       |  4 --
+ 43 files changed, 326 insertions(+), 314 deletions(-)
+Merging pm/linux-next (7543bfcb6b1a Merge branch 'thermal-core' into linux-next)
+$ git merge -m Merge branch 'linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git pm/linux-next
+Merge made by the 'ort' strategy.
+ drivers/base/power/main.c        | 74 ++++++++++++++++++----------------------
+ drivers/thermal/gov_bang_bang.c  |  2 +-
+ drivers/thermal/gov_fair_share.c | 16 +++++----
+ include/linux/pm.h               | 30 ++++++++--------
+ 4 files changed, 59 insertions(+), 63 deletions(-)
+Merging cpufreq-arm/cpufreq/arm/linux-next (eaffb10b51bf cpufreq: mediatek-hw: Don't error out if supply is not found)
+$ git merge -m Merge branch 'cpufreq/arm/linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/vireshk/pm.git cpufreq-arm/cpufreq/arm/linux-next
+Merge made by the 'ort' strategy.
+ Documentation/power/opp.rst                    |  2 +-
+ Documentation/translations/zh_CN/power/opp.rst |  2 +-
+ drivers/cpufreq/brcmstb-avs-cpufreq.c          |  2 ++
+ drivers/cpufreq/imx6q-cpufreq.c                | 43 +++++++++-----------------
+ drivers/cpufreq/mediatek-cpufreq-hw.c          | 19 +++++++++++-
+ 5 files changed, 36 insertions(+), 32 deletions(-)
+Merging cpupower/cpupower (0086ffec768b tools cpupower bench: Override CFLAGS assignments)
+$ git merge -m Merge branch 'cpupower' of git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux.git cpupower/cpupower
+Already up to date.
+Merging devfreq/devfreq-next (aed5ed595960 PM / devfreq: Synchronize devfreq_monitor_[start/stop])
+$ git merge -m Merge branch 'devfreq-next' of git://git.kernel.org/pub/scm/linux/kernel/git/chanwoo/linux.git devfreq/devfreq-next
+Already up to date.
+Merging pmdomain/next (90a7463fae9e pmdomain: renesas: r8a779h0-sysc: Add r8a779h0 support)
+$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/ulfh/linux-pm.git pmdomain/next
+Merge made by the 'ort' strategy.
+ drivers/pmdomain/core.c                   | 133 ++++++++++++++++++------------
+ drivers/pmdomain/imx/imx8m-blk-ctrl.c     |   9 +-
+ drivers/pmdomain/imx/imx8mp-blk-ctrl.c    |   9 +-
+ drivers/pmdomain/qcom/rpmpd.c             |  13 ++-
+ drivers/pmdomain/renesas/Kconfig          |   4 +
+ drivers/pmdomain/renesas/Makefile         |   1 +
+ drivers/pmdomain/renesas/r8a779a0-sysc.c  |  12 ---
+ drivers/pmdomain/renesas/r8a779f0-sysc.c  |  12 ---
+ drivers/pmdomain/renesas/r8a779g0-sysc.c  |  12 ---
+ drivers/pmdomain/renesas/r8a779h0-sysc.c  |  54 ++++++++++++
+ drivers/pmdomain/renesas/rcar-gen4-sysc.c |   3 +
+ drivers/pmdomain/renesas/rcar-gen4-sysc.h |   1 +
+ drivers/pmdomain/ti/omap_prm.c            |   2 +
+ 13 files changed, 165 insertions(+), 100 deletions(-)
+ create mode 100644 drivers/pmdomain/renesas/r8a779h0-sysc.c
+Merging opp/opp/linux-next (ace4b31b297d cpufreq: Move dev_pm_opp_{init|free}_cpufreq_table() to pm_opp.h)
+$ git merge -m Merge branch 'opp/linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/vireshk/pm.git opp/opp/linux-next
+Merge made by the 'ort' strategy.
+ include/linux/cpufreq.h | 20 --------------------
+ include/linux/pm_opp.h  | 16 ++++++++++++++++
+ 2 files changed, 16 insertions(+), 20 deletions(-)
+Merging thermal/thermal/linux-next (5314b1543787 thermal/drivers/exynos: Use set_trips ops)
+$ git merge -m Merge branch 'thermal/linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/thermal/linux.git thermal/thermal/linux-next
+Already up to date.
+Merging dlm/next (5beebc1dda47 dlm: update format header reflect current format)
+$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/teigland/linux-dlm.git dlm/next
+Already up to date.
+Merging rdma/for-next (a400073ce3dd RDMA/mlx5: Delete unused mlx5_ib_copy_pas prototype)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git rdma/for-next
+Merge made by the 'ort' strategy.
+ drivers/infiniband/hw/cxgb4/iw_cxgb4.h         |   2 -
+ drivers/infiniband/hw/hfi1/tid_rdma.c          |  25 +-
+ drivers/infiniband/hw/hns/hns_roce_cq.c        |  11 +-
+ drivers/infiniband/hw/hns/hns_roce_device.h    |  16 +-
+ drivers/infiniband/hw/hns/hns_roce_hem.c       |  95 ++-----
+ drivers/infiniband/hw/hns/hns_roce_hem.h       |  56 +---
+ drivers/infiniband/hw/hns/hns_roce_hw_v2.c     | 111 ++++----
+ drivers/infiniband/hw/hns/hns_roce_mr.c        | 341 ++++++++++++++++++-------
+ drivers/infiniband/hw/mana/cq.c                |  25 +-
+ drivers/infiniband/hw/mana/main.c              |  40 +--
+ drivers/infiniband/hw/mana/mana_ib.h           |  20 +-
+ drivers/infiniband/hw/mana/mr.c                |  13 +-
+ drivers/infiniband/hw/mana/qp.c                |  88 ++-----
+ drivers/infiniband/hw/mlx5/mlx5_ib.h           |   1 -
+ drivers/infiniband/sw/rxe/rxe.c                |   6 +-
+ drivers/infiniband/sw/rxe/rxe.h                |   6 +-
+ drivers/infiniband/sw/rxe/rxe_comp.c           |   4 +-
+ drivers/infiniband/sw/rxe/rxe_cq.c             |   4 +-
+ drivers/infiniband/sw/rxe/rxe_mr.c             |  16 +-
+ drivers/infiniband/sw/rxe/rxe_mw.c             |   2 +-
+ drivers/infiniband/sw/rxe/rxe_qp.c             |   8 +-
+ drivers/infiniband/sw/rxe/rxe_resp.c           |  12 +-
+ drivers/infiniband/sw/rxe/rxe_task.c           |   4 +-
+ drivers/infiniband/sw/rxe/rxe_verbs.c          | 216 ++++++++--------
+ drivers/infiniband/ulp/ipoib/ipoib_multicast.c |   3 +-
+ 25 files changed, 576 insertions(+), 549 deletions(-)
+Merging net-next/main (e7f8df0e81bf dpll: move xa_erase() call in to match dpll_pin_alloc() error path order)
+$ git merge -m Merge branch 'main' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git net-next/main
+Auto-merging .mailmap
+Auto-merging MAINTAINERS
+Auto-merging drivers/net/dsa/mt7530.c
+Auto-merging drivers/net/ethernet/google/gve/gve_rx.c
+Auto-merging drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+Auto-merging drivers/net/wireless/microchip/wilc1000/netdev.c
+Merge made by the 'ort' strategy.
+ .mailmap                                           |    1 +
+ .../bpf/standardization/instruction-set.rst        |   80 +-
+ Documentation/bpf/verifier.rst                     |    2 +-
+ Documentation/dev-tools/kselftest.rst              |   12 +
+ Documentation/devicetree/bindings/leds/common.yaml |   12 +
+ .../devicetree/bindings/leds/leds-bcm63138.yaml    |    4 -
+ .../devicetree/bindings/leds/leds-bcm6328.yaml     |    4 -
+ .../devicetree/bindings/leds/leds-bcm6358.txt      |    2 -
+ .../bindings/leds/leds-pwm-multicolor.yaml         |    4 -
+ .../devicetree/bindings/leds/leds-pwm.yaml         |    5 -
+ .../devicetree/bindings/net/nfc/ti,trf7970a.yaml   |    2 +-
+ .../devicetree/bindings/net/qca,qca808x.yaml       |   54 +
+ .../devicetree/bindings/net/snps,dwmac.yaml        |   11 +-
+ .../bindings/net/starfive,jh7110-dwmac.yaml        |   72 +-
+ Documentation/networking/devlink/mlx5.rst          |    4 +
+ MAINTAINERS                                        |   10 +
+ arch/arm64/net/bpf_jit_comp.c                      |    5 +
+ arch/x86/net/bpf_jit_comp.c                        |    5 +
+ drivers/dpll/dpll_core.c                           |    2 +-
+ drivers/media/rc/bpf-lirc.c                        |    2 +-
+ drivers/net/arcnet/arcnet.c                        |    1 +
+ drivers/net/dsa/Kconfig                            |    2 +-
+ drivers/net/dsa/b53/b53_common.c                   |   10 +-
+ drivers/net/dsa/b53/b53_priv.h                     |    6 +-
+ drivers/net/dsa/bcm_sf2.c                          |    2 +-
+ drivers/net/dsa/microchip/ksz8795.c                |  410 +++++---
+ drivers/net/dsa/microchip/ksz8795_reg.h            |    1 +
+ drivers/net/dsa/microchip/ksz_common.c             |    4 +-
+ drivers/net/dsa/mt7530-mdio.c                      |    7 +-
+ drivers/net/dsa/mt7530.c                           |  173 ++--
+ drivers/net/dsa/mt7530.h                           |   16 +-
+ drivers/net/dsa/mv88e6xxx/chip.c                   |    4 +-
+ drivers/net/dsa/qca/qca8k-common.c                 |    4 +-
+ drivers/net/dsa/qca/qca8k.h                        |    4 +-
+ .../net/ethernet/aquantia/atlantic/aq_ethtool.c    |   12 +-
+ drivers/net/ethernet/broadcom/asp2/bcmasp.h        |    2 +-
+ .../net/ethernet/broadcom/asp2/bcmasp_ethtool.c    |    8 +-
+ drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c    |    9 +-
+ .../net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c    |   14 +-
+ drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c   |   14 +-
+ drivers/net/ethernet/broadcom/bnxt/bnxt.c          |   20 +-
+ drivers/net/ethernet/broadcom/bnxt/bnxt.h          |    2 +-
+ drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c  |   20 +-
+ drivers/net/ethernet/broadcom/genet/bcmgenet.c     |    8 +-
+ drivers/net/ethernet/broadcom/genet/bcmgenet.h     |    2 +-
+ drivers/net/ethernet/broadcom/tg3.c                |   32 +-
+ drivers/net/ethernet/broadcom/tg3.h                |    2 +-
+ drivers/net/ethernet/ec_bhf.c                      |    1 +
+ drivers/net/ethernet/engleder/tsnep_main.c         |   10 +-
+ drivers/net/ethernet/freescale/enetc/enetc.c       |    4 +-
+ drivers/net/ethernet/freescale/fec.h               |    2 +-
+ drivers/net/ethernet/freescale/fec_main.c          |   10 +-
+ drivers/net/ethernet/freescale/gianfar.c           |    4 +-
+ drivers/net/ethernet/google/gve/gve.h              |  144 ++-
+ drivers/net/ethernet/google/gve/gve_dqo.h          |   18 +-
+ drivers/net/ethernet/google/gve/gve_main.c         |  862 ++++++++++------
+ drivers/net/ethernet/google/gve/gve_rx.c           |  135 ++-
+ drivers/net/ethernet/google/gve/gve_rx_dqo.c       |   91 +-
+ drivers/net/ethernet/google/gve/gve_tx.c           |  128 ++-
+ drivers/net/ethernet/google/gve/gve_tx_dqo.c       |  108 +-
+ drivers/net/ethernet/google/gve/gve_utils.c        |   31 +
+ drivers/net/ethernet/google/gve/gve_utils.h        |    5 +
+ drivers/net/ethernet/intel/e1000e/ethtool.c        |   16 +-
+ drivers/net/ethernet/intel/i40e/i40e_ethtool.c     |   16 +-
+ drivers/net/ethernet/intel/ice/ice.h               |    1 -
+ drivers/net/ethernet/intel/ice/ice_ethtool.c       |    2 +-
+ drivers/net/ethernet/intel/ice/ice_main.c          |    4 +-
+ drivers/net/ethernet/intel/ice/ice_ptp.c           |  233 +++--
+ drivers/net/ethernet/intel/ice/ice_ptp.h           |   34 +-
+ drivers/net/ethernet/intel/igb/igb_ethtool.c       |   28 +-
+ drivers/net/ethernet/intel/igc/igc.h               |    2 +-
+ drivers/net/ethernet/intel/igc/igc_ethtool.c       |   20 +-
+ drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c   |   28 +-
+ drivers/net/ethernet/marvell/mvneta.c              |    4 +-
+ drivers/net/ethernet/marvell/octeontx2/af/mbox.h   |    1 +
+ drivers/net/ethernet/marvell/octeontx2/af/npc.h    |   15 +-
+ .../ethernet/marvell/octeontx2/af/npc_profile.h    |  621 ++++++++++--
+ .../net/ethernet/marvell/octeontx2/af/rvu_nix.c    |    7 +
+ .../mellanox/mlxsw/core_acl_flex_actions.c         |   16 +-
+ .../ethernet/mellanox/mlxsw/core_acl_flex_keys.c   |    9 +-
+ drivers/net/ethernet/mellanox/mlxsw/minimal.c      |    1 -
+ drivers/net/ethernet/mellanox/mlxsw/spectrum.c     |  160 +--
+ drivers/net/ethernet/mellanox/mlxsw/spectrum.h     |   15 +-
+ drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c |   11 +-
+ .../ethernet/mellanox/mlxsw/spectrum_acl_tcam.c    |   17 +-
+ .../net/ethernet/mellanox/mlxsw/spectrum_router.c  |   15 +-
+ .../ethernet/mellanox/mlxsw/spectrum_switchdev.c   |    8 +-
+ drivers/net/ethernet/microchip/encx24j600-regmap.c |    1 +
+ drivers/net/ethernet/microchip/lan743x_ethtool.c   |    4 +-
+ .../microchip/lan966x/lan966x_vcap_debugfs.c       |    2 +
+ drivers/net/ethernet/mscc/ocelot.c                 |    1 +
+ drivers/net/ethernet/qlogic/qede/qede_ethtool.c    |   32 +-
+ drivers/net/ethernet/qualcomm/emac/emac.c          |    1 +
+ drivers/net/ethernet/qualcomm/qca_7k.c             |   17 +-
+ drivers/net/ethernet/qualcomm/qca_7k.h             |   16 +-
+ drivers/net/ethernet/qualcomm/qca_7k_common.c      |   17 +-
+ drivers/net/ethernet/qualcomm/qca_7k_common.h      |   29 +-
+ drivers/net/ethernet/qualcomm/qca_debug.c          |   21 +-
+ drivers/net/ethernet/qualcomm/qca_debug.h          |   15 +-
+ drivers/net/ethernet/qualcomm/qca_spi.c            |   71 +-
+ drivers/net/ethernet/qualcomm/qca_spi.h            |   22 +-
+ drivers/net/ethernet/qualcomm/qca_uart.c           |   17 +-
+ drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c |    1 +
+ drivers/net/ethernet/realtek/r8169_main.c          |    4 +-
+ drivers/net/ethernet/samsung/sxgbe/sxgbe_ethtool.c |    4 +-
+ drivers/net/ethernet/smsc/smc91x.c                 |    1 +
+ drivers/net/ethernet/smsc/smsc911x.c               |    1 +
+ drivers/net/ethernet/smsc/smsc9420.c               |    1 +
+ drivers/net/ethernet/stmicro/stmmac/Kconfig        |    6 +-
+ drivers/net/ethernet/stmicro/stmmac/common.h       |    2 +
+ .../net/ethernet/stmicro/stmmac/dwmac-socfpga.c    |    1 +
+ .../net/ethernet/stmicro/stmmac/dwmac-starfive.c   |   32 +-
+ drivers/net/ethernet/stmicro/stmmac/stmmac_est.c   |    6 +
+ .../net/ethernet/stmicro/stmmac/stmmac_ethtool.c   |    4 +-
+ drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  |   22 +
+ drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c    |   87 +-
+ drivers/net/ethernet/ti/am65-cpsw-ethtool.c        |    4 +-
+ drivers/net/ethernet/ti/cpsw-common.c              |    1 +
+ drivers/net/ethernet/ti/cpsw_ethtool.c             |    4 +-
+ drivers/net/ethernet/ti/cpsw_priv.h                |    4 +-
+ drivers/net/ethernet/ti/icssg/icssg_ethtool.c      |    4 +-
+ drivers/net/ethernet/wangxun/libwx/wx_hw.c         |    2 -
+ drivers/net/ethernet/wangxun/libwx/wx_lib.c        |   20 +-
+ drivers/net/ethernet/wangxun/libwx/wx_type.h       |    1 -
+ drivers/net/ethernet/wangxun/txgbe/Makefile        |    1 +
+ drivers/net/ethernet/wangxun/txgbe/txgbe_irq.c     |  269 +++++
+ drivers/net/ethernet/wangxun/txgbe/txgbe_irq.h     |    7 +
+ drivers/net/ethernet/wangxun/txgbe/txgbe_main.c    |  140 +--
+ drivers/net/ethernet/wangxun/txgbe/txgbe_phy.c     |   59 +-
+ drivers/net/ethernet/wangxun/txgbe/txgbe_phy.h     |    2 +
+ drivers/net/ethernet/wangxun/txgbe/txgbe_type.h    |   17 +
+ drivers/net/pcs/pcs-lynx.c                         |    1 +
+ drivers/net/pcs/pcs-mtk-lynxi.c                    |    1 +
+ drivers/net/pcs/pcs-xpcs.c                         |    1 +
+ drivers/net/phy/at803x.c                           |  327 ++++++
+ drivers/net/phy/marvell.c                          |    2 +-
+ drivers/net/phy/micrel.c                           |   61 +-
+ drivers/net/phy/phy-c45.c                          |   44 +-
+ drivers/net/phy/phy.c                              |    8 +-
+ drivers/net/phy/phy_device.c                       |   16 +
+ drivers/net/phy/phylink.c                          |    8 +-
+ drivers/net/tun.c                                  |    7 +-
+ drivers/net/usb/ax88179_178a.c                     |   20 +-
+ drivers/net/usb/lan78xx.c                          |    4 +-
+ drivers/net/usb/r8152.c                            |   28 +-
+ drivers/net/wireless/broadcom/b43/b43.h            |   16 +
+ drivers/net/wireless/broadcom/b43/dma.c            |    4 +-
+ drivers/net/wireless/broadcom/b43/main.c           |   16 +-
+ drivers/net/wireless/broadcom/b43/pio.c            |    6 +-
+ .../broadcom/brcm80211/brcmfmac/bca/core.c         |   30 +-
+ .../broadcom/brcm80211/brcmfmac/cfg80211.c         |   64 +-
+ .../broadcom/brcm80211/brcmfmac/cfg80211.h         |    2 +
+ .../wireless/broadcom/brcm80211/brcmfmac/common.c  |   18 +-
+ .../wireless/broadcom/brcm80211/brcmfmac/core.c    |   12 +-
+ .../wireless/broadcom/brcm80211/brcmfmac/core.h    |    2 +-
+ .../broadcom/brcm80211/brcmfmac/cyw/core.c         |   50 +-
+ .../wireless/broadcom/brcm80211/brcmfmac/feature.c |   11 +-
+ .../wireless/broadcom/brcm80211/brcmfmac/fweh.c    |  154 ++-
+ .../wireless/broadcom/brcm80211/brcmfmac/fweh.h    |   60 +-
+ .../wireless/broadcom/brcm80211/brcmfmac/fwil.c    |  116 +--
+ .../wireless/broadcom/brcm80211/brcmfmac/fwil.h    |  125 ++-
+ .../broadcom/brcm80211/brcmfmac/fwil_types.h       |    2 +-
+ .../wireless/broadcom/brcm80211/brcmfmac/fwvid.c   |   13 +-
+ .../wireless/broadcom/brcm80211/brcmfmac/fwvid.h   |   48 +-
+ .../broadcom/brcm80211/brcmfmac/wcc/core.c         |   31 +-
+ .../broadcom/brcm80211/brcmsmac/phy/phy_cmn.c      |    3 +-
+ .../broadcom/brcm80211/brcmsmac/phy/phy_int.h      |    2 +-
+ .../broadcom/brcm80211/brcmsmac/phy/phy_n.c        |   11 +-
+ drivers/net/wireless/intel/iwlegacy/common.c       |    4 +-
+ drivers/net/wireless/marvell/mwifiex/cfg80211.c    |    2 +-
+ drivers/net/wireless/marvell/mwifiex/debugfs.c     |    3 -
+ drivers/net/wireless/marvell/mwifiex/wmm.c         |    2 +-
+ drivers/net/wireless/microchip/wilc1000/cfg80211.c |   12 +-
+ drivers/net/wireless/microchip/wilc1000/hif.c      |   40 +-
+ drivers/net/wireless/microchip/wilc1000/netdev.c   |   12 +-
+ drivers/net/wireless/microchip/wilc1000/wlan.c     |   35 +-
+ drivers/net/wireless/microchip/wilc1000/wlan.h     |    6 +
+ drivers/net/wireless/ralink/rt2x00/rt2x00crypto.c  |    5 +-
+ drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu.h   |   20 +-
+ .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_8188e.c |    3 +-
+ .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_8188f.c |    2 +
+ .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_8192c.c |    1 +
+ .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_8192e.c |    1 +
+ .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_8192f.c |   33 +-
+ .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_8710b.c |    1 +
+ .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_8723a.c |    1 +
+ .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_8723b.c |    1 +
+ .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c  |  409 ++++++--
+ .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_regs.h  |   15 +
+ drivers/net/wireless/realtek/rtlwifi/efuse.c       |   36 +-
+ drivers/net/wireless/realtek/rtlwifi/efuse.h       |    4 +-
+ drivers/net/wireless/realtek/rtlwifi/pci.c         |   12 +-
+ .../net/wireless/realtek/rtlwifi/rtl8192ce/trx.c   |    4 -
+ .../net/wireless/realtek/rtlwifi/rtl8192cu/sw.c    |    6 +-
+ .../net/wireless/realtek/rtlwifi/rtl8192cu/trx.c   |    3 -
+ .../net/wireless/realtek/rtlwifi/rtl8192de/trx.c   |    5 +-
+ .../net/wireless/realtek/rtlwifi/rtl8723ae/trx.c   |    6 +-
+ drivers/net/wireless/realtek/rtlwifi/usb.c         |  164 +--
+ drivers/net/wireless/realtek/rtlwifi/wifi.h        |   38 +-
+ drivers/net/wireless/realtek/rtw88/debug.c         |   44 +-
+ drivers/net/wireless/realtek/rtw88/pci.c           |    4 +
+ drivers/net/wireless/realtek/rtw88/reg.h           |    3 +
+ drivers/net/wireless/realtek/rtw89/cam.c           |   61 ++
+ drivers/net/wireless/realtek/rtw89/cam.h           |  109 ++
+ drivers/net/wireless/realtek/rtw89/chan.c          |    2 +-
+ drivers/net/wireless/realtek/rtw89/core.c          |  344 +++++--
+ drivers/net/wireless/realtek/rtw89/core.h          |  136 ++-
+ drivers/net/wireless/realtek/rtw89/fw.c            |  944 ++++++++++++++++--
+ drivers/net/wireless/realtek/rtw89/fw.h            |  810 ++++++++-------
+ drivers/net/wireless/realtek/rtw89/mac.c           |   96 +-
+ drivers/net/wireless/realtek/rtw89/mac.h           |    5 +-
+ drivers/net/wireless/realtek/rtw89/mac80211.c      |   18 +-
+ drivers/net/wireless/realtek/rtw89/mac_be.c        |    4 +-
+ drivers/net/wireless/realtek/rtw89/pci.c           |   69 +-
+ drivers/net/wireless/realtek/rtw89/pci.h           |    1 +
+ drivers/net/wireless/realtek/rtw89/phy.c           |   46 +-
+ drivers/net/wireless/realtek/rtw89/phy.h           |   72 ++
+ drivers/net/wireless/realtek/rtw89/phy_be.c        |  312 ++++++
+ drivers/net/wireless/realtek/rtw89/reg.h           |  278 +++++-
+ drivers/net/wireless/realtek/rtw89/rtw8851b.c      |   15 +-
+ .../net/wireless/realtek/rtw89/rtw8851b_table.c    |   72 +-
+ drivers/net/wireless/realtek/rtw89/rtw8852a.c      |   11 +-
+ drivers/net/wireless/realtek/rtw89/rtw8852b.c      |   15 +-
+ .../net/wireless/realtek/rtw89/rtw8852b_table.c    |  142 +--
+ drivers/net/wireless/realtek/rtw89/rtw8852c.c      |   14 +-
+ drivers/net/wireless/realtek/rtw89/rtw8922a.c      |  705 ++++++++++++-
+ drivers/net/wireless/realtek/rtw89/wow.c           |    2 +-
+ drivers/ptp/Kconfig                                |   12 +
+ drivers/ptp/Makefile                               |    1 +
+ drivers/ptp/ptp_clock.c                            |   16 +-
+ drivers/ptp/ptp_fc3.c                              | 1016 +++++++++++++++++++
+ drivers/ptp/ptp_fc3.h                              |   45 +
+ drivers/ptp/ptp_sysfs.c                            |   13 +-
+ include/linux/bpf.h                                |  142 ++-
+ include/linux/bpf_verifier.h                       |    3 +-
+ include/linux/btf.h                                |   13 +
+ include/linux/ethtool.h                            |   17 +-
+ include/linux/filter.h                             |    3 +-
+ include/linux/inet_diag.h                          |    1 +
+ include/linux/lsm_hook_defs.h                      |   15 +-
+ include/linux/mfd/idtRC38xxx_reg.h                 |  273 +++++
+ include/linux/phy.h                                |   30 +-
+ include/linux/phylink.h                            |    4 +-
+ include/linux/ptp_clock_kernel.h                   |    3 +
+ include/linux/security.h                           |   43 +-
+ include/linux/sock_diag.h                          |   10 +-
+ include/linux/stmmac.h                             |    1 +
+ include/net/af_unix.h                              |   14 +-
+ include/net/dsa.h                                  |    4 +-
+ include/net/ip6_fib.h                              |    6 -
+ include/net/netfilter/nf_tables.h                  |    6 +
+ include/net/request_sock.h                         |   39 +
+ include/net/scm.h                                  |    1 +
+ include/net/sock.h                                 |   25 -
+ include/net/tcp.h                                  |   45 +
+ include/uapi/linux/bpf.h                           |   78 +-
+ include/uapi/linux/netfilter/nf_tables.h           |    6 +-
+ include/uapi/linux/ptp_clock.h                     |   13 +-
+ kernel/bpf/Makefile                                |    2 +-
+ kernel/bpf/arraymap.c                              |    2 +-
+ kernel/bpf/bpf_lsm.c                               |   15 +-
+ kernel/bpf/bpf_struct_ops.c                        |  447 +++++----
+ kernel/bpf/bpf_struct_ops_types.h                  |   12 -
+ kernel/bpf/btf.c                                   |  276 ++++-
+ kernel/bpf/cgroup.c                                |    6 +-
+ kernel/bpf/core.c                                  |   13 +-
+ kernel/bpf/helpers.c                               |    7 +-
+ kernel/bpf/inode.c                                 |  276 ++++-
+ kernel/bpf/syscall.c                               |  234 +++--
+ kernel/bpf/token.c                                 |  278 ++++++
+ kernel/bpf/verifier.c                              |  148 ++-
+ kernel/trace/bpf_trace.c                           |   17 +-
+ net/bpf/bpf_dummy_struct_ops.c                     |   22 +-
+ net/bridge/netfilter/Kconfig                       |    7 +
+ net/bridge/netfilter/Makefile                      |    2 +-
+ net/core/dev.c                                     |   27 +-
+ net/core/dev.h                                     |    1 +
+ net/core/filter.c                                  |  155 ++-
+ net/core/scm.c                                     |    5 +
+ net/core/sock.c                                    |   14 +-
+ net/core/sock_diag.c                               |  120 ++-
+ net/core/xdp.c                                     |    6 +-
+ net/dccp/diag.c                                    |    1 +
+ net/dsa/user.c                                     |    4 +-
+ net/ethtool/common.c                               |    5 +
+ net/ethtool/common.h                               |    1 +
+ net/ethtool/eee.c                                  |   75 +-
+ net/ethtool/ioctl.c                                |   69 +-
+ net/ieee802154/6lowpan/core.c                      |    1 +
+ net/ieee802154/socket.c                            |    1 +
+ net/ipv4/bpf_tcp_ca.c                              |   22 +-
+ net/ipv4/inet_diag.c                               |  101 +-
+ net/ipv4/netfilter/Kconfig                         |   43 +-
+ net/ipv4/netfilter/Makefile                        |    2 +-
+ net/ipv4/raw_diag.c                                |    1 +
+ net/ipv4/syncookies.c                              |   40 +-
+ net/ipv4/tcp_diag.c                                |    1 +
+ net/ipv4/tcp_input.c                               |   18 +-
+ net/ipv4/udp_diag.c                                |    2 +
+ net/ipv6/ip6_fib.c                                 |   19 +-
+ net/ipv6/netfilter/Kconfig                         |   20 +-
+ net/ipv6/netfilter/Makefile                        |    2 +-
+ net/ipv6/route.c                                   |    8 +-
+ net/ipv6/syncookies.c                              |   13 +-
+ net/mptcp/mptcp_diag.c                             |    1 +
+ net/netfilter/Kconfig                              |   12 +-
+ net/netfilter/ipvs/ip_vs_conn.c                    |    4 +-
+ net/netfilter/nf_bpf_link.c                        |    2 +-
+ net/netfilter/nf_conncount.c                       |    8 +-
+ net/netfilter/nf_tables_api.c                      |   35 +-
+ net/netlink/diag.c                                 |    1 +
+ net/packet/diag.c                                  |    1 +
+ net/rds/connection.c                               |    4 +-
+ net/sched/sch_taprio.c                             |   72 +-
+ net/sctp/diag.c                                    |    1 +
+ net/smc/smc_diag.c                                 |    1 +
+ net/tipc/diag.c                                    |    1 +
+ net/tipc/node.c                                    |    2 -
+ net/tipc/socket.c                                  |    1 -
+ net/unix/af_unix.c                                 |   10 +-
+ net/unix/diag.c                                    |    1 +
+ net/unix/garbage.c                                 |   98 +-
+ net/unix/scm.c                                     |   27 +-
+ net/vmw_vsock/diag.c                               |    1 +
+ net/xdp/xsk_diag.c                                 |    1 +
+ rust/kernel/net/phy.rs                             |   24 +-
+ security/security.c                                |  101 +-
+ security/selinux/hooks.c                           |   47 +-
+ tools/bpf/bpftool/link.c                           |   96 +-
+ tools/bpf/bpftool/prog.c                           |    2 +-
+ tools/include/uapi/linux/bpf.h                     |   79 +-
+ tools/lib/bpf/Build                                |    2 +-
+ tools/lib/bpf/bpf.c                                |   42 +-
+ tools/lib/bpf/bpf.h                                |   38 +-
+ tools/lib/bpf/bpf_core_read.h                      |    2 +-
+ tools/lib/bpf/btf.c                                |   10 +-
+ tools/lib/bpf/elf.c                                |    2 -
+ tools/lib/bpf/features.c                           |  503 ++++++++++
+ tools/lib/bpf/libbpf.c                             |  604 +++--------
+ tools/lib/bpf/libbpf.h                             |   21 +-
+ tools/lib/bpf/libbpf.map                           |    1 +
+ tools/lib/bpf/libbpf_internal.h                    |   50 +-
+ tools/lib/bpf/libbpf_probes.c                      |   12 +-
+ tools/lib/bpf/str_error.h                          |    3 +
+ tools/testing/selftests/Makefile                   |    7 +-
+ tools/testing/selftests/bpf/README.rst             |   32 +-
+ tools/testing/selftests/bpf/bpf_experimental.h     |   21 +-
+ tools/testing/selftests/bpf/bpf_kfuncs.h           |   10 +
+ .../selftests/bpf/bpf_testmod/bpf_testmod.c        |   75 ++
+ .../selftests/bpf/bpf_testmod/bpf_testmod.h        |    5 +
+ tools/testing/selftests/bpf/config                 |    1 +
+ .../selftests/bpf/prog_tests/bpf_verif_scale.c     |    2 +-
+ .../testing/selftests/bpf/prog_tests/ctx_rewrite.c |   44 -
+ .../selftests/bpf/prog_tests/fill_link_info.c      |  114 ++-
+ .../selftests/bpf/prog_tests/kptr_xchg_inline.c    |   51 +
+ .../selftests/bpf/prog_tests/libbpf_probes.c       |    4 +
+ .../testing/selftests/bpf/prog_tests/libbpf_str.c  |    6 +
+ .../testing/selftests/bpf/prog_tests/reg_bounds.c  |    2 +-
+ .../testing/selftests/bpf/prog_tests/tc_redirect.c |   90 +-
+ .../bpf/prog_tests/tcp_custom_syncookie.c          |  150 +++
+ .../bpf/prog_tests/test_struct_ops_module.c        |   75 ++
+ tools/testing/selftests/bpf/prog_tests/token.c     | 1052 ++++++++++++++++++++
+ tools/testing/selftests/bpf/prog_tests/xdpwall.c   |    2 +-
+ tools/testing/selftests/bpf/progs/bpf_misc.h       |    2 +-
+ .../testing/selftests/bpf/progs/bpf_tracing_net.h  |   16 +
+ tools/testing/selftests/bpf/progs/iters.c          |    4 +-
+ .../testing/selftests/bpf/progs/kptr_xchg_inline.c |   48 +
+ tools/testing/selftests/bpf/progs/priv_map.c       |   13 +
+ tools/testing/selftests/bpf/progs/priv_prog.c      |   13 +
+ .../selftests/bpf/progs/struct_ops_module.c        |   30 +
+ .../selftests/bpf/progs/test_core_reloc_type_id.c  |    2 +-
+ .../selftests/bpf/progs/test_fill_link_info.c      |    6 +
+ .../testing/selftests/bpf/progs/test_map_in_map.c  |   26 +
+ tools/testing/selftests/bpf/progs/test_siphash.h   |   64 ++
+ .../bpf/progs/test_tcp_custom_syncookie.c          |  572 +++++++++++
+ .../bpf/progs/test_tcp_custom_syncookie.h          |  140 +++
+ .../testing/selftests/bpf/progs/test_tcpbpf_kern.c |    2 +-
+ .../testing/selftests/bpf/progs/test_xdp_dynptr.c  |   10 +-
+ tools/testing/selftests/bpf/progs/token_lsm.c      |   32 +
+ .../bpf/progs/verifier_direct_packet_access.c      |    2 +-
+ .../testing/selftests/bpf/progs/verifier_loops1.c  |   24 +
+ .../selftests/bpf/progs/verifier_spill_fill.c      |  229 ++++-
+ tools/testing/selftests/bpf/test_loader.c          |    4 +-
+ tools/testing/selftests/bpf/test_maps.c            |    6 +-
+ tools/testing/selftests/bpf/test_progs.c           |   18 -
+ tools/testing/selftests/bpf/test_sock_addr.c       |    3 +-
+ tools/testing/selftests/bpf/test_verifier.c        |   60 +-
+ tools/testing/selftests/bpf/testing_helpers.c      |   92 +-
+ tools/testing/selftests/bpf/testing_helpers.h      |    8 +
+ .../selftests/bpf/verifier/bpf_loop_inline.c       |    6 +
+ tools/testing/selftests/bpf/verifier/precise.c     |    6 +-
+ .../testing/selftests/drivers/net/bonding/Makefile |    7 +-
+ .../drivers/net/bonding/bond-eth-type-change.sh    |    2 +-
+ .../drivers/net/bonding/bond_topo_2d1c.sh          |    2 +-
+ .../drivers/net/bonding/dev_addr_lists.sh          |    2 +-
+ .../drivers/net/bonding/mode-1-recovery-updelay.sh |    2 +-
+ .../drivers/net/bonding/mode-2-recovery-updelay.sh |    2 +-
+ .../drivers/net/bonding/net_forwarding_lib.sh      |    1 -
+ tools/testing/selftests/drivers/net/dsa/Makefile   |   18 +-
+ .../drivers/net/dsa/bridge_locked_port.sh          |    2 +-
+ .../selftests/drivers/net/dsa/bridge_mdb.sh        |    2 +-
+ .../selftests/drivers/net/dsa/bridge_mld.sh        |    2 +-
+ .../selftests/drivers/net/dsa/bridge_vlan_aware.sh |    2 +-
+ .../selftests/drivers/net/dsa/bridge_vlan_mcast.sh |    2 +-
+ .../drivers/net/dsa/bridge_vlan_unaware.sh         |    2 +-
+ tools/testing/selftests/drivers/net/dsa/lib.sh     |    1 -
+ .../selftests/drivers/net/dsa/local_termination.sh |    2 +-
+ .../selftests/drivers/net/dsa/no_forwarding.sh     |    2 +-
+ .../drivers/net/dsa/run_net_forwarding_test.sh     |    9 +
+ .../selftests/drivers/net/dsa/tc_actions.sh        |    2 +-
+ .../testing/selftests/drivers/net/dsa/tc_common.sh |    1 -
+ .../drivers/net/dsa/test_bridge_fdb_stress.sh      |    2 +-
+ tools/testing/selftests/drivers/net/team/Makefile  |    7 +-
+ .../selftests/drivers/net/team/dev_addr_lists.sh   |    4 +-
+ .../testing/selftests/drivers/net/team/lag_lib.sh  |    1 -
+ .../drivers/net/team/net_forwarding_lib.sh         |    1 -
+ tools/testing/selftests/lib.mk                     |   19 +
+ tools/testing/selftests/net/fcnal-test.sh          |   25 +-
+ tools/testing/selftests/net/forwarding/Makefile    |    3 +
+ tools/testing/selftests/net/forwarding/config      |   28 +
+ tools/testing/selftests/net/forwarding/lib.sh      |   37 +-
+ .../selftests/net/forwarding/mirror_gre_lib.sh     |    2 +-
+ .../net/forwarding/mirror_gre_topo_lib.sh          |    2 +-
+ tools/testing/selftests/net/fq_band_pktlimit.sh    |   14 +-
+ tools/testing/selftests/net/txtimestamp.sh         |   12 +-
+ tools/testing/selftests/tc-testing/config          |    1 +
+ .../selftests/tc-testing/tc-tests/qdiscs/fq.json   |    2 +-
+ .../tc-testing/tc-tests/qdiscs/taprio.json         |    2 +
+ tools/testing/selftests/tc-testing/tdc.py          |    2 +-
+ tools/testing/selftests/tc-testing/tdc.sh          |    3 +-
+ tools/testing/vsock/util.c                         |   17 +-
+ tools/testing/vsock/util.h                         |    4 +
+ tools/testing/vsock/vsock_diag_test.c              |   23 +-
+ tools/testing/vsock/vsock_test.c                   |  102 +-
+ tools/testing/vsock/vsock_test_zerocopy.c          |   12 +-
+ tools/testing/vsock/vsock_uring_test.c             |   17 +-
+ 436 files changed, 16458 insertions(+), 4928 deletions(-)
+ create mode 100644 Documentation/devicetree/bindings/net/qca,qca808x.yaml
+ create mode 100644 drivers/net/ethernet/wangxun/txgbe/txgbe_irq.c
+ create mode 100644 drivers/net/ethernet/wangxun/txgbe/txgbe_irq.h
+ create mode 100644 drivers/ptp/ptp_fc3.c
+ create mode 100644 drivers/ptp/ptp_fc3.h
+ create mode 100644 include/linux/mfd/idtRC38xxx_reg.h
+ delete mode 100644 kernel/bpf/bpf_struct_ops_types.h
+ create mode 100644 kernel/bpf/token.c
+ create mode 100644 tools/lib/bpf/features.c
+ create mode 100644 tools/testing/selftests/bpf/prog_tests/kptr_xchg_inline.c
+ create mode 100644 tools/testing/selftests/bpf/prog_tests/tcp_custom_syncookie.c
+ create mode 100644 tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c
+ create mode 100644 tools/testing/selftests/bpf/prog_tests/token.c
+ create mode 100644 tools/testing/selftests/bpf/progs/kptr_xchg_inline.c
+ create mode 100644 tools/testing/selftests/bpf/progs/priv_map.c
+ create mode 100644 tools/testing/selftests/bpf/progs/priv_prog.c
+ create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_module.c
+ create mode 100644 tools/testing/selftests/bpf/progs/test_siphash.h
+ create mode 100644 tools/testing/selftests/bpf/progs/test_tcp_custom_syncookie.c
+ create mode 100644 tools/testing/selftests/bpf/progs/test_tcp_custom_syncookie.h
+ create mode 100644 tools/testing/selftests/bpf/progs/token_lsm.c
+ delete mode 120000 tools/testing/selftests/drivers/net/bonding/net_forwarding_lib.sh
+ delete mode 120000 tools/testing/selftests/drivers/net/dsa/lib.sh
+ create mode 100755 tools/testing/selftests/drivers/net/dsa/run_net_forwarding_test.sh
+ delete mode 120000 tools/testing/selftests/drivers/net/dsa/tc_common.sh
+ delete mode 120000 tools/testing/selftests/drivers/net/team/lag_lib.sh
+ delete mode 120000 tools/testing/selftests/drivers/net/team/net_forwarding_lib.sh
+Merging bpf-next/for-next (cd1c194ffe28 Merge branch 'annotate-kfuncs-in-btf_ids-section')
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git bpf-next/for-next
+Auto-merging drivers/hid/bpf/hid_bpf_dispatch.c
+Auto-merging net/core/xdp.c
+Merge made by the 'ort' strategy.
+ Documentation/bpf/kfuncs.rst                       |   8 +-
+ .../bpf/standardization/instruction-set.rst        |  13 +-
+ arch/riscv/net/bpf_jit.h                           | 134 +++++++++++++
+ arch/riscv/net/bpf_jit_comp64.c                    | 210 +++++++--------------
+ drivers/hid/bpf/hid_bpf_dispatch.c                 |   8 +-
+ fs/verity/measure.c                                |   4 +-
+ include/linux/bpf.h                                |   1 -
+ include/linux/bpf_verifier.h                       |   1 +
+ include/linux/btf_ids.h                            |  21 ++-
+ kernel/bpf/btf.c                                   | 138 +++++++++++---
+ kernel/bpf/cpumask.c                               |   4 +-
+ kernel/bpf/helpers.c                               |   8 +-
+ kernel/bpf/map_iter.c                              |   4 +-
+ kernel/bpf/token.c                                 |  16 +-
+ kernel/bpf/verifier.c                              |  24 +++
+ kernel/cgroup/rstat.c                              |   4 +-
+ kernel/events/core.c                               |   6 +-
+ kernel/trace/bpf_trace.c                           |   8 +-
+ net/bpf/test_run.c                                 |   8 +-
+ net/core/filter.c                                  |  20 +-
+ net/core/xdp.c                                     |   4 +-
+ net/ipv4/bpf_tcp_ca.c                              |   4 +-
+ net/ipv4/fou_bpf.c                                 |   4 +-
+ net/ipv4/tcp_bbr.c                                 |   4 +-
+ net/ipv4/tcp_cubic.c                               |   4 +-
+ net/ipv4/tcp_dctcp.c                               |   4 +-
+ net/netfilter/nf_conntrack_bpf.c                   |   4 +-
+ net/netfilter/nf_nat_bpf.c                         |   4 +-
+ net/xfrm/xfrm_interface_bpf.c                      |   4 +-
+ net/xfrm/xfrm_state_bpf.c                          |   4 +-
+ scripts/bpf_doc.py                                 |   2 +-
+ tools/bpf/bpftool/gen.c                            |   9 +-
+ tools/lib/bpf/bpf_core_read.h                      |  13 ++
+ tools/lib/bpf/bpf_helpers.h                        |   2 +
+ tools/lib/bpf/btf.c                                |  22 ++-
+ tools/lib/bpf/features.c                           |  58 ++++++
+ tools/lib/bpf/libbpf.c                             |  86 +++------
+ tools/lib/bpf/libbpf_internal.h                    |  16 ++
+ tools/testing/selftests/bpf/Makefile               |  31 ++-
+ tools/testing/selftests/bpf/bpf_kfuncs.h           |   2 +-
+ .../selftests/bpf/bpf_testmod/bpf_testmod.c        |  10 +-
+ .../selftests/bpf/prog_tests/decap_sanity.c        |   2 +-
+ .../testing/selftests/bpf/prog_tests/fib_lookup.c  |   2 +-
+ .../selftests/bpf/prog_tests/ip_check_defrag.c     |   4 +-
+ .../selftests/bpf/prog_tests/lwt_redirect.c        |   3 +-
+ .../testing/selftests/bpf/prog_tests/lwt_reroute.c |   2 +-
+ tools/testing/selftests/bpf/prog_tests/mptcp.c     |   2 +-
+ .../selftests/bpf/prog_tests/sock_destroy.c        |   2 +-
+ .../selftests/bpf/prog_tests/sock_iter_batch.c     |   4 +-
+ .../testing/selftests/bpf/prog_tests/test_tunnel.c |  18 +-
+ tools/testing/selftests/bpf/prog_tests/verifier.c  |   2 +
+ .../selftests/bpf/progs/connect_unix_prog.c        |   3 +-
+ .../selftests/bpf/progs/getpeername_unix_prog.c    |   3 +-
+ .../selftests/bpf/progs/getsockname_unix_prog.c    |   3 +-
+ .../selftests/bpf/progs/recvmsg_unix_prog.c        |   3 +-
+ .../selftests/bpf/progs/sendmsg_unix_prog.c        |   3 +-
+ .../selftests/bpf/progs/sk_storage_omem_uncharge.c |   4 +-
+ .../testing/selftests/bpf/progs/sock_iter_batch.c  |   4 +-
+ tools/testing/selftests/bpf/progs/type_cast.c      |  13 +-
+ .../selftests/bpf/progs/verifier_global_ptr_args.c | 156 +++++++++++++++
+ tools/testing/selftests/bpf/test_progs.h           |   7 +-
+ 61 files changed, 791 insertions(+), 380 deletions(-)
+ create mode 100644 tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c
+Merging ipsec-next/master (ab1e1a38de24 xfrm6_tunnel: Use KMEM_CACHE instead of kmem_cache_create)
+$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next.git ipsec-next/master
+Auto-merging net/xfrm/xfrm_policy.c
+Merge made by the 'ort' strategy.
+ net/ipv6/xfrm6_tunnel.c |   5 +-
+ net/xfrm/xfrm_policy.c  | 142 +++++++++++++++++++++++++++++++++++++++++++++++-
+ 2 files changed, 141 insertions(+), 6 deletions(-)
+Merging mlx5-next/mlx5-next (d727d27db536 RDMA/mlx5: Expose register c0 for RDMA device)
+$ git merge -m Merge branch 'mlx5-next' of git://git.kernel.org/pub/scm/linux/kernel/git/mellanox/linux.git mlx5-next/mlx5-next
+Already up to date.
+Merging netfilter-next/main (5264ab612e28 selftests/net: calibrate txtimestamp)
+$ git merge -m Merge branch 'main' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next.git netfilter-next/main
+Already up to date.
+Merging ipvs-next/main (7ad269787b66 netfilter: ebtables: allow xtables-nft only builds)
+$ git merge -m Merge branch 'main' of git://git.kernel.org/pub/scm/linux/kernel/git/horms/ipvs-next.git ipvs-next/main
+Already up to date.
+Merging bluetooth/master (64692e12507b Bluetooth: qca: Fix wrong event type for patch config command)
+$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth-next.git bluetooth/master
+Auto-merging drivers/bluetooth/btnxpuart.c
+Auto-merging drivers/bluetooth/btusb.c
+Merge made by the 'ort' strategy.
+ drivers/bluetooth/btintel.c      |   2 +-
+ drivers/bluetooth/btnxpuart.c    |  24 ++++++-
+ drivers/bluetooth/btqca.c        |   2 +-
+ drivers/bluetooth/btrtl.c        |  14 ++++
+ drivers/bluetooth/btusb.c        |   8 +++
+ drivers/bluetooth/hci_bcm4377.c  |   3 +-
+ include/net/bluetooth/hci.h      |   4 +-
+ include/net/bluetooth/hci_sync.h |   2 +-
+ net/bluetooth/hci_core.c         | 135 ++++++++++++++++++++++++++++-----------
+ net/bluetooth/hci_event.c        |  23 ++++---
+ net/bluetooth/hci_request.c      |   2 +-
+ net/bluetooth/hci_sock.c         |   4 +-
+ net/bluetooth/hci_sync.c         |  57 ++++++++++-------
+ net/bluetooth/l2cap_core.c       |   8 ++-
+ net/bluetooth/mgmt.c             |  36 +++++------
+ net/bluetooth/rfcomm/core.c      |   2 +-
+ 16 files changed, 226 insertions(+), 100 deletions(-)
+Merging wireless-next/for-next (3fbf61207c66 Revert "mlx5 updates 2023-12-20")
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless-next.git wireless-next/for-next
+Already up to date.
+Merging wpan-next/master (2373699560a7 mac802154: Avoid new associations while disassociating)
+$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/wpan/wpan-next.git wpan-next/master
+Already up to date.
+Merging wpan-staging/staging (2373699560a7 mac802154: Avoid new associations while disassociating)
+$ git merge -m Merge branch 'staging' of git://git.kernel.org/pub/scm/linux/kernel/git/wpan/wpan-next.git wpan-staging/staging
+Already up to date.
+Merging mtd/mtd/next (98d4fda8f2d4 Merge tag 'nand/for-6.8' into mtd/next)
+$ git merge -m Merge branch 'mtd/next' of git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git mtd/mtd/next
+Already up to date.
+Merging nand/nand/next (023e6aad7e5e mtd: rawnand: s3c2410: fix Excess struct member description kernel-doc warnings)
+$ git merge -m Merge branch 'nand/next' of git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git nand/nand/next
+Already up to date.
+Merging spi-nor/spi-nor/next (3c0e1dfa703c MAINTAINERS: change my mail to the kernel.org one)
+$ git merge -m Merge branch 'spi-nor/next' of git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git spi-nor/spi-nor/next
+Already up to date.
+Merging crypto/master (4d314d27130b dt-bindings: crypto: ice: Document SC7180 inline crypto engine)
+$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git crypto/master
+Merge made by the 'ort' strategy.
+ Documentation/ABI/testing/debugfs-hisi-hpre        |   7 +
+ Documentation/ABI/testing/debugfs-hisi-sec         |   7 +
+ Documentation/ABI/testing/debugfs-hisi-zip         |   7 +
+ .../bindings/crypto/qcom,inline-crypto-engine.yaml |   1 +
+ .../devicetree/bindings/crypto/qcom-qce.yaml       |   1 +
+ arch/arm64/crypto/Kconfig                          |   1 +
+ arch/arm64/crypto/aes-ce-ccm-core.S                | 265 ++++++++-------------
+ arch/arm64/crypto/aes-ce-ccm-glue.c                | 154 ++++++++----
+ arch/arm64/crypto/aes-glue.c                       |   1 +
+ arch/powerpc/crypto/Kconfig                        |  20 ++
+ arch/powerpc/crypto/Makefile                       |  20 +-
+ {drivers/crypto/vmx => arch/powerpc/crypto}/aes.c  |   0
+ .../crypto/vmx => arch/powerpc/crypto}/aes_cbc.c   |   0
+ .../crypto/vmx => arch/powerpc/crypto}/aes_ctr.c   |   0
+ .../crypto/vmx => arch/powerpc/crypto}/aes_xts.c   |   0
+ .../crypto/vmx => arch/powerpc/crypto}/aesp8-ppc.h |   0
+ .../vmx => arch/powerpc/crypto}/aesp8-ppc.pl       |   0
+ .../crypto/vmx => arch/powerpc/crypto}/ghash.c     |   0
+ .../vmx => arch/powerpc/crypto}/ghashp8-ppc.pl     |   0
+ {drivers/crypto/vmx => arch/powerpc/crypto}/vmx.c  |   0
+ crypto/asymmetric_keys/verify_pefile.c             |   4 +-
+ crypto/pcbc.c                                      |   4 +-
+ crypto/testmgr.c                                   |   8 -
+ drivers/crypto/Kconfig                             |  14 +-
+ drivers/crypto/Makefile                            |   2 +-
+ drivers/crypto/allwinner/sun8i-ce/sun8i-ce-hash.c  |   2 +-
+ drivers/crypto/hisilicon/debugfs.c                 |  53 +++++
+ drivers/crypto/hisilicon/hpre/hpre_main.c          |   2 +-
+ drivers/crypto/hisilicon/sec2/sec_main.c           |   2 +-
+ drivers/crypto/hisilicon/zip/zip_main.c            |   2 +-
+ drivers/crypto/intel/iaa/iaa_crypto.h              |  25 --
+ drivers/crypto/intel/iaa/iaa_crypto_comp_fixed.c   |   1 -
+ drivers/crypto/intel/iaa/iaa_crypto_main.c         | 108 +--------
+ drivers/crypto/intel/iaa/iaa_crypto_stats.c        |   2 -
+ .../crypto/intel/qat/qat_common/adf_gen4_hw_data.c |   3 +
+ drivers/crypto/intel/qat/qat_common/adf_isr.c      |   2 +-
+ .../crypto/virtio/virtio_crypto_akcipher_algs.c    |  12 +-
+ drivers/crypto/vmx/.gitignore                      |   3 -
+ drivers/crypto/vmx/Kconfig                         |  14 --
+ drivers/crypto/vmx/Makefile                        |  23 --
+ drivers/crypto/vmx/ppc-xlate.pl                    | 231 ------------------
+ include/crypto/public_key.h                        |   1 +
+ 42 files changed, 344 insertions(+), 658 deletions(-)
+ rename {drivers/crypto/vmx => arch/powerpc/crypto}/aes.c (100%)
+ rename {drivers/crypto/vmx => arch/powerpc/crypto}/aes_cbc.c (100%)
+ rename {drivers/crypto/vmx => arch/powerpc/crypto}/aes_ctr.c (100%)
+ rename {drivers/crypto/vmx => arch/powerpc/crypto}/aes_xts.c (100%)
+ rename {drivers/crypto/vmx => arch/powerpc/crypto}/aesp8-ppc.h (100%)
+ rename {drivers/crypto/vmx => arch/powerpc/crypto}/aesp8-ppc.pl (100%)
+ rename {drivers/crypto/vmx => arch/powerpc/crypto}/ghash.c (100%)
+ rename {drivers/crypto/vmx => arch/powerpc/crypto}/ghashp8-ppc.pl (100%)
+ rename {drivers/crypto/vmx => arch/powerpc/crypto}/vmx.c (100%)
+ delete mode 100644 drivers/crypto/vmx/.gitignore
+ delete mode 100644 drivers/crypto/vmx/Kconfig
+ delete mode 100644 drivers/crypto/vmx/Makefile
+ delete mode 100644 drivers/crypto/vmx/ppc-xlate.pl
+Merging drm/drm-next (41bccc98fb79 Linux 6.8-rc2)
+$ git merge -m Merge branch 'drm-next' of git://git.freedesktop.org/git/drm/drm.git drm/drm-next
+Already up to date.
+Merging drm-ci/topic/drm-ci (ad6bfe1b66a5 drm: ci: docs: fix build warning - add missing escape)
+$ git merge -m Merge branch 'topic/drm-ci' of git://git.freedesktop.org/git/drm/drm.git drm-ci/topic/drm-ci
+Already up to date.
+Merging drm-exynos/for-linux-next (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'for-linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/daeinki/drm-exynos.git drm-exynos/for-linux-next
+Already up to date.
+Merging drm-misc/for-linux-next (1f1626ac0428 drm/ttm: fix ttm pool initialization for no-dma-device drivers)
+$ git merge -m Merge branch 'for-linux-next' of git://anongit.freedesktop.org/drm/drm-misc drm-misc/for-linux-next
+Already up to date.
+Merging amdgpu/drm-next (9217b91c6458 drm/amdgpu: Reset IH OVERFLOW_CLEAR bit)
+$ git merge -m Merge branch 'drm-next' of https://gitlab.freedesktop.org/agd5f/linux amdgpu/drm-next
+Auto-merging drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+CONFLICT (content): Merge conflict in drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+Auto-merging drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+Auto-merging drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+Auto-merging drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+Auto-merging drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+Auto-merging drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+Auto-merging drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+Auto-merging drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+Auto-merging drivers/gpu/drm/amd/display/dc/core/dc.c
+Auto-merging drivers/gpu/drm/amd/display/dc/core/dc_resource.c
+Auto-merging drivers/gpu/drm/amd/display/dc/dc.h
+CONFLICT (content): Merge conflict in drivers/gpu/drm/amd/display/dc/dc.h
+Auto-merging drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c
+Auto-merging drivers/gpu/drm/amd/display/dc/link/link_dpms.c
+Auto-merging drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h
+Auto-merging drivers/gpu/drm/amd/include/amd_shared.h
+Auto-merging drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+Auto-merging drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
+Auto-merging drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+Auto-merging drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
+Auto-merging drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+Auto-merging drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c
+Resolved 'drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c' using previous resolution.
+Resolved 'drivers/gpu/drm/amd/display/dc/dc.h' using previous resolution.
+Automatic merge failed; fix conflicts and then commit the result.
+$ git commit --no-edit -v -a
+[master ea89d33405ba] Merge branch 'drm-next' of https://gitlab.freedesktop.org/agd5f/linux
+$ git diff -M --stat --summary HEAD^..
+ Documentation/gpu/amdgpu/dgpu-asic-info-table.csv  |   2 +
+ Documentation/gpu/amdgpu/display/dcn-blocks.rst    |  78 ++
+ .../gpu/amdgpu/display/display-contributing.rst    | 168 ++++
+ .../gpu/amdgpu/display/display-manager.rst         |   3 -
+ Documentation/gpu/amdgpu/display/index.rst         |  78 +-
+ drivers/gpu/drm/amd/amdgpu/Makefile                |   2 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu.h                |  14 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c            | 879 +++++++++++++++++++++
+ drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h            | 202 +++++
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c         |  37 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h         |  16 +-
+ .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c    |   2 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c   |  59 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c       |  24 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c   |   3 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.h   |   2 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c             |   6 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c            |   2 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c        |   8 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c         |  80 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c      |   6 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c        |   4 +
+ drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c            |   8 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c            |  35 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c            |   4 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c            |  55 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h            |   4 +
+ drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c            |   7 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c            |  17 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c            |  33 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h            |   1 +
+ drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c            |   2 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c            | 186 +++--
+ drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h            |   9 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c            | 689 ++++++++++++----
+ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h            |  60 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c           |  12 +
+ drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h           |   2 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c            |   2 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.h            |   2 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c          |  70 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.h          |   9 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c            | 155 +++-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h            |  10 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_umr.h            |   4 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.c       |   2 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c            |   2 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c           |  83 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h           |   7 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c           |   3 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c             |  69 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h             |  18 +-
+ drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c           |  81 +-
+ drivers/gpu/drm/amd/amdgpu/atom.c                  |  41 +-
+ drivers/gpu/drm/amd/amdgpu/atom.h                  |   2 +-
+ drivers/gpu/drm/amd/amdgpu/atombios_crtc.c         |  28 +-
+ drivers/gpu/drm/amd/amdgpu/atombios_dp.c           |   4 +-
+ drivers/gpu/drm/amd/amdgpu/atombios_encoders.c     |  16 +-
+ drivers/gpu/drm/amd/amdgpu/atombios_i2c.c          |   4 +-
+ drivers/gpu/drm/amd/amdgpu/cik_ih.c                |   6 +
+ drivers/gpu/drm/amd/amdgpu/clearstate_gfx9.h       |  27 +-
+ drivers/gpu/drm/amd/amdgpu/clearstate_si.h         |  24 +-
+ drivers/gpu/drm/amd/amdgpu/cz_ih.c                 |   5 +
+ drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c             |   4 +-
+ drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c             |  38 +-
+ drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c           |   2 +-
+ drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c              |   4 +-
+ drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c              |   4 +-
+ drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c              |   2 +-
+ drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c              |   5 +-
+ drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c            |  92 ++-
+ drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c             |   9 +-
+ drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c             |   9 +-
+ drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c              |   9 +-
+ drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c              |   9 +-
+ drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c              |   9 +-
+ drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c              |  16 +-
+ drivers/gpu/drm/amd/amdgpu/iceland_ih.c            |   5 +
+ drivers/gpu/drm/amd/amdgpu/ih_v6_0.c               |   6 +
+ drivers/gpu/drm/amd/amdgpu/ih_v6_1.c               |   7 +
+ drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c             |  10 +-
+ drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c            |  87 ++
+ drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c              |   3 +-
+ drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c              |  29 +-
+ drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h              |   1 +
+ drivers/gpu/drm/amd/amdgpu/navi10_ih.c             |   9 +-
+ drivers/gpu/drm/amd/amdgpu/psp_v13_0.c             |  99 +--
+ drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c             |  15 +-
+ drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c           |  72 ++
+ drivers/gpu/drm/amd/amdgpu/si_ih.c                 |   6 +
+ drivers/gpu/drm/amd/amdgpu/ta_ras_if.h             |  36 +
+ drivers/gpu/drm/amd/amdgpu/tonga_ih.c              |   6 +
+ drivers/gpu/drm/amd/amdgpu/umc_v12_0.c             | 252 ++++--
+ drivers/gpu/drm/amd/amdgpu/umc_v12_0.h             |   3 +
+ drivers/gpu/drm/amd/amdgpu/umc_v6_0.c              |   2 +-
+ drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c              |  17 -
+ drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c            |  19 -
+ drivers/gpu/drm/amd/amdgpu/vega10_ih.c             |   6 +
+ drivers/gpu/drm/amd/amdgpu/vega20_ih.c             |   6 +
+ drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h     |  14 +-
+ .../gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm |   2 +-
+ .../gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm  |   2 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_chardev.c           |   4 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_debug.c             |   4 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_events.c            |   6 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c   |   7 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c   |   7 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c    |   7 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_migrate.c           |   2 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_priv.h              |   9 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_svm.c               |  10 +-
+ drivers/gpu/drm/amd/display/TODO                   | 110 ---
+ drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c  |  65 +-
+ drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h  |   1 +
+ .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c |  72 +-
+ .../drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c  |  55 +-
+ .../drm/amd/display/amdgpu_dm/amdgpu_dm_replay.c   | 119 +--
+ .../drm/amd/display/amdgpu_dm/amdgpu_dm_replay.h   |   4 +-
+ drivers/gpu/drm/amd/display/dc/basics/conversion.c |  34 +
+ drivers/gpu/drm/amd/display/dc/basics/conversion.h |   4 +
+ .../gpu/drm/amd/display/dc/bios/command_table.c    |   2 +-
+ .../gpu/drm/amd/display/dc/bios/command_table2.c   |   2 +-
+ .../amd/display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c   |  40 +-
+ .../amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c   |  72 +-
+ .../drm/amd/display/dc/clk_mgr/dcn35/dcn35_smu.c   |  15 +
+ drivers/gpu/drm/amd/display/dc/core/dc.c           |  45 +-
+ drivers/gpu/drm/amd/display/dc/core/dc_resource.c  |  14 +
+ drivers/gpu/drm/amd/display/dc/dc.h                |   5 +-
+ drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c       |  14 +-
+ drivers/gpu/drm/amd/display/dc/dc_hw_types.h       |   1 +
+ drivers/gpu/drm/amd/display/dc/dc_stream.h         |   2 -
+ drivers/gpu/drm/amd/display/dc/dce/dce_audio.c     | 299 ++++++-
+ drivers/gpu/drm/amd/display/dc/dce/dce_audio.h     |   3 +-
+ drivers/gpu/drm/amd/display/dc/dce/dmub_replay.c   |   4 +-
+ .../gpu/drm/amd/display/dc/dcn10/dcn10_cm_common.c |  20 +
+ .../gpu/drm/amd/display/dc/dcn10/dcn10_cm_common.h |   4 +-
+ drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp.c   |   3 +-
+ drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp.h   |   3 +
+ .../gpu/drm/amd/display/dc/dcn10/dcn10_dpp_cm.c    |  70 +-
+ drivers/gpu/drm/amd/display/dc/dcn20/dcn20_dpp.c   |  31 +-
+ drivers/gpu/drm/amd/display/dc/dcn20/dcn20_dpp.h   |   3 +
+ .../gpu/drm/amd/display/dc/dcn20/dcn20_dpp_cm.c    |  55 ++
+ drivers/gpu/drm/amd/display/dc/dcn20/dcn20_mpc.c   |  24 +-
+ drivers/gpu/drm/amd/display/dc/dcn201/dcn201_dpp.c |   1 +
+ drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dpp.c   |  38 +-
+ drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dpp.h   |   2 +
+ .../gpu/drm/amd/display/dc/dcn30/dcn30_dpp_cm.c    |  54 ++
+ drivers/gpu/drm/amd/display/dc/dcn30/dcn30_mpc.c   | 106 ++-
+ drivers/gpu/drm/amd/display/dc/dcn30/dcn30_mpc.h   |   4 +
+ .../amd/display/dc/dcn32/dcn32_dio_link_encoder.c  |   4 +-
+ drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dpp.c   |   1 +
+ .../amd/display/dc/dcn32/dcn32_resource_helpers.c  |  14 -
+ .../amd/display/dc/dcn35/dcn35_dio_link_encoder.c  |   4 +-
+ .../amd/display/dc/dml/dcn30/display_mode_vba_30.c |  16 +-
+ .../gpu/drm/amd/display/dc/dml/dcn303/dcn303_fpu.c |  11 +
+ .../gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c   |  15 +-
+ .../gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c   |   8 +-
+ .../amd/display/dc/dml2/dml2_dc_resource_mgmt.c    |  41 +-
+ .../amd/display/dc/dml2/dml2_translation_helper.c  |  31 +-
+ drivers/gpu/drm/amd/display/dc/dsc/dc_dsc.c        |   5 +
+ .../drm/amd/display/dc/hwss/dce110/dce110_hwseq.c  |  58 +-
+ .../drm/amd/display/dc/hwss/dcn10/dcn10_hwseq.c    |  99 ++-
+ .../drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c    | 126 ++-
+ .../drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.h    |   2 +
+ .../drm/amd/display/dc/hwss/dcn30/dcn30_hwseq.c    | 167 +++-
+ .../drm/amd/display/dc/hwss/dcn30/dcn30_hwseq.h    |   6 +-
+ .../gpu/drm/amd/display/dc/hwss/dcn30/dcn30_init.c |   2 +-
+ .../drm/amd/display/dc/hwss/dcn31/dcn31_hwseq.c    |  18 +
+ .../drm/amd/display/dc/hwss/dcn31/dcn31_hwseq.h    |   4 +
+ .../gpu/drm/amd/display/dc/hwss/dcn31/dcn31_init.c |   2 +-
+ .../drm/amd/display/dc/hwss/dcn314/dcn314_init.c   |   2 +-
+ .../gpu/drm/amd/display/dc/hwss/dcn32/dcn32_init.c |   2 +-
+ .../gpu/drm/amd/display/dc/hwss/dcn35/dcn35_init.c |   2 +-
+ .../drm/amd/display/dc/hwss/dcn351/dcn351_init.c   |   2 +-
+ drivers/gpu/drm/amd/display/dc/hwss/hw_sequencer.h |   2 +
+ drivers/gpu/drm/amd/display/dc/inc/core_types.h    |   2 +
+ drivers/gpu/drm/amd/display/dc/inc/hw/audio.h      |   3 +-
+ .../drm/amd/display/dc/inc/hw/clk_mgr_internal.h   |   6 +
+ drivers/gpu/drm/amd/display/dc/inc/hw/dchubbub.h   |   6 +
+ drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h        |  39 +
+ drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h       |  13 +-
+ drivers/gpu/drm/amd/display/dc/inc/hw/mpc.h        | 253 ++++--
+ drivers/gpu/drm/amd/display/dc/inc/hw/opp.h        |  16 +
+ drivers/gpu/drm/amd/display/dc/inc/resource.h      |   3 +
+ .../drm/amd/display/dc/link/hwss/link_hwss_dio.h   |  10 +
+ .../gpu/drm/amd/display/dc/link/link_detection.c   |  18 +
+ drivers/gpu/drm/amd/display/dc/link/link_dpms.c    |  58 ++
+ .../display/dc/link/protocols/link_dp_dpia_bw.c    |   2 +-
+ .../amd/display/dc/resource/dcn30/dcn30_resource.c |  11 +
+ .../amd/display/dc/resource/dcn32/dcn32_resource.c |   2 +-
+ .../amd/display/dc/resource/dcn32/dcn32_resource.h |   3 -
+ .../display/dc/resource/dcn321/dcn321_resource.c   |   3 +-
+ .../amd/display/dc/resource/dcn35/dcn35_resource.c |   3 +-
+ drivers/gpu/drm/amd/display/dmub/dmub_srv.h        |  15 +-
+ drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h    |  19 +-
+ drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c    | 121 ++-
+ drivers/gpu/drm/amd/display/include/audio_types.h  |  15 +
+ drivers/gpu/drm/amd/include/amd_shared.h           |   1 +
+ drivers/gpu/drm/amd/include/arct_ip_offset.h       |   6 +-
+ .../amd/include/asic_reg/dcn/dcn_3_1_6_offset.h    |   4 +
+ .../amd/include/asic_reg/dcn/dcn_3_1_6_sh_mask.h   |  10 +
+ .../amd/include/asic_reg/dcn/dcn_3_5_0_offset.h    |  24 +
+ .../amd/include/asic_reg/dcn/dcn_3_5_0_sh_mask.h   |  65 ++
+ drivers/gpu/drm/amd/include/atom-bits.h            |   2 +-
+ drivers/gpu/drm/amd/include/beige_goby_ip_offset.h |   6 +-
+ drivers/gpu/drm/amd/include/cgs_common.h           |  23 +-
+ .../gpu/drm/amd/include/cyan_skillfish_ip_offset.h |   6 +-
+ .../drm/amd/include/dimgrey_cavefish_ip_offset.h   |   6 +-
+ drivers/gpu/drm/amd/include/dm_pp_interface.h      |   9 +-
+ drivers/gpu/drm/amd/include/kgd_pp_interface.h     |   6 +-
+ drivers/gpu/drm/amd/include/navi12_ip_offset.h     |   6 +-
+ drivers/gpu/drm/amd/include/navi14_ip_offset.h     |   6 +-
+ drivers/gpu/drm/amd/include/pptable.h              |   6 +-
+ drivers/gpu/drm/amd/include/renoir_ip_offset.h     |   6 +-
+ .../gpu/drm/amd/include/sienna_cichlid_ip_offset.h |   6 +-
+ drivers/gpu/drm/amd/include/v10_structs.h          |   3 +-
+ drivers/gpu/drm/amd/include/vangogh_ip_offset.h    |   6 +-
+ drivers/gpu/drm/amd/include/vega10_ip_offset.h     |   6 +-
+ drivers/gpu/drm/amd/include/vega20_ip_offset.h     |  78 +-
+ .../gpu/drm/amd/pm/powerplay/hwmgr/ppatomctrl.c    |  42 +-
+ .../gpu/drm/amd/pm/powerplay/hwmgr/ppatomfwctrl.c  |   4 +-
+ drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c          |  33 +-
+ drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h      |   1 -
+ drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c     |  16 +-
+ drivers/gpu/drm/amd/pm/swsmu/smu12/smu_v12_0.c     |   2 +-
+ drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c     |  18 +-
+ .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c   |   8 +-
+ .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c   | 158 +++-
+ .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c   |   8 +-
+ drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0.c     |   2 +-
+ drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c             |   9 +-
+ drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h             |  10 +
+ drivers/gpu/drm/radeon/atom-bits.h                 |   2 +-
+ drivers/gpu/drm/radeon/atom.c                      |  47 +-
+ drivers/gpu/drm/radeon/atom.h                      |   4 +-
+ drivers/gpu/drm/radeon/atombios_crtc.c             |  28 +-
+ drivers/gpu/drm/radeon/atombios_dp.c               |   4 +-
+ drivers/gpu/drm/radeon/atombios_encoders.c         |  38 +-
+ drivers/gpu/drm/radeon/atombios_i2c.c              |   2 +-
+ drivers/gpu/drm/radeon/btc_dpm.c                   |  90 +--
+ drivers/gpu/drm/radeon/ci_dpm.c                    |  31 +-
+ drivers/gpu/drm/radeon/ci_dpm.h                    |   6 +-
+ drivers/gpu/drm/radeon/clearstate_cayman.h         |   9 +-
+ drivers/gpu/drm/radeon/clearstate_ci.h             |   3 +-
+ drivers/gpu/drm/radeon/evergreen.c                 |  20 +-
+ drivers/gpu/drm/radeon/evergreen_cs.c              |   4 +-
+ drivers/gpu/drm/radeon/evergreen_reg.h             |  10 +-
+ drivers/gpu/drm/radeon/evergreen_smc.h             |   9 +-
+ drivers/gpu/drm/radeon/kv_dpm.c                    |   9 +-
+ drivers/gpu/drm/radeon/kv_smc.c                    |   2 +-
+ drivers/gpu/drm/radeon/ni.c                        |  31 +-
+ drivers/gpu/drm/radeon/ni_dpm.c                    |   3 -
+ drivers/gpu/drm/radeon/ni_dpm.h                    |  12 +-
+ drivers/gpu/drm/radeon/nislands_smc.h              |  51 +-
+ drivers/gpu/drm/radeon/r100.c                      |   2 +-
+ drivers/gpu/drm/radeon/r300_reg.h                  |   2 +-
+ drivers/gpu/drm/radeon/r600.c                      |   3 +-
+ drivers/gpu/drm/radeon/r600_dpm.c                  |   6 +-
+ drivers/gpu/drm/radeon/r600_dpm.h                  |   3 +-
+ drivers/gpu/drm/radeon/radeon.h                    |   6 +-
+ drivers/gpu/drm/radeon/radeon_asic.c               |   8 +-
+ drivers/gpu/drm/radeon/radeon_atombios.c           |  44 +-
+ drivers/gpu/drm/radeon/radeon_atpx_handler.c       |  12 +-
+ drivers/gpu/drm/radeon/radeon_audio.c              |  11 +-
+ drivers/gpu/drm/radeon/radeon_audio.h              |   6 +-
+ drivers/gpu/drm/radeon/radeon_mode.h               |   9 +-
+ drivers/gpu/drm/radeon/radeon_pm.c                 |   4 +-
+ drivers/gpu/drm/radeon/rs400.c                     |   4 +-
+ drivers/gpu/drm/radeon/rs600.c                     |   3 +-
+ drivers/gpu/drm/radeon/rv515.c                     |   3 +-
+ drivers/gpu/drm/radeon/rv6xx_dpm.h                 |   3 +-
+ drivers/gpu/drm/radeon/rv770_dpm.c                 |   4 +-
+ drivers/gpu/drm/radeon/rv770_smc.h                 |  27 +-
+ drivers/gpu/drm/radeon/si.c                        |  63 +-
+ drivers/gpu/drm/radeon/si_dpm.c                    | 132 ++--
+ drivers/gpu/drm/radeon/si_dpm.h                    |  21 +-
+ drivers/gpu/drm/radeon/smu7.h                      |   6 +-
+ drivers/gpu/drm/radeon/smu7_discrete.h             |  51 +-
+ drivers/gpu/drm/radeon/smu7_fusion.h               |  42 +-
+ drivers/gpu/drm/radeon/sumo_dpm.c                  |  18 +-
+ drivers/gpu/drm/radeon/trinity_dpm.c               |  22 +-
+ drivers/gpu/drm/radeon/trinity_dpm.h               |   3 +-
+ drivers/gpu/drm/radeon/uvd_v1_0.c                  |   2 +-
+ include/uapi/drm/amdgpu_drm.h                      |   2 +
+ include/uapi/linux/kfd_ioctl.h                     |   3 +-
+ 285 files changed, 6426 insertions(+), 2236 deletions(-)
+ create mode 100644 Documentation/gpu/amdgpu/display/dcn-blocks.rst
+ create mode 100644 Documentation/gpu/amdgpu/display/display-contributing.rst
+ create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
+ create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
+ delete mode 100644 drivers/gpu/drm/amd/display/TODO
+Merging drm-intel/for-linux-next (fe4c6ff50c68 drm/i915/xe2lpd: Move registers to PICA)
+$ git merge -m Merge branch 'for-linux-next' of git://anongit.freedesktop.org/drm-intel drm-intel/for-linux-next
+Auto-merging drivers/gpu/drm/i915/display/intel_backlight.c
+Auto-merging drivers/gpu/drm/i915/display/intel_dp.c
+Auto-merging drivers/gpu/drm/i915/display/intel_gmbus.c
+Auto-merging drivers/gpu/drm/i915/display/intel_psr.c
+Auto-merging drivers/gpu/drm/i915/display/intel_sdvo.c
+Merge made by the 'ort' strategy.
+ drivers/gpu/drm/i915/display/intel_atomic_plane.c  |   6 +-
+ drivers/gpu/drm/i915/display/intel_backlight.c     |   2 +-
+ drivers/gpu/drm/i915/display/intel_bios.c          |  36 +-
+ drivers/gpu/drm/i915/display/intel_bios.h          |   5 +-
+ drivers/gpu/drm/i915/display/intel_cdclk.c         | 361 ++++++++++-----------
+ drivers/gpu/drm/i915/display/intel_crt.c           |   5 +
+ drivers/gpu/drm/i915/display/intel_crtc.c          | 128 +-------
+ drivers/gpu/drm/i915/display/intel_cursor.c        |  63 +++-
+ drivers/gpu/drm/i915/display/intel_cx0_phy.c       | 231 ++++++-------
+ drivers/gpu/drm/i915/display/intel_cx0_phy_regs.h  |  63 +++-
+ drivers/gpu/drm/i915/display/intel_ddi.c           |  67 ++--
+ drivers/gpu/drm/i915/display/intel_display.c       |  29 +-
+ drivers/gpu/drm/i915/display/intel_display_core.h  |  16 +-
+ .../gpu/drm/i915/display/intel_display_debugfs.c   |  26 +-
+ .../gpu/drm/i915/display/intel_display_device.c    |   2 +-
+ .../gpu/drm/i915/display/intel_display_driver.c    | 149 ++++++++-
+ .../gpu/drm/i915/display/intel_display_driver.h    |   6 +
+ drivers/gpu/drm/i915/display/intel_display_irq.c   |  10 +-
+ drivers/gpu/drm/i915/display/intel_display_types.h |  25 +-
+ drivers/gpu/drm/i915/display/intel_dmc.c           |   2 +-
+ drivers/gpu/drm/i915/display/intel_dp.c            | 192 ++++++-----
+ drivers/gpu/drm/i915/display/intel_dp.h            |  10 +-
+ drivers/gpu/drm/i915/display/intel_dp_aux.c        |  29 +-
+ drivers/gpu/drm/i915/display/intel_dp_mst.c        |   4 +
+ drivers/gpu/drm/i915/display/intel_dpll_mgr.c      |  83 +++--
+ drivers/gpu/drm/i915/display/intel_dpll_mgr.h      |  18 +-
+ drivers/gpu/drm/i915/display/intel_dsb.c           |   4 +
+ drivers/gpu/drm/i915/display/intel_dvo.c           |   5 +
+ drivers/gpu/drm/i915/display/intel_gmbus.c         |   5 +-
+ drivers/gpu/drm/i915/display/intel_hdcp.c          |  78 +++--
+ drivers/gpu/drm/i915/display/intel_hdcp_regs.h     |  28 +-
+ drivers/gpu/drm/i915/display/intel_hdmi.c          |  16 +-
+ drivers/gpu/drm/i915/display/intel_hotplug.c       | 165 ++++++++--
+ drivers/gpu/drm/i915/display/intel_hotplug.h       |   4 +
+ drivers/gpu/drm/i915/display/intel_hotplug_irq.c   |   6 +-
+ drivers/gpu/drm/i915/display/intel_opregion.c      | 176 +++++++---
+ drivers/gpu/drm/i915/display/intel_opregion.h      |  47 ++-
+ drivers/gpu/drm/i915/display/intel_panel.c         |   4 +
+ drivers/gpu/drm/i915/display/intel_pps.c           |   2 +-
+ drivers/gpu/drm/i915/display/intel_psr.c           | 130 ++++++--
+ drivers/gpu/drm/i915/display/intel_psr.h           |   6 -
+ drivers/gpu/drm/i915/display/intel_psr_regs.h      |   6 +
+ drivers/gpu/drm/i915/display/intel_sdvo.c          |   6 +
+ drivers/gpu/drm/i915/display/intel_tc.c            |  40 +--
+ drivers/gpu/drm/i915/display/intel_tc.h            |   2 +-
+ drivers/gpu/drm/i915/display/intel_tv.c            |   7 +-
+ drivers/gpu/drm/i915/display/intel_vblank.c        | 130 ++++++++
+ drivers/gpu/drm/i915/display/intel_vblank.h        |  12 +
+ drivers/gpu/drm/i915/display/skl_watermark.c       |  16 +-
+ drivers/gpu/drm/i915/gem/i915_gem_ttm.c            |   5 +-
+ drivers/gpu/drm/i915/gvt/kvmgt.c                   |   2 +-
+ drivers/gpu/drm/i915/i915_driver.c                 |  25 +-
+ drivers/gpu/drm/i915/i915_reg.h                    |   8 +
+ drivers/gpu/drm/i915/soc/intel_pch.c               |  16 +-
+ drivers/gpu/drm/i915/soc/intel_pch.h               |   6 +-
+ include/drm/display/drm_dp.h                       |   1 +
+ include/drm/drm_print.h                            |   3 +
+ include/drm/i915_pciids.h                          |   3 +
+ 58 files changed, 1613 insertions(+), 919 deletions(-)
+Merging drm-tegra/for-next (2429b3c529da drm/tegra: Avoid potential 32-bit integer overflow)
+$ git merge -m Merge branch 'for-next' of https://gitlab.freedesktop.org/drm/tegra.git drm-tegra/for-next
+Already up to date.
+Merging drm-msm/msm-next (d4ca26ac4be0 drm/msm/dp: call dp_display_get_next_bridge() during probe)
+$ git merge -m Merge branch 'msm-next' of https://gitlab.freedesktop.org/drm/msm.git drm-msm/msm-next
+Already up to date.
+Merging drm-msm-lumag/msm-next-lumag (d4ca26ac4be0 drm/msm/dp: call dp_display_get_next_bridge() during probe)
+$ git merge -m Merge branch 'msm-next-lumag' of https://gitlab.freedesktop.org/lumag/msm.git drm-msm-lumag/msm-next-lumag
+Already up to date.
+Merging etnaviv/etnaviv/next (c9959996a8fc drm/etnaviv: add sensitive state for PE_RT_ADDR_4_PIPE(3, 0|1) address)
+$ git merge -m Merge branch 'etnaviv/next' of https://git.pengutronix.de/git/lst/linux etnaviv/etnaviv/next
+Auto-merging drivers/gpu/drm/etnaviv/etnaviv_drv.c
+Auto-merging drivers/gpu/drm/etnaviv/etnaviv_gpu.c
+Merge made by the 'ort' strategy.
+ drivers/gpu/drm/etnaviv/etnaviv_cmd_parser.c |  1 +
+ drivers/gpu/drm/etnaviv/etnaviv_drv.c        | 93 ++++++++++++++++++----------
+ drivers/gpu/drm/etnaviv/etnaviv_gem.c        | 12 ++--
+ drivers/gpu/drm/etnaviv/etnaviv_gpu.c        | 33 +++++++++-
+ drivers/gpu/drm/etnaviv/etnaviv_gpu.h        | 12 ++++
+ drivers/gpu/drm/etnaviv/etnaviv_hwdb.c       | 34 ++++++++++
+ drivers/gpu/drm/etnaviv/etnaviv_mmu.c        |  4 +-
+ drivers/gpu/drm/etnaviv/etnaviv_perfmon.c    |  4 +-
+ include/uapi/drm/etnaviv_drm.h               |  5 ++
+ 9 files changed, 154 insertions(+), 44 deletions(-)
+Merging fbdev/for-next (72fee6b0a3a4 fbdev: Restrict FB_SH_MOBILE_LCDC to SuperH)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/deller/linux-fbdev.git fbdev/for-next
+Merge made by the 'ort' strategy.
+ drivers/video/fbdev/Kconfig | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+Merging regmap/for-next (a1214cdfe92b Merge branch 'regmap-linus' into regmap-next)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regmap.git regmap/for-next
+Merge made by the 'ort' strategy.
+Merging sound/for-next (8b87a7863fa5 Merge branch 'topic/format-kunit' into for-next)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/sound.git sound/for-next
+Auto-merging MAINTAINERS
+Auto-merging sound/pci/hda/patch_realtek.c
+Merge made by the 'ort' strategy.
+ MAINTAINERS                            |   7 +
+ include/sound/emux_synth.h             |   2 +-
+ sound/core/Kconfig                     |  16 ++
+ sound/core/Makefile                    |   2 +
+ sound/core/pcm.c                       |   6 +-
+ sound/core/sound_kunit.c               | 310 +++++++++++++++++++++++++++++++++
+ sound/firewire/Kconfig                 |   2 +
+ sound/firewire/motu/motu-protocol-v3.c |   9 +
+ sound/firewire/motu/motu.c             |   2 +
+ sound/firewire/motu/motu.h             |   1 +
+ sound/pci/hda/Kconfig                  |   4 +
+ sound/pci/hda/Makefile                 |   2 +
+ sound/pci/hda/cs35l41_hda_property.c   |  90 ++++++++--
+ sound/pci/hda/hda_component.c          | 169 ++++++++++++++++++
+ sound/pci/hda/hda_component.h          |  59 +++++++
+ sound/pci/hda/patch_realtek.c          | 271 +++++++++-------------------
+ sound/synth/emux/emux.c                |   4 +-
+ 17 files changed, 744 insertions(+), 212 deletions(-)
+ create mode 100644 sound/core/sound_kunit.c
+ create mode 100644 sound/pci/hda/hda_component.c
+Merging ieee1394/for-next (dd754748f1be firewire: Convert snprintf/sprintf to sysfs_emit)
+$ git merge -m Merge branch 'for-next' of https://git.kernel.org/pub/scm/linux/kernel/git/ieee1394/linux1394.git ieee1394/for-next
+Merge made by the 'ort' strategy.
+ drivers/firewire/core-device.c | 16 ++++------------
+ 1 file changed, 4 insertions(+), 12 deletions(-)
+Merging sound-asoc/for-next (e3468b7aab5c Merge remote-tracking branch 'asoc/for-6.9' into asoc-next)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound.git sound-asoc/for-next
+Merge made by the 'ort' strategy.
+ .../bindings/sound/audio-graph-port.yaml           |    2 +-
+ .../devicetree/bindings/sound/fsl,easrc.yaml       |    4 +-
+ .../devicetree/bindings/sound/fsl,micfil.yaml      |   14 +-
+ .../devicetree/bindings/sound/fsl,sai.yaml         |    6 +
+ .../bindings/sound/infineon,peb2466.yaml           |    2 +-
+ .../devicetree/bindings/sound/qcom,wcd938x.yaml    |   81 +-
+ .../bindings/sound/qcom,wcd939x-sdw.yaml           |   69 +
+ .../devicetree/bindings/sound/qcom,wcd939x.yaml    |   96 +
+ .../bindings/sound/qcom,wcd93xx-common.yaml        |   95 +
+ .../devicetree/bindings/sound/samsung,tm2.yaml     |    7 +-
+ drivers/soundwire/Makefile                         |    2 +-
+ drivers/soundwire/amd_init.c                       |  235 ++
+ drivers/soundwire/amd_init.h                       |   13 +
+ drivers/soundwire/amd_manager.c                    |   47 +-
+ drivers/soundwire/amd_manager.h                    |   16 +-
+ include/linux/soundwire/sdw_amd.h                  |   83 +-
+ include/sound/sof/dai-amd.h                        |    7 +
+ include/sound/sof/dai.h                            |    2 +
+ include/uapi/sound/sof/tokens.h                    |    4 +
+ sound/soc/amd/acp/Kconfig                          |    7 +
+ sound/soc/amd/acp/Makefile                         |    2 +
+ sound/soc/amd/acp/acp-mach-common.c                |    6 +-
+ sound/soc/amd/acp/acp-sof-mach.c                   |   26 +-
+ sound/soc/amd/acp/amd-sdw-acpi.c                   |   62 +
+ sound/soc/atmel/mikroe-proto.c                     |    8 +-
+ sound/soc/codecs/Kconfig                           |   34 +
+ sound/soc/codecs/Makefile                          |    9 +
+ sound/soc/codecs/cs42l43-jack.c                    |   27 +-
+ sound/soc/codecs/cs42l43-sdw.c                     |    1 +
+ sound/soc/codecs/cs42l43.c                         |   64 +-
+ sound/soc/codecs/cs42l43.h                         |   25 +-
+ sound/soc/codecs/es8326.c                          |   92 +-
+ sound/soc/codecs/es8326.h                          |    5 +-
+ sound/soc/codecs/framer-codec.c                    |  413 +++
+ sound/soc/codecs/nau8540.c                         |  112 +-
+ sound/soc/codecs/nau8540.h                         |   13 +-
+ sound/soc/codecs/wcd-clsh-v2.h                     |    1 +
+ sound/soc/codecs/wcd-mbhc-v2.c                     |   95 +-
+ sound/soc/codecs/wcd-mbhc-v2.h                     |    3 +
+ sound/soc/codecs/wcd939x-sdw.c                     | 1551 ++++++++
+ sound/soc/codecs/wcd939x.c                         | 3686 ++++++++++++++++++++
+ sound/soc/codecs/wcd939x.h                         |  989 ++++++
+ sound/soc/fsl/eukrea-tlv320.c                      |    8 +-
+ sound/soc/fsl/fsl_sai.c                            |   13 +
+ sound/soc/fsl/p1022_rdk.c                          |   33 +-
+ sound/soc/intel/common/soc-acpi-intel-mtl-match.c  |   57 +
+ sound/soc/qcom/common.c                            |    2 +-
+ sound/soc/sh/rz-ssi.c                              |    2 +-
+ sound/soc/sof/amd/Kconfig                          |   18 +
+ sound/soc/sof/amd/acp-common.c                     |   65 +-
+ sound/soc/sof/amd/acp-dsp-offset.h                 |   10 +
+ sound/soc/sof/amd/acp-loader.c                     |   34 +-
+ sound/soc/sof/amd/acp.c                            |  232 +-
+ sound/soc/sof/amd/acp.h                            |   26 +-
+ sound/soc/sof/amd/pci-acp63.c                      |    7 +
+ sound/soc/sof/fw-file-profile.c                    |   18 +-
+ sound/soc/sof/ipc3-pcm.c                           |   25 +
+ sound/soc/sof/ipc3-topology.c                      |   40 +
+ sound/soc/sof/sof-audio.h                          |    1 +
+ sound/soc/sof/topology.c                           |    5 +
+ sound/soc/ti/j721e-evm.c                           |    4 +-
+ sound/soc/ti/omap-hdmi.c                           |   10 +-
+ 62 files changed, 8283 insertions(+), 343 deletions(-)
+ create mode 100644 Documentation/devicetree/bindings/sound/qcom,wcd939x-sdw.yaml
+ create mode 100644 Documentation/devicetree/bindings/sound/qcom,wcd939x.yaml
+ create mode 100644 Documentation/devicetree/bindings/sound/qcom,wcd93xx-common.yaml
+ create mode 100644 drivers/soundwire/amd_init.c
+ create mode 100644 drivers/soundwire/amd_init.h
+ create mode 100644 sound/soc/amd/acp/amd-sdw-acpi.c
+ create mode 100644 sound/soc/codecs/framer-codec.c
+ create mode 100644 sound/soc/codecs/wcd939x-sdw.c
+ create mode 100644 sound/soc/codecs/wcd939x.c
+ create mode 100644 sound/soc/codecs/wcd939x.h
+Merging modules/modules-next (3559ad395bf0 module: Change module_enable_{nx/x/ro}() to more explicit names)
+$ git merge -m Merge branch 'modules-next' of git://git.kernel.org/pub/scm/linux/kernel/git/mcgrof/linux.git modules/modules-next
+Auto-merging kernel/module/main.c
+Merge made by the 'ort' strategy.
+ kernel/module/internal.h   |  6 +++---
+ kernel/module/main.c       |  8 ++++----
+ kernel/module/strict_rwx.c | 16 +++++++++-------
+ 3 files changed, 16 insertions(+), 14 deletions(-)
+Merging input/next (7d0f351da460 Input: matrix_keypad - switch to using managed resources)
+$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/dtor/input.git input/next
+Merge made by the 'ort' strategy.
+ .../bindings/input/touchscreen/goodix,gt9916.yaml  |  95 +++
+ .../bindings/input/touchscreen/goodix.yaml         |   5 +-
+ .../bindings/input/touchscreen/melfas,mms114.yaml  |   6 +-
+ .../bindings/input/touchscreen/silead,gsl1680.yaml |   2 +-
+ drivers/input/input-leds.c                         |   8 +-
+ drivers/input/input.c                              |  14 +-
+ drivers/input/keyboard/bcm-keypad.c                |   2 +-
+ drivers/input/keyboard/matrix_keypad.c             | 170 ++---
+ drivers/input/misc/88pm80x_onkey.c                 |  14 +-
+ drivers/input/mouse/Kconfig                        |  12 -
+ drivers/input/mouse/Makefile                       |   1 -
+ drivers/input/mouse/navpoint.c                     | 350 ----------
+ drivers/input/rmi4/rmi_driver.c                    |   6 +-
+ drivers/input/touchscreen/Kconfig                  |  31 +
+ drivers/input/touchscreen/Makefile                 |   3 +
+ drivers/input/touchscreen/goodix_berlin.h          |  24 +
+ drivers/input/touchscreen/goodix_berlin_core.c     | 755 +++++++++++++++++++++
+ drivers/input/touchscreen/goodix_berlin_i2c.c      |  75 ++
+ drivers/input/touchscreen/goodix_berlin_spi.c      | 178 +++++
+ include/linux/input/navpoint.h                     |   8 -
+ 20 files changed, 1244 insertions(+), 515 deletions(-)
+ create mode 100644 Documentation/devicetree/bindings/input/touchscreen/goodix,gt9916.yaml
+ delete mode 100644 drivers/input/mouse/navpoint.c
+ create mode 100644 drivers/input/touchscreen/goodix_berlin.h
+ create mode 100644 drivers/input/touchscreen/goodix_berlin_core.c
+ create mode 100644 drivers/input/touchscreen/goodix_berlin_i2c.c
+ create mode 100644 drivers/input/touchscreen/goodix_berlin_spi.c
+ delete mode 100644 include/linux/input/navpoint.h
+Merging block/for-next (b48b5a7c9bc1 Merge branch 'block-deadline' into for-next)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.dk/linux-block.git block/for-next
+Auto-merging include/linux/sched.h
+Merge made by the 'ort' strategy.
+ block/bfq-cgroup.c        |  14 ++---
+ block/bfq-iosched.c       | 148 +++++++++++++++++++++++++++++++++++-----------
+ block/bfq-iosched.h       |  16 ++++-
+ block/blk-cgroup.c        |   2 +-
+ block/blk-cgroup.h        |   1 +
+ block/blk-core.c          |   3 +
+ block/blk-flush.c         |   2 +-
+ block/blk-iocost.c        |   8 +--
+ block/blk-iolatency.c     |   6 +-
+ block/blk-mq.c            |  16 ++---
+ block/blk-throttle.c      |   6 +-
+ block/blk-wbt.c           |   6 +-
+ block/blk.h               |  67 +++++++++++++++++++++
+ block/mq-deadline.c       | 114 ++++++++++++++++++++++++++++-------
+ include/linux/blk_types.h |  42 -------------
+ include/linux/blkdev.h    |  17 ++++++
+ include/linux/sched.h     |   2 +-
+ kernel/sched/core.c       |   6 +-
+ 18 files changed, 342 insertions(+), 134 deletions(-)
+Merging device-mapper/for-next (4eacc39d5529 dm-crypt, dm-verity: disable tasklets)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm.git device-mapper/for-next
+Merge made by the 'ort' strategy.
+ drivers/md/dm-core.h          |  2 ++
+ drivers/md/dm-crypt.c         | 38 ++------------------------------------
+ drivers/md/dm-ioctl.c         |  3 ++-
+ drivers/md/dm-stats.c         |  9 +++++++++
+ drivers/md/dm-table.c         |  9 +++++++--
+ drivers/md/dm-verity-target.c | 27 ++-------------------------
+ drivers/md/dm-writecache.c    |  8 ++++----
+ 7 files changed, 28 insertions(+), 68 deletions(-)
+Merging libata/for-next (c8474c7273ac Merge remote-tracking branch 'libata/for-6.8-fixes' into for-next)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/libata/linux libata/for-next
+Merge made by the 'ort' strategy.
+ drivers/ata/ahci.c | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+Merging pcmcia/pcmcia-next (4f733de8b78a pcmcia: tcic: remove unneeded "&" in call to setup_timer())
+$ git merge -m Merge branch 'pcmcia-next' of git://git.kernel.org/pub/scm/linux/kernel/git/brodo/linux.git pcmcia/pcmcia-next
+Already up to date.
+Merging mmc/next (4e99ffb173fa mmc: core Drop BLK_BOUNCE_HIGH)
+$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/ulfh/mmc.git mmc/next
+Merge made by the 'ort' strategy.
+ Documentation/devicetree/bindings/mmc/fsl-imx-esdhc.yaml |  3 ++-
+ drivers/mmc/core/block.c                                 | 12 ++++++------
+ drivers/mmc/core/host.c                                  |  5 +++--
+ drivers/mmc/core/queue.c                                 |  2 --
+ 4 files changed, 11 insertions(+), 11 deletions(-)
+Merging mfd/for-mfd-next (1e0ea9e75ff3 mfd: wm831x: Remove redundant forever while loop)
+$ git merge -m Merge branch 'for-mfd-next' of git://git.kernel.org/pub/scm/linux/kernel/git/lee/mfd.git mfd/for-mfd-next
+Merge made by the 'ort' strategy.
+ Documentation/devicetree/bindings/mfd/iqs62x.yaml  |  2 +-
+ .../devicetree/bindings/mfd/qcom,tcsr.yaml         |  2 +
+ drivers/mfd/cros_ec_dev.c                          |  9 +++++
+ drivers/mfd/intel-lpss-pci.c                       | 28 ++++++++++----
+ drivers/mfd/intel-lpss.c                           |  9 ++++-
+ drivers/mfd/intel-lpss.h                           | 14 ++++++-
+ drivers/mfd/lpc_ich.c                              |  3 +-
+ drivers/mfd/omap-usb-host.c                        |  2 +-
+ drivers/mfd/rave-sp.c                              |  2 +-
+ drivers/mfd/wm831x-auxadc.c                        | 43 +++++++++-------------
+ include/linux/mfd/sun4i-gpadc.h                    |  4 +-
+ 11 files changed, 77 insertions(+), 41 deletions(-)
+Merging backlight/for-backlight-next (3b75d271e161 backlight: hx8357: Fix potential NULL pointer dereference)
+$ git merge -m Merge branch 'for-backlight-next' of git://git.kernel.org/pub/scm/linux/kernel/git/lee/backlight.git backlight/for-backlight-next
+Merge made by the 'ort' strategy.
+ drivers/video/backlight/hx8357.c | 10 ++++++----
+ 1 file changed, 6 insertions(+), 4 deletions(-)
+Merging battery/for-next (4c5d387d79a6 power: supply: twl4030_madc: Use devm_power_supply_register() helper)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/sre/linux-power-supply.git battery/for-next
+Merge made by the 'ort' strategy.
+ drivers/power/supply/bq27xxx_battery.c      | 41 +++++++++++-----
+ drivers/power/supply/bq27xxx_battery_i2c.c  | 46 ++++++++----------
+ drivers/power/supply/da9030_battery.c       |  6 +--
+ drivers/power/supply/da9052-battery.c       |  4 +-
+ drivers/power/supply/da9150-charger.c       | 72 ++++++++---------------------
+ drivers/power/supply/ds2760_battery.c       |  4 +-
+ drivers/power/supply/goldfish_battery.c     | 24 +++-------
+ drivers/power/supply/lp8727_charger.c       | 35 +++-----------
+ drivers/power/supply/lp8788-charger.c       | 21 +++------
+ drivers/power/supply/pcf50633-charger.c     | 23 ++++-----
+ drivers/power/supply/rt5033_battery.c       | 14 ++----
+ drivers/power/supply/rx51_battery.c         | 57 +++++------------------
+ drivers/power/supply/tps65090-charger.c     | 18 +++-----
+ drivers/power/supply/twl4030_madc_battery.c | 59 ++++++-----------------
+ drivers/power/supply/wm831x_backup.c        | 13 +-----
+ drivers/power/supply/wm831x_power.c         | 24 ++++------
+ include/linux/power/bq27xxx_battery.h       |  1 -
+ 17 files changed, 147 insertions(+), 315 deletions(-)
+Merging regulator/for-next (a2fc922ece40 Merge remote-tracking branch 'regulator/for-6.9' into regulator-next)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regulator.git regulator/for-next
+Merge made by the 'ort' strategy.
+ .../devicetree/bindings/regulator/ti,tps65132.yaml | 84 ++++++++++++++++++++++
+ .../bindings/regulator/tps65132-regulator.txt      | 46 ------------
+ drivers/regulator/fixed-helper.c                   |  4 +-
+ drivers/regulator/qcom_smd-regulator.c             | 19 ++---
+ 4 files changed, 96 insertions(+), 57 deletions(-)
+ create mode 100644 Documentation/devicetree/bindings/regulator/ti,tps65132.yaml
+ delete mode 100644 Documentation/devicetree/bindings/regulator/tps65132-regulator.txt
+Merging security/next (5a287d3d2b9d lsm: fix default return value of the socket_getpeersec_*() hooks)
+$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/lsm.git security/next
+Auto-merging include/linux/lsm_hook_defs.h
+Auto-merging security/security.c
+Merge made by the 'ort' strategy.
+ include/linux/lsm_hook_defs.h |  4 ++--
+ security/security.c           | 45 ++++++++++++++++++++++++++++++++++++++-----
+ 2 files changed, 42 insertions(+), 7 deletions(-)
+Merging apparmor/apparmor-next (8ead196be219 apparmor: Fix memory leak in unpack_profile())
+$ git merge -m Merge branch 'apparmor-next' of git://git.kernel.org/pub/scm/linux/kernel/git/jj/linux-apparmor apparmor/apparmor-next
+Already up to date.
+Merging integrity/next-integrity (1ed4b5631002 Revert "KEYS: encrypted: Add check for strsep")
+$ git merge -m Merge branch 'next-integrity' of git://git.kernel.org/pub/scm/linux/kernel/git/zohar/linux-integrity integrity/next-integrity
+Already up to date.
+Merging selinux/next (90593caf7db7 selinux: reduce the object class calculations at inode init time)
+$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/selinux.git selinux/next
+Auto-merging security/selinux/hooks.c
+Merge made by the 'ort' strategy.
+ security/selinux/hooks.c | 9 ++++-----
+ 1 file changed, 4 insertions(+), 5 deletions(-)
+Merging smack/next (f0816d4332c3 ramfs: Initialize security of in-memory inodes)
+$ git merge -m Merge branch 'next' of git://github.com/cschaufler/smack-next smack/next
+Merge made by the 'ort' strategy.
+ fs/ramfs/inode.c           |  32 +++++++++++++-
+ security/smack/smack_lsm.c | 105 ++++++++++++++++++++++++++-------------------
+ 2 files changed, 91 insertions(+), 46 deletions(-)
+Merging tomoyo/master (0bb80ecc33a8 Linux 6.6-rc1)
+$ git merge -m Merge branch 'master' of https://scm.osdn.net/gitroot/tomoyo/tomoyo-test1.git tomoyo/master
+Already up to date.
+Merging tpmdd/next (610347effc2e Merge tag 'Wstringop-overflow-for-6.8-rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/gustavoars/linux)
+$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/jarkko/linux-tpmdd.git tpmdd/next
+Already up to date.
+Merging watchdog/master (41bccc98fb79 Linux 6.8-rc2)
+$ git merge -m Merge branch 'master' of git://www.linux-watchdog.org/linux-watchdog-next.git watchdog/master
+Already up to date.
+Merging iommu/next (75f74f85a42e Merge branches 'apple/dart', 'arm/rockchip', 'arm/smmu', 'virtio', 'x86/vt-d', 'x86/amd' and 'core' into next)
+$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu.git iommu/next
+Already up to date.
+Merging audit/next (aa13b709084a audit: use KMEM_CACHE() instead of kmem_cache_create())
+$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/audit.git audit/next
+Merge made by the 'ort' strategy.
+ kernel/audit.c       | 4 +---
+ kernel/auditfilter.c | 2 +-
+ 2 files changed, 2 insertions(+), 4 deletions(-)
+Merging devicetree/for-next (85f838adad54 dt-bindings: fpga: Convert fpga-region binding to yaml)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/robh/linux.git devicetree/for-next
+Auto-merging Documentation/devicetree/bindings/Makefile
+Auto-merging MAINTAINERS
+Auto-merging drivers/of/property.c
+Merge made by the 'ort' strategy.
+ Documentation/devicetree/bindings/Makefile         |   3 -
+ .../devicetree/bindings/fpga/fpga-region.txt       | 479 ---------------------
+ .../devicetree/bindings/fpga/fpga-region.yaml      | 358 +++++++++++++++
+ .../devicetree/bindings/gpio/mrvl-gpio.yaml        |   2 +-
+ Documentation/devicetree/bindings/i2c/i2c-pxa.yaml |   2 +-
+ .../mediatek,mt6577-sysirq.yaml                    |  85 ++++
+ .../interrupt-controller/mediatek,sysirq.txt       |  44 --
+ .../devicetree/bindings/rtc/sa1100-rtc.yaml        |   2 +-
+ .../devicetree/bindings/submitting-patches.rst     |  23 +-
+ .../devicetree/bindings/timer/mrvl,mmp-timer.yaml  |   2 +-
+ .../devicetree/bindings/trivial-devices.yaml       |   2 +
+ MAINTAINERS                                        |   5 +-
+ drivers/of/property.c                              |   2 +-
+ 13 files changed, 461 insertions(+), 548 deletions(-)
+ delete mode 100644 Documentation/devicetree/bindings/fpga/fpga-region.txt
+ create mode 100644 Documentation/devicetree/bindings/fpga/fpga-region.yaml
+ create mode 100644 Documentation/devicetree/bindings/interrupt-controller/mediatek,mt6577-sysirq.yaml
+ delete mode 100644 Documentation/devicetree/bindings/interrupt-controller/mediatek,sysirq.txt
+Merging dt-krzk/for-next (8c82b4eef297 ARM: dts: sti: minor whitespace cleanup around '=')
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-dt.git dt-krzk/for-next
+Merge made by the 'ort' strategy.
+ arch/arm/boot/dts/marvell/dove-cubox.dts      | 4 ++--
+ arch/arm/boot/dts/marvell/mmp2-brownstone.dts | 2 +-
+ arch/arm/boot/dts/st/stih407-pinctrl.dtsi     | 8 ++++----
+ arch/arm/boot/dts/ti/davinci/da850.dtsi       | 4 ++--
+ 4 files changed, 9 insertions(+), 9 deletions(-)
+Merging mailbox/for-next (cd795fb0c352 mailbox: mtk-cmdq: Add CMDQ driver support for mt8188)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/jassibrar/mailbox.git mailbox/for-next
+Already up to date.
+Merging spi/for-next (60fbb72e3018 Merge remote-tracking branch 'spi/for-6.9' into spi-next)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/spi.git spi/for-next
+Merge made by the 'ort' strategy.
+ .../devicetree/bindings/spi/samsung,spi.yaml       |  1 +
+ .../devicetree/bindings/spi/spi-fsl-lpspi.yaml     |  1 +
+ .../devicetree/bindings/spi/spi-nxp-fspi.yaml      | 18 ++++--
+ drivers/spi/Kconfig                                |  2 +-
+ drivers/spi/spi-mt65xx.c                           |  5 ++
+ drivers/spi/spi-nxp-fspi.c                         |  2 +-
+ drivers/spi/spi-s3c64xx.c                          | 27 ++++++---
+ drivers/spi/spi.c                                  | 69 +++-------------------
+ include/linux/spi/spi.h                            |  2 +-
+ 9 files changed, 51 insertions(+), 76 deletions(-)
+Merging tip/master (078b7b997b47 Merge x86/boot into tip/master)
+$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git tip/master
+Auto-merging arch/x86/Kconfig
+Auto-merging arch/x86/Makefile
+Auto-merging arch/x86/include/asm/pgtable.h
+Auto-merging arch/x86/kernel/Makefile
+Auto-merging arch/x86/kernel/alternative.c
+Auto-merging arch/x86/kernel/cpu/mshyperv.c
+Auto-merging arch/x86/kernel/kvm.c
+Auto-merging arch/x86/kernel/smp.c
+Auto-merging arch/x86/mm/dump_pagetables.c
+Auto-merging arch/x86/mm/tlb.c
+Auto-merging arch/x86/net/bpf_jit_comp.c
+Auto-merging scripts/mod/modpost.c
+Auto-merging tools/arch/x86/include/asm/cpufeatures.h
+Auto-merging tools/arch/x86/include/asm/msr-index.h
+Merge made by the 'ort' strategy.
+ Documentation/admin-guide/hw-vuln/spectre.rst   |   8 +-
+ Documentation/admin-guide/kernel-parameters.txt |  11 +-
+ Documentation/arch/x86/pti.rst                  |   6 +-
+ Documentation/arch/x86/x86_64/fred.rst          |  96 ++++++++
+ Documentation/arch/x86/x86_64/index.rst         |   1 +
+ Documentation/process/maintainer-tip.rst        |   4 +-
+ arch/x86/Kconfig                                |  53 +++--
+ arch/x86/Makefile                               |   8 +-
+ arch/x86/boot/compressed/ident_map_64.c         |   4 +-
+ arch/x86/boot/compressed/sev.c                  |   4 +
+ arch/x86/configs/i386_defconfig                 |   2 +-
+ arch/x86/entry/Makefile                         |   5 +-
+ arch/x86/entry/calling.h                        |  55 +++--
+ arch/x86/entry/entry_32.S                       |   6 +-
+ arch/x86/entry/entry_64.S                       |  29 ++-
+ arch/x86/entry/entry_64_fred.S                  | 131 +++++++++++
+ arch/x86/entry/entry_fred.c                     | 294 ++++++++++++++++++++++++
+ arch/x86/entry/vdso/Makefile                    |   4 +-
+ arch/x86/entry/vsyscall/vsyscall_64.c           |   2 +-
+ arch/x86/include/asm/asm-prototypes.h           |   1 +
+ arch/x86/include/asm/cpufeatures.h              |   2 +
+ arch/x86/include/asm/current.h                  |   9 +-
+ arch/x86/include/asm/desc.h                     |   2 -
+ arch/x86/include/asm/disabled-features.h        |  18 +-
+ arch/x86/include/asm/extable_fixup_types.h      |   4 +-
+ arch/x86/include/asm/fpu/sched.h                |  10 +-
+ arch/x86/include/asm/fred.h                     |  97 ++++++++
+ arch/x86/include/asm/ia32.h                     |   4 +-
+ arch/x86/include/asm/idtentry.h                 |  88 ++++++-
+ arch/x86/include/asm/linkage.h                  |  16 +-
+ arch/x86/include/asm/msr-index.h                |  13 +-
+ arch/x86/include/asm/msr.h                      |  18 ++
+ arch/x86/include/asm/nospec-branch.h            |  53 ++---
+ arch/x86/include/asm/page.h                     |   6 +-
+ arch/x86/include/asm/percpu.h                   | 191 +++++++++++----
+ arch/x86/include/asm/pgalloc.h                  |   2 +-
+ arch/x86/include/asm/pgtable-3level.h           |   2 +-
+ arch/x86/include/asm/pgtable.h                  |  18 +-
+ arch/x86/include/asm/pgtable_64.h               |   3 +-
+ arch/x86/include/asm/preempt.h                  |   2 +-
+ arch/x86/include/asm/processor-flags.h          |   2 +-
+ arch/x86/include/asm/processor.h                |   3 +
+ arch/x86/include/asm/pti.h                      |   2 +-
+ arch/x86/include/asm/ptrace.h                   | 104 +++++++--
+ arch/x86/include/asm/static_call.h              |   2 +-
+ arch/x86/include/asm/switch_to.h                |   8 +-
+ arch/x86/include/asm/text-patching.h            |   2 +
+ arch/x86/include/asm/thread_info.h              |  12 +-
+ arch/x86/include/asm/trapnr.h                   |  12 +
+ arch/x86/include/asm/uaccess_64.h               |  11 +-
+ arch/x86/include/asm/vmx.h                      |  17 +-
+ arch/x86/include/uapi/asm/processor-flags.h     |   7 +
+ arch/x86/kernel/Makefile                        |   1 +
+ arch/x86/kernel/acpi/wakeup_64.S                |  24 +-
+ arch/x86/kernel/alternative.c                   |  23 +-
+ arch/x86/kernel/asm-offsets.c                   |   2 +-
+ arch/x86/kernel/callthunks.c                    |  32 ++-
+ arch/x86/kernel/cpu/acrn.c                      |   4 +-
+ arch/x86/kernel/cpu/amd.c                       |   2 +-
+ arch/x86/kernel/cpu/bugs.c                      |  43 ++--
+ arch/x86/kernel/cpu/common.c                    |  39 +++-
+ arch/x86/kernel/cpu/cpuid-deps.c                |   2 +
+ arch/x86/kernel/cpu/mce/core.c                  |  26 +++
+ arch/x86/kernel/cpu/mshyperv.c                  |  15 +-
+ arch/x86/kernel/cpu/resctrl/core.c              |  18 +-
+ arch/x86/kernel/cpu/resctrl/internal.h          |   8 +-
+ arch/x86/kernel/cpu/resctrl/monitor.c           |  48 ++--
+ arch/x86/kernel/cpu/resctrl/rdtgroup.c          |  29 +--
+ arch/x86/kernel/dumpstack.c                     |   2 +-
+ arch/x86/kernel/espfix_64.c                     |   8 +
+ arch/x86/kernel/fred.c                          |  59 +++++
+ arch/x86/kernel/ftrace.c                        |   3 +-
+ arch/x86/kernel/head_32.S                       |   4 +-
+ arch/x86/kernel/head_64.S                       |  39 +---
+ arch/x86/kernel/idt.c                           |   4 +-
+ arch/x86/kernel/irqinit.c                       |   7 +-
+ arch/x86/kernel/kprobes/opt.c                   |   2 +-
+ arch/x86/kernel/kvm.c                           |   2 +-
+ arch/x86/kernel/ldt.c                           |   8 +-
+ arch/x86/kernel/nmi.c                           |  48 +++-
+ arch/x86/kernel/process_32.c                    |   7 +-
+ arch/x86/kernel/process_64.c                    |  74 ++++--
+ arch/x86/kernel/sev-shared.c                    | 102 +++++++-
+ arch/x86/kernel/sev.c                           |   5 +-
+ arch/x86/kernel/smp.c                           |  10 +-
+ arch/x86/kernel/static_call.c                   |   2 +-
+ arch/x86/kernel/traps.c                         |  78 ++++++-
+ arch/x86/kernel/vmlinux.lds.S                   |  11 +-
+ arch/x86/kvm/mmu/mmu.c                          |   2 +-
+ arch/x86/kvm/mmu/mmu_internal.h                 |   2 +-
+ arch/x86/kvm/svm/svm.c                          |   2 +-
+ arch/x86/kvm/svm/vmenter.S                      |   4 +-
+ arch/x86/kvm/vmx/vmx.c                          |  14 +-
+ arch/x86/lib/Makefile                           |   2 +-
+ arch/x86/lib/cmpxchg16b_emu.S                   |  12 +-
+ arch/x86/lib/cmpxchg8b_emu.S                    |  30 ++-
+ arch/x86/lib/getuser.S                          |  24 +-
+ arch/x86/lib/putuser.S                          |  20 +-
+ arch/x86/lib/retpoline.S                        |  26 +--
+ arch/x86/lib/x86-opcode-map.txt                 |   4 +-
+ arch/x86/mm/Makefile                            |   2 +-
+ arch/x86/mm/debug_pagetables.c                  |   4 +-
+ arch/x86/mm/dump_pagetables.c                   |   4 +-
+ arch/x86/mm/extable.c                           |  78 +++++++
+ arch/x86/mm/fault.c                             |  32 +--
+ arch/x86/mm/mem_encrypt.c                       |  56 ++---
+ arch/x86/mm/mem_encrypt_identity.c              |  10 +-
+ arch/x86/mm/pgtable.c                           |   4 +-
+ arch/x86/mm/tlb.c                               |  10 +-
+ arch/x86/net/bpf_jit_comp.c                     |   4 +-
+ arch/x86/net/bpf_jit_comp32.c                   |   2 +-
+ arch/x86/purgatory/Makefile                     |   2 +-
+ arch/x86/xen/xen-asm.S                          |  10 +-
+ drivers/irqchip/irq-gic-v3.c                    |  51 ++--
+ drivers/irqchip/irq-gic.c                       |  27 +--
+ drivers/xen/events/events_base.c                |   2 +-
+ include/linux/bitmap.h                          |   3 +
+ include/linux/compiler-gcc.h                    |   2 +-
+ include/linux/compiler.h                        |   2 +-
+ include/linux/indirect_call_wrapper.h           |   2 +-
+ include/linux/irq.h                             |   2 +-
+ include/linux/irqhandler.h                      |   2 +-
+ include/linux/module.h                          |   2 +-
+ include/linux/objtool.h                         |   2 +-
+ include/linux/pti.h                             |   2 +-
+ include/net/netfilter/nf_tables_core.h          |   2 +-
+ include/net/tc_wrapper.h                        |   2 +-
+ kernel/cpu.c                                    |   5 +-
+ kernel/irq/irq_sim.c                            |  28 +--
+ kernel/irq/irqdesc.c                            | 112 +++++----
+ kernel/trace/ring_buffer.c                      |   2 +-
+ net/netfilter/Makefile                          |   2 +-
+ net/netfilter/nf_tables_core.c                  |   6 +-
+ net/netfilter/nft_ct.c                          |   4 +-
+ net/netfilter/nft_lookup.c                      |   2 +-
+ net/sched/sch_api.c                             |   2 +-
+ scripts/Makefile.lib                            |   8 +-
+ scripts/Makefile.vmlinux_o                      |   2 +-
+ scripts/generate_rust_target.rs                 |   2 +-
+ scripts/mod/modpost.c                           |   2 +-
+ tools/arch/x86/include/asm/cpufeatures.h        |   2 +
+ tools/arch/x86/include/asm/disabled-features.h  |  18 +-
+ tools/arch/x86/include/asm/msr-index.h          |  13 +-
+ tools/arch/x86/lib/x86-opcode-map.txt           |   4 +-
+ tools/objtool/arch/x86/decode.c                 |  19 +-
+ tools/objtool/arch/x86/special.c                |   2 +-
+ tools/objtool/check.c                           |   4 +-
+ 147 files changed, 2201 insertions(+), 756 deletions(-)
+ create mode 100644 Documentation/arch/x86/x86_64/fred.rst
+ create mode 100644 arch/x86/entry/entry_64_fred.S
+ create mode 100644 arch/x86/entry/entry_fred.c
+ create mode 100644 arch/x86/include/asm/fred.h
+ create mode 100644 arch/x86/kernel/fred.c
+Merging clockevents/timers/drivers/next (0076a37a426b dt-bindings: timer: renesas,tmu: Document input capture interrupt)
+$ git merge -m Merge branch 'timers/drivers/next' of git://git.linaro.org/people/daniel.lezcano/linux.git clockevents/timers/drivers/next
+Merge made by the 'ort' strategy.
+ .../devicetree/bindings/timer/renesas,tmu.yaml         | 18 ++++++++++++++++--
+ .../bindings/timer/samsung,exynos4210-mct.yaml         |  2 ++
+ drivers/clocksource/timer-imx-gpt.c                    |  3 +--
+ drivers/clocksource/timer-stm32.c                      |  4 ++--
+ drivers/clocksource/timer-ti-32k.c                     |  2 +-
+ 5 files changed, 22 insertions(+), 7 deletions(-)
+Merging edac/edac-for-next (5f9d6dfd6c4a Merge ras/edac-amd-atl into for-next)
+$ git merge -m Merge branch 'edac-for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras.git edac/edac-for-next
+Auto-merging MAINTAINERS
+Merge made by the 'ort' strategy.
+ Documentation/RAS/address-translation.rst         |  24 +
+ Documentation/RAS/{ras.rst => error-decoding.rst} |  11 +-
+ Documentation/RAS/index.rst                       |  14 +
+ Documentation/index.rst                           |   2 +-
+ MAINTAINERS                                       |   7 +
+ drivers/edac/Kconfig                              |   1 +
+ drivers/edac/amd64_edac.c                         | 286 +--------
+ drivers/edac/synopsys_edac.c                      |   4 +-
+ drivers/ras/Kconfig                               |   1 +
+ drivers/ras/Makefile                              |   2 +
+ drivers/ras/amd/atl/Kconfig                       |  20 +
+ drivers/ras/amd/atl/Makefile                      |  18 +
+ drivers/ras/amd/atl/access.c                      | 133 ++++
+ drivers/ras/amd/atl/core.c                        | 225 +++++++
+ drivers/ras/amd/atl/dehash.c                      | 500 +++++++++++++++
+ drivers/ras/amd/atl/denormalize.c                 | 719 ++++++++++++++++++++++
+ drivers/ras/amd/atl/internal.h                    | 305 +++++++++
+ drivers/ras/amd/atl/map.c                         | 682 ++++++++++++++++++++
+ drivers/ras/amd/atl/reg_fields.h                  | 606 ++++++++++++++++++
+ drivers/ras/amd/atl/system.c                      | 284 +++++++++
+ drivers/ras/amd/atl/umc.c                         |  92 +++
+ drivers/ras/ras.c                                 |  31 +
+ include/linux/ras.h                               |  16 +
+ 23 files changed, 3694 insertions(+), 289 deletions(-)
+ create mode 100644 Documentation/RAS/address-translation.rst
+ rename Documentation/RAS/{ras.rst => error-decoding.rst} (73%)
+ create mode 100644 Documentation/RAS/index.rst
+ create mode 100644 drivers/ras/amd/atl/Kconfig
+ create mode 100644 drivers/ras/amd/atl/Makefile
+ create mode 100644 drivers/ras/amd/atl/access.c
+ create mode 100644 drivers/ras/amd/atl/core.c
+ create mode 100644 drivers/ras/amd/atl/dehash.c
+ create mode 100644 drivers/ras/amd/atl/denormalize.c
+ create mode 100644 drivers/ras/amd/atl/internal.h
+ create mode 100644 drivers/ras/amd/atl/map.c
+ create mode 100644 drivers/ras/amd/atl/reg_fields.h
+ create mode 100644 drivers/ras/amd/atl/system.c
+ create mode 100644 drivers/ras/amd/atl/umc.c
+Merging ftrace/for-next (4af12c95cbe8 Merge probes/for-next)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git ftrace/for-next
+Merge made by the 'ort' strategy.
+Merging rcu/rcu/next (bc31e6cb27a9 rcu-tasks: Eliminate deadlocks involving do_exit() and RCU tasks)
+$ git merge -m Merge branch 'rcu/next' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git rcu/rcu/next
+Auto-merging Documentation/admin-guide/kernel-parameters.rst
+Auto-merging Documentation/admin-guide/kernel-parameters.txt
+Auto-merging include/linux/sched.h
+Auto-merging kernel/fork.c
+Merge made by the 'ort' strategy.
+ Documentation/RCU/checklist.rst                 |  32 +-
+ Documentation/RCU/rcu_dereference.rst           |   5 +-
+ Documentation/RCU/whatisRCU.rst                 |  19 +-
+ Documentation/admin-guide/kernel-parameters.rst |   1 +
+ Documentation/admin-guide/kernel-parameters.txt | 489 ++++++++++++------------
+ arch/x86/kernel/tsc.c                           |   2 +-
+ fs/proc/bootconfig.c                            |  12 +-
+ include/linux/hrtimer.h                         |   4 +-
+ include/linux/rcu_sync.h                        |   1 -
+ include/linux/rcupdate.h                        |   4 +-
+ include/linux/sched.h                           |   2 +
+ init/init_task.c                                |   1 +
+ kernel/context_tracking.c                       |   4 +
+ kernel/fork.c                                   |   1 +
+ kernel/rcu/Kconfig                              |  13 +
+ kernel/rcu/rcu.h                                |  13 +-
+ kernel/rcu/rcuscale.c                           |   6 +-
+ kernel/rcu/rcutorture.c                         |  13 +-
+ kernel/rcu/srcutree.c                           |  24 +-
+ kernel/rcu/sync.c                               |  16 -
+ kernel/rcu/tasks.h                              |  89 +++--
+ kernel/rcu/tree.c                               | 235 ++++++++----
+ kernel/rcu/tree.h                               |  20 +-
+ kernel/rcu/tree_exp.h                           |  83 +---
+ kernel/rcu/tree_nocb.h                          |  69 ++--
+ kernel/rcu/tree_plugin.h                        |  52 +--
+ kernel/time/hrtimer.c                           |   3 +
+ 27 files changed, 651 insertions(+), 562 deletions(-)
+Merging kvm/next (a9ef277488cf x86/kvm: Fix SEV check in sev_map_percpu_data())
+$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/virt/kvm/kvm.git kvm/next
+Auto-merging arch/x86/kernel/kvm.c
+Merge made by the 'ort' strategy.
+ arch/riscv/include/uapi/asm/kvm.h                |  27 ++++++
+ arch/riscv/kvm/vcpu_onereg.c                     |  54 ++++++++++++
+ arch/x86/include/asm/kvm_host.h                  |   2 +
+ arch/x86/kernel/kvm.c                            |   3 +-
+ arch/x86/kvm/hyperv.c                            |  50 +++++++++++
+ arch/x86/kvm/hyperv.h                            |   3 +
+ arch/x86/kvm/x86.c                               |   7 ++
+ tools/testing/selftests/kvm/riscv/get-reg-list.c | 108 +++++++++++++++++++++++
+ 8 files changed, 253 insertions(+), 1 deletion(-)
+Merging kvm-arm/next (87bbb6a32237 Merge branch kvm-arm64/misc into kvmarm/next)
+$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git kvm-arm/next
+Merge made by the 'ort' strategy.
+ tools/testing/selftests/kvm/aarch64/set_id_regs.c | 18 +++++++++++-------
+ 1 file changed, 11 insertions(+), 7 deletions(-)
+Merging kvms390/next (10f7b1dcdfe0 KVM: s390: cpu model: Use proper define for facility mask size)
+$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git kvms390/next
+Already up to date.
+Merging kvm-ppc/topic/ppc-kvm (180c6b072bf3 KVM: PPC: Book3S HV nestedv2: Do not cancel pending decrementer exception)
+$ git merge -m Merge branch 'topic/ppc-kvm' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git kvm-ppc/topic/ppc-kvm
+Already up to date.
+Merging kvm-riscv/riscv_kvm_next (4d0e8f9a361b KVM: riscv: selftests: Add Zfa extension to get-reg-list test)
+$ git merge -m Merge branch 'riscv_kvm_next' of https://github.com/kvm-riscv/linux.git kvm-riscv/riscv_kvm_next
+Already up to date.
+Merging kvm-x86/next (f0f3b810edda Merge branches 'generic', 'misc', 'pmu' and 'selftests')
+$ git merge -m Merge branch 'next' of https://github.com/kvm-x86/linux.git kvm-x86/next
+Auto-merging arch/x86/kvm/x86.c
+Auto-merging tools/testing/selftests/kvm/riscv/get-reg-list.c
+Merge made by the 'ort' strategy.
+ arch/x86/include/asm/kvm-x86-pmu-ops.h             |   3 +-
+ arch/x86/kvm/emulate.c                             |   2 +-
+ arch/x86/kvm/kvm_emulate.h                         |   2 +-
+ arch/x86/kvm/pmu.c                                 |  20 +-
+ arch/x86/kvm/pmu.h                                 |   5 +-
+ arch/x86/kvm/svm/pmu.c                             |  17 +-
+ arch/x86/kvm/vmx/pmu_intel.c                       | 178 +++---
+ arch/x86/kvm/x86.c                                 |  26 +-
+ tools/testing/selftests/kvm/Makefile               |   2 +
+ tools/testing/selftests/kvm/aarch64/arch_timer.c   |  12 +-
+ tools/testing/selftests/kvm/aarch64/hypercalls.c   |  16 +-
+ .../selftests/kvm/aarch64/page_fault_test.c        |   6 +-
+ tools/testing/selftests/kvm/aarch64/smccc_filter.c |   2 +-
+ .../selftests/kvm/aarch64/vpmu_counter_access.c    |  12 +-
+ tools/testing/selftests/kvm/demand_paging_test.c   |   4 +-
+ tools/testing/selftests/kvm/dirty_log_perf_test.c  |   4 +-
+ tools/testing/selftests/kvm/dirty_log_test.c       |   4 +-
+ tools/testing/selftests/kvm/get-reg-list.c         |   2 +-
+ tools/testing/selftests/kvm/guest_print_test.c     |   8 +-
+ .../testing/selftests/kvm/hardware_disable_test.c  |   6 +-
+ .../testing/selftests/kvm/include/kvm_util_base.h  |   4 +
+ tools/testing/selftests/kvm/include/test_util.h    |   2 +
+ tools/testing/selftests/kvm/include/x86_64/pmu.h   |  97 ++++
+ .../selftests/kvm/include/x86_64/processor.h       | 150 +++--
+ tools/testing/selftests/kvm/kvm_create_max_vcpus.c |   2 +-
+ tools/testing/selftests/kvm/kvm_page_table_test.c  |   4 +-
+ .../testing/selftests/kvm/lib/aarch64/processor.c  |   2 +-
+ tools/testing/selftests/kvm/lib/aarch64/vgic.c     |   4 +-
+ tools/testing/selftests/kvm/lib/elf.c              |   2 +-
+ tools/testing/selftests/kvm/lib/kvm_util.c         |  81 ++-
+ tools/testing/selftests/kvm/lib/memstress.c        |   2 +-
+ tools/testing/selftests/kvm/lib/riscv/processor.c  |   2 +-
+ tools/testing/selftests/kvm/lib/s390x/processor.c  |   2 +-
+ tools/testing/selftests/kvm/lib/test_util.c        |  25 +
+ tools/testing/selftests/kvm/lib/userfaultfd_util.c |   2 +-
+ tools/testing/selftests/kvm/lib/x86_64/pmu.c       |  31 ++
+ tools/testing/selftests/kvm/lib/x86_64/processor.c |  36 +-
+ tools/testing/selftests/kvm/lib/x86_64/vmx.c       |   6 +-
+ .../kvm/memslot_modification_stress_test.c         |   2 +-
+ tools/testing/selftests/kvm/memslot_perf_test.c    |   6 +-
+ tools/testing/selftests/kvm/riscv/get-reg-list.c   |   2 +-
+ tools/testing/selftests/kvm/rseq_test.c            |   4 +-
+ tools/testing/selftests/kvm/s390x/resets.c         |   4 +-
+ tools/testing/selftests/kvm/s390x/sync_regs_test.c |  20 +-
+ .../testing/selftests/kvm/set_memory_region_test.c |   6 +-
+ .../selftests/kvm/system_counter_offset_test.c     |   2 +-
+ tools/testing/selftests/kvm/x86_64/amx_test.c      |   6 +-
+ tools/testing/selftests/kvm/x86_64/cpuid_test.c    |   4 +-
+ .../testing/selftests/kvm/x86_64/flds_emulation.h  |   2 +-
+ tools/testing/selftests/kvm/x86_64/hyperv_clock.c  |   5 +-
+ .../testing/selftests/kvm/x86_64/hyperv_features.c |   9 +-
+ tools/testing/selftests/kvm/x86_64/hyperv_ipi.c    |   2 +-
+ .../selftests/kvm/x86_64/hyperv_tlb_flush.c        |   2 +-
+ .../testing/selftests/kvm/x86_64/kvm_clock_test.c  |  42 +-
+ .../selftests/kvm/x86_64/nx_huge_pages_test.c      |   6 +-
+ .../selftests/kvm/x86_64/platform_info_test.c      |   2 +-
+ .../selftests/kvm/x86_64/pmu_counters_test.c       | 617 +++++++++++++++++++++
+ .../selftests/kvm/x86_64/pmu_event_filter_test.c   | 145 ++---
+ .../selftests/kvm/x86_64/sev_migrate_tests.c       |  28 +-
+ .../kvm/x86_64/smaller_maxphyaddr_emulation_test.c |   6 +-
+ .../testing/selftests/kvm/x86_64/sync_regs_test.c  |  10 +-
+ .../selftests/kvm/x86_64/ucna_injection_test.c     |   8 +-
+ .../selftests/kvm/x86_64/userspace_io_test.c       |   2 +-
+ .../selftests/kvm/x86_64/userspace_msr_exit_test.c |  29 +-
+ .../selftests/kvm/x86_64/vmx_apic_access_test.c    |   2 +-
+ .../selftests/kvm/x86_64/vmx_dirty_log_test.c      |  16 +-
+ .../vmx_exception_with_invalid_guest_state.c       |   2 +-
+ .../kvm/x86_64/vmx_nested_tsc_scaling_test.c       |  19 +-
+ .../selftests/kvm/x86_64/vmx_pmu_caps_test.c       |   2 +-
+ .../testing/selftests/kvm/x86_64/xapic_ipi_test.c  |   8 +-
+ .../testing/selftests/kvm/x86_64/xcr0_cpuid_test.c |   2 +-
+ .../testing/selftests/kvm/x86_64/xen_shinfo_test.c |  36 +-
+ tools/testing/selftests/kvm/x86_64/xss_msr_test.c  |   2 +-
+ virt/kvm/kvm_main.c                                |   4 +-
+ 74 files changed, 1315 insertions(+), 534 deletions(-)
+ create mode 100644 tools/testing/selftests/kvm/include/x86_64/pmu.h
+ create mode 100644 tools/testing/selftests/kvm/lib/x86_64/pmu.c
+ create mode 100644 tools/testing/selftests/kvm/x86_64/pmu_counters_test.c
+Merging xen-tip/linux-next (2d2db7d40254 xen/gntdev: Fix the abuse of underlying struct page in DMA-buf import)
+$ git merge -m Merge branch 'linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip.git xen-tip/linux-next
+Already up to date.
+Merging percpu/for-next (2d9ad81ef935 Merge branch 'for-6.8-fixes' into for-next)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/dennis/percpu.git percpu/for-next
+Merge made by the 'ort' strategy.
+ arch/riscv/mm/tlbflush.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+Merging workqueues/for-next (15930da42f89 workqueue: Don't call cpumask_test_cpu() with -1 CPU in wq_update_node_max_active())
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git workqueues/for-next
+Merge made by the 'ort' strategy.
+ include/linux/workqueue.h  |  76 +++--
+ kernel/workqueue.c         | 816 +++++++++++++++++++++++++++++++++++++--------
+ tools/workqueue/wq_dump.py |  95 +++++-
+ 3 files changed, 813 insertions(+), 174 deletions(-)
+Merging drivers-x86/for-next (3f399b5d7189 platform/x86: wmi: Use ACPI device name in netlink event)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/pdx86/platform-drivers-x86.git drivers-x86/for-next
+Auto-merging drivers/platform/x86/wmi.c
+Merge made by the 'ort' strategy.
+ .../admin-guide/laptops/thinkpad-acpi.rst          |  7 +++++-
+ drivers/platform/mellanox/mlxbf-bootctl.c          | 14 +++++------
+ drivers/platform/x86/Kconfig                       |  6 -----
+ drivers/platform/x86/asus-wmi.c                    |  1 -
+ drivers/platform/x86/dell/Kconfig                  |  3 ---
+ drivers/platform/x86/dell/dell-laptop.c            |  2 --
+ drivers/platform/x86/dell/dell-wmi-privacy.c       |  1 -
+ drivers/platform/x86/huawei-wmi.c                  |  1 -
+ drivers/platform/x86/silicom-platform.c            |  7 +-----
+ drivers/platform/x86/thinkpad_acpi.c               | 29 ++++++++++++++++------
+ drivers/platform/x86/wmi.c                         |  2 +-
+ 11 files changed, 37 insertions(+), 36 deletions(-)
+Merging chrome-platform/for-next (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/chrome-platform/linux.git chrome-platform/for-next
+Already up to date.
+Merging chrome-platform-firmware/for-firmware-next (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'for-firmware-next' of git://git.kernel.org/pub/scm/linux/kernel/git/chrome-platform/linux.git chrome-platform-firmware/for-firmware-next
+Already up to date.
+Merging hsi/for-next (fa72d143471d HSI: omap_ssi: Remove usage of the deprecated ida_simple_xx() API)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/sre/linux-hsi.git hsi/for-next
+Already up to date.
+Merging leds-lj/for-leds-next (54602f38551e leds: Make flash and multicolor dependencies unconditional)
+$ git merge -m Merge branch 'for-leds-next' of git://git.kernel.org/pub/scm/linux/kernel/git/lee/leds.git leds-lj/for-leds-next
+Merge made by the 'ort' strategy.
+ .../ABI/testing/sysfs-class-led-trigger-netdev     |  12 +
+ .../ABI/testing/sysfs-class-led-trigger-tty        |  14 +-
+ .../devicetree/bindings/leds/leds-qcom-lpg.yaml    |  82 ++++-
+ drivers/leds/Kconfig                               |   4 +-
+ drivers/leds/flash/Kconfig                         |   4 +-
+ drivers/leds/led-class.c                           |   6 +
+ drivers/leds/led-triggers.c                        |   9 +
+ drivers/leds/leds-aw200xx.c                        |   2 +-
+ drivers/leds/rgb/leds-qcom-lpg.c                   | 346 +++++++++++++++++++--
+ drivers/leds/trigger/ledtrig-audio.c               |   2 +
+ drivers/leds/trigger/ledtrig-default-on.c          |   1 +
+ drivers/leds/trigger/ledtrig-netdev.c              |  98 +++++-
+ drivers/leds/trigger/ledtrig-panic.c               |  23 +-
+ drivers/staging/greybus/Kconfig                    |   2 +-
+ drivers/staging/greybus/light.c                    |  21 --
+ include/dt-bindings/leds/common.h                  |   3 +
+ include/linux/led-class-flash.h                    |  24 --
+ include/linux/led-class-multicolor.h               |  29 --
+ include/linux/leds.h                               |  19 --
+ 19 files changed, 541 insertions(+), 160 deletions(-)
+$ git reset --hard HEAD^
+HEAD is now at 4b7d6cc566fd Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/pdx86/platform-drivers-x86.git
+Merging next-20240125 version of leds-lj
+$ git merge -m next-20240125/leds-lj 4289e434c46c8cbd32cf8b67fa7689b3d2ca4361
+Already up to date.
+Merging ipmi/for-next (296455ade1fd Merge tag 'char-misc-6.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc)
+$ git merge -m Merge branch 'for-next' of git://github.com/cminyard/linux-ipmi.git ipmi/for-next
+Already up to date.
+Merging driver-core/driver-core-next (f297a3844aa0 driver core: component: fix spellos)
+$ git merge -m Merge branch 'driver-core-next' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/driver-core.git driver-core/driver-core-next
+Auto-merging drivers/base/cpu.c
+Merge made by the 'ort' strategy.
+ drivers/base/component.c    |  4 ++--
+ drivers/base/cpu.c          |  2 +-
+ fs/kernfs/dir.c             | 31 ++++++++++++++++++++-----------
+ fs/kernfs/file.c            |  8 +++++---
+ fs/kernfs/kernfs-internal.h |  2 ++
+ include/linux/cpu.h         |  2 +-
+ include/linux/kernfs.h      | 10 ++++++----
+ 7 files changed, 37 insertions(+), 22 deletions(-)
+Merging usb/usb-next (f1a27f081c1f usb: typec: qcom-pmic-typec: allow different implementations for the port backend)
+$ git merge -m Merge branch 'usb-next' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git usb/usb-next
+Auto-merging Documentation/usb/gadget-testing.rst
+Auto-merging drivers/usb/core/hub.c
+Auto-merging drivers/usb/dwc3/host.c
+Auto-merging drivers/usb/host/xhci.h
+Auto-merging drivers/usb/typec/tcpm/tcpm.c
+Auto-merging tools/testing/selftests/Makefile
+Merge made by the 'ort' strategy.
+ Documentation/ABI/testing/configfs-usb-gadget-ffs  |   12 +-
+ .../devicetree/bindings/usb/ci-hdrc-usb2.yaml      |    2 +-
+ .../devicetree/bindings/usb/fcs,fsa4480.yaml       |   12 +-
+ .../devicetree/bindings/usb/generic-ehci.yaml      |    1 +
+ .../devicetree/bindings/usb/gpio-sbu-mux.yaml      |   12 +-
+ .../devicetree/bindings/usb/ite,it5205.yaml        |   72 ++
+ .../devicetree/bindings/usb/mediatek,mtu3.yaml     |    5 +-
+ .../devicetree/bindings/usb/nxp,ptn36502.yaml      |   12 +-
+ .../devicetree/bindings/usb/onnn,nb7vpq904m.yaml   |   13 +-
+ .../bindings/usb/qcom,wcd939x-usbss.yaml           |   12 +-
+ .../devicetree/bindings/usb/snps,dwc3.yaml         |    7 +
+ .../devicetree/bindings/usb/usb-nop-xceiv.yaml     |   11 +-
+ .../devicetree/bindings/usb/usb-switch.yaml        |   67 +
+ Documentation/devicetree/bindings/usb/usb.yaml     |    2 +
+ Documentation/usb/gadget-testing.rst               |    8 +
+ drivers/phy/Kconfig                                |    1 +
+ drivers/phy/Makefile                               |    1 +
+ drivers/phy/phy-core.c                             |   47 +
+ drivers/phy/realtek/Kconfig                        |   32 +
+ drivers/phy/realtek/Makefile                       |    3 +
+ drivers/phy/realtek/phy-rtk-usb2.c                 | 1312 ++++++++++++++++++++
+ drivers/phy/realtek/phy-rtk-usb3.c                 |  748 +++++++++++
+ drivers/usb/core/Kconfig                           |   17 +
+ drivers/usb/core/driver.c                          |    8 +-
+ drivers/usb/core/hcd.c                             |   20 +-
+ drivers/usb/core/hub.c                             |   29 +
+ drivers/usb/core/phy.c                             |  120 ++
+ drivers/usb/core/phy.h                             |    3 +
+ drivers/usb/dwc3/core.c                            |    3 +
+ drivers/usb/dwc3/core.h                            |    2 +
+ drivers/usb/dwc3/dwc3-of-simple.c                  |    3 +-
+ drivers/usb/dwc3/host.c                            |   51 +
+ drivers/usb/gadget/function/f_fs.c                 |   20 +
+ drivers/usb/host/ehci-orion.c                      |   18 +-
+ drivers/usb/host/xhci-caps.h                       |   85 ++
+ drivers/usb/host/xhci-port.h                       |  176 +++
+ drivers/usb/host/xhci.h                            |  262 +---
+ drivers/usb/mtu3/mtu3_host.c                       |   30 +
+ drivers/usb/phy/phy-generic.c                      |   55 +-
+ drivers/usb/storage/sddr55.c                       |    4 +-
+ drivers/usb/typec/altmodes/displayport.c           |  162 ++-
+ drivers/usb/typec/bus.c                            |  102 ++
+ drivers/usb/typec/class.c                          |   59 +
+ drivers/usb/typec/class.h                          |    1 +
+ drivers/usb/typec/mux/Kconfig                      |   10 +
+ drivers/usb/typec/mux/Makefile                     |    1 +
+ drivers/usb/typec/mux/it5205.c                     |  294 +++++
+ drivers/usb/typec/tcpm/fusb302.c                   |    2 +-
+ drivers/usb/typec/tcpm/qcom/qcom_pmic_typec.c      |  228 +---
+ drivers/usb/typec/tcpm/qcom/qcom_pmic_typec.h      |   27 +
+ .../usb/typec/tcpm/qcom/qcom_pmic_typec_pdphy.c    |  159 ++-
+ .../usb/typec/tcpm/qcom/qcom_pmic_typec_pdphy.h    |   92 +-
+ drivers/usb/typec/tcpm/qcom/qcom_pmic_typec_port.c |  290 ++++-
+ drivers/usb/typec/tcpm/qcom/qcom_pmic_typec_port.h |  172 +--
+ drivers/usb/typec/tcpm/tcpci.c                     |   26 +-
+ drivers/usb/typec/tcpm/tcpci_maxim.h               |    1 +
+ drivers/usb/typec/tcpm/tcpci_maxim_core.c          |   38 +-
+ drivers/usb/typec/tcpm/tcpm.c                      | 1028 ++++++++++++---
+ drivers/usb/typec/tcpm/wcove.c                     |    2 +-
+ drivers/usb/typec/ucsi/ucsi_ccg.c                  |   92 +-
+ include/linux/phy/phy.h                            |   21 +
+ include/linux/usb/audio-v2.h                       |    4 +-
+ include/linux/usb/pd.h                             |    1 +
+ include/linux/usb/pd_vdo.h                         |    8 +-
+ include/linux/usb/tcpci.h                          |   13 +
+ include/linux/usb/tcpm.h                           |   16 +-
+ include/linux/usb/typec.h                          |    7 +
+ include/linux/usb/typec_altmode.h                  |   30 +
+ include/uapi/linux/usb/ch9.h                       |    2 +
+ tools/testing/selftests/Makefile                   |    1 +
+ tools/testing/selftests/devices/Makefile           |    4 +
+ .../devices/boards/Dell Inc.,XPS 13 9300.yaml      |   40 +
+ .../selftests/devices/boards/google,spherion.yaml  |   50 +
+ tools/testing/selftests/devices/ksft.py            |   90 ++
+ .../selftests/devices/test_discoverable_devices.py |  318 +++++
+ 75 files changed, 5652 insertions(+), 1037 deletions(-)
+ create mode 100644 Documentation/devicetree/bindings/usb/ite,it5205.yaml
+ create mode 100644 Documentation/devicetree/bindings/usb/usb-switch.yaml
+ create mode 100644 drivers/phy/realtek/Kconfig
+ create mode 100644 drivers/phy/realtek/Makefile
+ create mode 100644 drivers/phy/realtek/phy-rtk-usb2.c
+ create mode 100644 drivers/phy/realtek/phy-rtk-usb3.c
+ create mode 100644 drivers/usb/host/xhci-caps.h
+ create mode 100644 drivers/usb/host/xhci-port.h
+ create mode 100644 drivers/usb/typec/mux/it5205.c
+ create mode 100644 drivers/usb/typec/tcpm/qcom/qcom_pmic_typec.h
+ create mode 100644 tools/testing/selftests/devices/Makefile
+ create mode 100644 tools/testing/selftests/devices/boards/Dell Inc.,XPS 13 9300.yaml
+ create mode 100644 tools/testing/selftests/devices/boards/google,spherion.yaml
+ create mode 100644 tools/testing/selftests/devices/ksft.py
+ create mode 100755 tools/testing/selftests/devices/test_discoverable_devices.py
+Merging thunderbolt/next (dec6a613574c thunderbolt: Remove usage of the deprecated ida_simple_xx() API)
+$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/westeri/thunderbolt.git thunderbolt/next
+Auto-merging drivers/thunderbolt/tb_regs.h
+Auto-merging drivers/thunderbolt/usb4.c
+Merge made by the 'ort' strategy.
+ drivers/thunderbolt/domain.c  |  11 ++--
+ drivers/thunderbolt/icm.c     |   2 +-
+ drivers/thunderbolt/lc.c      |  45 ++++++++++++++
+ drivers/thunderbolt/nhi.c     |  25 +++++---
+ drivers/thunderbolt/nvm.c     |   4 +-
+ drivers/thunderbolt/path.c    |  13 ++++
+ drivers/thunderbolt/switch.c  | 138 ++++++++++++++++++++++++++++++++++++------
+ drivers/thunderbolt/tb.c      |  26 +++++---
+ drivers/thunderbolt/tb.h      |   7 ++-
+ drivers/thunderbolt/tb_regs.h |   6 ++
+ drivers/thunderbolt/usb4.c    |  39 ++++++++++++
+ drivers/thunderbolt/xdomain.c |  12 ++--
+ 12 files changed, 278 insertions(+), 50 deletions(-)
+Merging usb-serial/usb-next (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'usb-next' of git://git.kernel.org/pub/scm/linux/kernel/git/johan/usb-serial.git usb-serial/usb-next
+Already up to date.
+Merging tty/tty-next (fccc9d9233f9 tty: serial: uartps: Add rs485 support to uartps driver)
+$ git merge -m Merge branch 'tty-next' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/tty.git tty/tty-next
+Auto-merging drivers/bluetooth/btnxpuart.c
+Auto-merging drivers/mfd/rave-sp.c
+Auto-merging drivers/net/ethernet/qualcomm/qca_uart.c
+Auto-merging drivers/tty/serial/8250/8250_pci1xxxx.c
+Auto-merging drivers/tty/serial/max310x.c
+Auto-merging drivers/video/fbdev/core/fbcon.c
+Merge made by the 'ort' strategy.
+ .../devicetree/bindings/serial/cdns,uart.yaml      |   1 +
+ .../devicetree/bindings/serial/fsl-lpuart.yaml     |   1 +
+ .../devicetree/bindings/serial/renesas,hscif.yaml  |   1 +
+ .../devicetree/bindings/serial/samsung_uart.yaml   |   2 +
+ Documentation/driver-api/tty/console.rst           |  45 ++
+ Documentation/driver-api/tty/index.rst             |   1 +
+ arch/m68k/amiga/config.c                           |   2 +-
+ arch/m68k/hp300/config.c                           |   6 +-
+ drivers/bluetooth/btmtkuart.c                      |   4 +-
+ drivers/bluetooth/btnxpuart.c                      |   4 +-
+ drivers/bluetooth/hci_serdev.c                     |   4 +-
+ drivers/gnss/serial.c                              |   2 +-
+ drivers/gnss/sirf.c                                |   2 +-
+ drivers/greybus/gb-beagleplay.c                    |   6 +-
+ drivers/iio/chemical/pms7003.c                     |   4 +-
+ drivers/iio/chemical/scd30_serial.c                |   4 +-
+ drivers/iio/chemical/sps30_serial.c                |   4 +-
+ drivers/iio/imu/bno055/bno055_ser_core.c           |   4 +-
+ drivers/input/keyboard/amikbd.c                    |   6 +-
+ drivers/mfd/rave-sp.c                              |   4 +-
+ drivers/net/ethernet/qualcomm/qca_uart.c           |   2 +-
+ drivers/nfc/pn533/uart.c                           |   4 +-
+ drivers/nfc/s3fwrn5/uart.c                         |   4 +-
+ drivers/platform/chrome/cros_ec_uart.c             |   4 +-
+ drivers/platform/surface/aggregator/core.c         |   4 +-
+ drivers/tty/Kconfig                                |   7 +-
+ drivers/tty/serdev/serdev-ttyport.c                |  10 +-
+ drivers/tty/serial/8250/8250_pci1xxxx.c            | 140 ++++-
+ drivers/tty/serial/8250/8250_port.c                |   7 +
+ drivers/tty/serial/amba-pl011.c                    |  82 ---
+ drivers/tty/serial/fsl_linflexuart.c               |   1 -
+ drivers/tty/serial/max310x.c                       | 327 +++++-----
+ drivers/tty/serial/qcom_geni_serial.c              |  27 +-
+ drivers/tty/serial/samsung_tty.c                   | 251 ++++----
+ drivers/tty/serial/serial_txx9.c                   |   3 +-
+ drivers/tty/serial/stm32-usart.c                   | 223 ++++---
+ drivers/tty/serial/stm32-usart.h                   |  38 +-
+ drivers/tty/serial/xilinx_uartps.c                 | 236 +++++++-
+ drivers/tty/vt/Makefile                            |   4 +-
+ drivers/tty/vt/selection.c                         |  43 +-
+ drivers/tty/vt/vt.c                                | 659 ++++++++++-----------
+ drivers/tty/vt/vt_ioctl.c                          |   6 +-
+ drivers/video/console/dummycon.c                   |  38 +-
+ drivers/video/console/mdacon.c                     |  43 +-
+ drivers/video/console/newport_con.c                |  69 +--
+ drivers/video/console/sticon.c                     |  79 +--
+ drivers/video/console/vgacon.c                     | 152 +++--
+ drivers/video/fbdev/core/bitblit.c                 |  13 +-
+ drivers/video/fbdev/core/fbcon.c                   | 123 ++--
+ drivers/video/fbdev/core/fbcon.h                   |   4 +-
+ drivers/video/fbdev/core/fbcon_ccw.c               |  13 +-
+ drivers/video/fbdev/core/fbcon_cw.c                |  13 +-
+ drivers/video/fbdev/core/fbcon_ud.c                |  13 +-
+ drivers/video/fbdev/core/tileblit.c                |   4 +-
+ drivers/video/fbdev/tgafb.c                        |   2 +-
+ include/linux/console.h                            | 126 ++--
+ include/linux/console_struct.h                     |   1 -
+ include/linux/selection.h                          |  48 +-
+ include/linux/serdev.h                             |   8 +-
+ include/linux/serial_8250.h                        |   6 +
+ include/linux/soc/qcom/geni-se.h                   |   1 +
+ include/linux/vt_kern.h                            |  12 +-
+ include/uapi/linux/fb.h                            |   8 +-
+ include/uapi/linux/vesa.h                          |  18 +
+ lib/Kconfig.kgdb                                   |   2 +-
+ sound/drivers/serial-generic.c                     |   4 +-
+ 66 files changed, 1654 insertions(+), 1335 deletions(-)
+ create mode 100644 Documentation/driver-api/tty/console.rst
+ create mode 100644 include/uapi/linux/vesa.h
+Merging char-misc/char-misc-next (390b60f7638a mei: pxp: add dependency on Xe driver)
+$ git merge -m Merge branch 'char-misc-next' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git char-misc/char-misc-next
+Merge made by the 'ort' strategy.
+ drivers/misc/hpilo.c             |  8 ++++----
+ drivers/misc/mei/gsc-me.c        |  5 +++++
+ drivers/misc/mei/hdcp/Kconfig    |  2 +-
+ drivers/misc/mei/hdcp/mei_hdcp.c | 14 ++++++++++++--
+ drivers/misc/mei/pxp/Kconfig     |  2 +-
+ drivers/misc/mei/pxp/mei_pxp.c   | 14 ++++++++++++--
+ 6 files changed, 35 insertions(+), 10 deletions(-)
+Merging accel/habanalabs-next (dddb2e526a36 accel/habanalabs: use kcalloc() instead of kzalloc())
+$ git merge -m Merge branch 'habanalabs-next' of git://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux.git accel/habanalabs-next
+Merge made by the 'ort' strategy.
+ .../accel/habanalabs/common/command_submission.c   |   3 +-
+ drivers/accel/habanalabs/common/device.c           |  49 ++-
+ drivers/accel/habanalabs/common/habanalabs.h       |  33 +-
+ drivers/accel/habanalabs/common/hw_queue.c         |  17 +
+ drivers/accel/habanalabs/common/mmu/Makefile       |   2 +-
+ drivers/accel/habanalabs/common/mmu/mmu.c          | 223 ++++++++++++-
+ drivers/accel/habanalabs/common/mmu/mmu_v1.c       | 354 +++------------------
+ drivers/accel/habanalabs/common/mmu/mmu_v2.c       | 338 ++++++++++++++++++++
+ drivers/accel/habanalabs/gaudi/gaudi.c             |   1 +
+ drivers/accel/habanalabs/gaudi2/gaudi2.c           | 258 +++++++++++----
+ drivers/accel/habanalabs/gaudi2/gaudi2P.h          |  12 +-
+ drivers/accel/habanalabs/goya/goya_coresight.c     |   3 +-
+ .../habanalabs/include/hw_ip/mmu/mmu_general.h     |   2 +
+ 13 files changed, 899 insertions(+), 396 deletions(-)
+ create mode 100644 drivers/accel/habanalabs/common/mmu/mmu_v2.c
+Merging coresight/next (60e5f23dc5d6 coresight: ultrasoc-smb: Use guards to cleanup)
+$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/coresight/linux.git coresight/next
+Already up to date.
+Merging fastrpc/for-next (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/srini/fastrpc.git fastrpc/for-next
+Already up to date.
+Merging fpga/for-next (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/fpga/linux-fpga.git fpga/for-next
+Already up to date.
+Merging icc/icc-next (7158ba962f41 Merge branch 'icc-fixes' into icc-next)
+$ git merge -m Merge branch 'icc-next' of git://git.kernel.org/pub/scm/linux/kernel/git/djakov/icc.git icc/icc-next
+Merge made by the 'ort' strategy.
+ .../devicetree/bindings/interconnect/qcom,rpm.yaml |    3 +
+ drivers/interconnect/qcom/Kconfig                  |    9 +
+ drivers/interconnect/qcom/Makefile                 |    2 +
+ drivers/interconnect/qcom/msm8909.c                | 1329 ++++++++++++++++++++
+ drivers/interconnect/qcom/sc8180x.c                |    1 +
+ drivers/interconnect/qcom/sm8550.c                 |  575 +--------
+ drivers/interconnect/qcom/sm8550.h                 |  284 ++---
+ drivers/interconnect/qcom/x1e80100.c               |  315 -----
+ include/dt-bindings/interconnect/qcom,msm8909.h    |   93 ++
+ .../dt-bindings/interconnect/qcom,x1e80100-rpmh.h  |   24 -
+ 10 files changed, 1560 insertions(+), 1075 deletions(-)
+ create mode 100644 drivers/interconnect/qcom/msm8909.c
+ create mode 100644 include/dt-bindings/interconnect/qcom,msm8909.h
+Merging iio/togreg (a0295c1bd4a7 iio: frequency: admfm2000: New driver)
+$ git merge -m Merge branch 'togreg' of git://git.kernel.org/pub/scm/linux/kernel/git/jic23/iio.git iio/togreg
+Auto-merging MAINTAINERS
+Auto-merging drivers/iio/industrialio-core.c
+Merge made by the 'ort' strategy.
+ .../devicetree/bindings/iio/adc/adi,ad7380.yaml    |  86 ++++
+ .../bindings/iio/adc/richtek,rtq6056.yaml          |   9 +-
+ .../bindings/iio/frequency/adi,admfm2000.yaml      | 127 +++++
+ .../devicetree/bindings/iio/light/ams,as73211.yaml |   7 +-
+ .../iio/pressure/honeywell,mprls0025pa.yaml        |  98 +++-
+ MAINTAINERS                                        |  24 +-
+ drivers/iio/accel/Kconfig                          |   8 +-
+ drivers/iio/accel/Makefile                         |   1 +
+ drivers/iio/accel/bmc150-accel-i2c.c               |   2 +-
+ drivers/iio/accel/bmc150-accel-spi.c               |   3 +-
+ drivers/iio/accel/bmi088-accel-i2c.c               |  70 +++
+ drivers/iio/accel/da280.c                          |  66 ++-
+ drivers/iio/accel/kxcjk-1013.c                     |  33 +-
+ drivers/iio/accel/mma9551.c                        |   4 +-
+ drivers/iio/accel/mma9553.c                        |   4 +-
+ drivers/iio/accel/mxc4005.c                        |   4 +-
+ drivers/iio/accel/mxc6255.c                        |   4 +-
+ drivers/iio/accel/st_accel_i2c.c                   |   5 +-
+ drivers/iio/accel/stk8ba50.c                       |   4 +-
+ drivers/iio/adc/Kconfig                            |  16 +
+ drivers/iio/adc/Makefile                           |   1 +
+ drivers/iio/adc/ad7380.c                           | 462 +++++++++++++++++++
+ drivers/iio/adc/ad_sigma_delta.c                   |   7 +-
+ drivers/iio/adc/rtq6056.c                          | 275 ++++++++++-
+ drivers/iio/adc/ti-adc108s102.c                    |   4 +-
+ drivers/iio/adc/ti-ads1015.c                       |   2 +-
+ drivers/iio/buffer/industrialio-buffer-dmaengine.c |   3 +-
+ .../iio/common/inv_sensors/inv_sensors_timestamp.c |   2 +-
+ drivers/iio/dummy/iio_dummy_evgen.c                |   2 -
+ drivers/iio/frequency/Kconfig                      |  10 +
+ drivers/iio/frequency/Makefile                     |   1 +
+ drivers/iio/frequency/admfm2000.c                  | 282 +++++++++++
+ drivers/iio/gyro/bmg160_i2c.c                      |   4 +-
+ drivers/iio/health/afe4403.c                       |  65 +--
+ drivers/iio/health/afe4404.c                       |  65 +--
+ drivers/iio/humidity/hts221_i2c.c                  |   4 +-
+ drivers/iio/imu/adis16475.c                        |   8 +-
+ drivers/iio/imu/adis16480.c                        |   9 +-
+ drivers/iio/imu/fxos8700_i2c.c                     |   3 +-
+ drivers/iio/imu/fxos8700_spi.c                     |   3 +-
+ drivers/iio/imu/kmx61.c                            |   2 +-
+ drivers/iio/industrialio-core.c                    |   4 +-
+ drivers/iio/light/Kconfig                          |   5 +-
+ drivers/iio/light/as73211.c                        | 142 ++++--
+ drivers/iio/light/jsa1212.c                        |   4 +-
+ drivers/iio/light/ltr501.c                         |   3 +-
+ drivers/iio/light/max44000.c                       |   6 +-
+ drivers/iio/light/rpr0521.c                        |   4 +-
+ drivers/iio/light/stk3310.c                        |   4 +-
+ drivers/iio/light/us5182d.c                        |   4 +-
+ drivers/iio/light/vcnl4000.c                       |  36 +-
+ drivers/iio/magnetometer/bmc150_magn_i2c.c         |   3 +-
+ drivers/iio/magnetometer/bmc150_magn_spi.c         |   3 +-
+ drivers/iio/magnetometer/mmc35240.c                |   4 +-
+ drivers/iio/potentiometer/max5487.c                |   4 +-
+ drivers/iio/pressure/Kconfig                       |  14 +-
+ drivers/iio/pressure/Makefile                      |   2 +
+ drivers/iio/pressure/hp206c.c                      |   6 +-
+ drivers/iio/pressure/mprls0025pa.c                 | 313 ++++++-------
+ drivers/iio/pressure/mprls0025pa.h                 | 102 ++++
+ drivers/iio/pressure/mprls0025pa_i2c.c             | 100 ++++
+ drivers/iio/pressure/mprls0025pa_spi.c             |  92 ++++
+ drivers/iio/pressure/st_pressure_i2c.c             |   5 +-
+ drivers/iio/test/Kconfig                           |  14 +
+ drivers/iio/test/Makefile                          |   1 +
+ drivers/iio/test/iio-test-gts.c                    | 513 +++++++++++++++++++++
+ tools/iio/iio_utils.c                              |   2 +-
+ 67 files changed, 2724 insertions(+), 455 deletions(-)
+ create mode 100644 Documentation/devicetree/bindings/iio/adc/adi,ad7380.yaml
+ create mode 100644 Documentation/devicetree/bindings/iio/frequency/adi,admfm2000.yaml
+ create mode 100644 drivers/iio/accel/bmi088-accel-i2c.c
+ create mode 100644 drivers/iio/adc/ad7380.c
+ create mode 100644 drivers/iio/frequency/admfm2000.c
+ create mode 100644 drivers/iio/pressure/mprls0025pa.h
+ create mode 100644 drivers/iio/pressure/mprls0025pa_i2c.c
+ create mode 100644 drivers/iio/pressure/mprls0025pa_spi.c
+ create mode 100644 drivers/iio/test/iio-test-gts.c
+Merging phy-next/next (25ee21fc97db phy: qcom: sgmii-eth: move PCS registers to separate header)
+$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/phy/linux-phy.git phy-next/next
+Auto-merging drivers/phy/qualcomm/phy-qcom-qmp-usb.c
+CONFLICT (content): Merge conflict in drivers/phy/qualcomm/phy-qcom-qmp-usb.c
+Resolved 'drivers/phy/qualcomm/phy-qcom-qmp-usb.c' using previous resolution.
+Automatic merge failed; fix conflicts and then commit the result.
+$ git commit --no-edit -v -a
+[master b4386f156a7c] Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/phy/linux-phy.git
+$ git diff -M --stat --summary HEAD^..
+ .../bindings/phy/qcom,msm8998-qmp-usb3-phy.yaml    |  184 ++++
+ .../bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml   |    6 +
+ .../bindings/phy/qcom,sc8280xp-qmp-ufs-phy.yaml    |    2 +
+ .../phy/qcom,sc8280xp-qmp-usb3-uni-phy.yaml        |   22 -
+ drivers/phy/marvell/phy-armada38x-comphy.c         |    7 +-
+ drivers/phy/qualcomm/Makefile                      |    2 +-
+ drivers/phy/qualcomm/phy-qcom-edp.c                |    3 +-
+ drivers/phy/qualcomm/phy-qcom-qmp-combo.c          |  109 +-
+ drivers/phy/qualcomm/phy-qcom-qmp-common.h         |   59 +
+ drivers/phy/qualcomm/phy-qcom-qmp-dp-com-v3.h      |   18 +
+ drivers/phy/qualcomm/phy-qcom-qmp-dp-phy-v3.h      |   21 +
+ drivers/phy/qualcomm/phy-qcom-qmp-dp-phy-v4.h      |   19 +
+ drivers/phy/qualcomm/phy-qcom-qmp-dp-phy-v5.h      |   13 +
+ drivers/phy/qualcomm/phy-qcom-qmp-dp-phy-v6.h      |   13 +
+ drivers/phy/qualcomm/phy-qcom-qmp-dp-phy.h         |   62 ++
+ drivers/phy/qualcomm/phy-qcom-qmp-pcie-msm8996.c   |   70 +-
+ drivers/phy/qualcomm/phy-qcom-qmp-pcie.c           |  288 +++--
+ drivers/phy/qualcomm/phy-qcom-qmp-pcs-pcie-v6.h    |    2 +
+ drivers/phy/qualcomm/phy-qcom-qmp-pcs-pcie-v6_20.h |    2 +
+ drivers/phy/qualcomm/phy-qcom-qmp-pcs-sgmii.h      |   20 +
+ drivers/phy/qualcomm/phy-qcom-qmp-pcs-ufs-v6.h     |    2 +
+ drivers/phy/qualcomm/phy-qcom-qmp-pcs-v6_20.h      |    1 +
+ drivers/phy/qualcomm/phy-qcom-qmp-qserdes-com-v6.h |    2 +
+ .../qualcomm/phy-qcom-qmp-qserdes-txrx-ufs-v6.h    |    8 +
+ .../phy/qualcomm/phy-qcom-qmp-qserdes-txrx-v6_20.h |    2 +
+ drivers/phy/qualcomm/phy-qcom-qmp-ufs.c            |  242 +++--
+ drivers/phy/qualcomm/phy-qcom-qmp-usb-legacy.c     |   76 +-
+ drivers/phy/qualcomm/phy-qcom-qmp-usb.c            |  422 +------
+ drivers/phy/qualcomm/phy-qcom-qmp-usbc.c           | 1149 ++++++++++++++++++++
+ drivers/phy/qualcomm/phy-qcom-qmp.h                |  101 +-
+ drivers/phy/qualcomm/phy-qcom-sgmii-eth.c          |  417 +++----
+ 31 files changed, 2198 insertions(+), 1146 deletions(-)
+ create mode 100644 Documentation/devicetree/bindings/phy/qcom,msm8998-qmp-usb3-phy.yaml
+ create mode 100644 drivers/phy/qualcomm/phy-qcom-qmp-common.h
+ create mode 100644 drivers/phy/qualcomm/phy-qcom-qmp-dp-com-v3.h
+ create mode 100644 drivers/phy/qualcomm/phy-qcom-qmp-dp-phy-v3.h
+ create mode 100644 drivers/phy/qualcomm/phy-qcom-qmp-dp-phy-v4.h
+ create mode 100644 drivers/phy/qualcomm/phy-qcom-qmp-dp-phy-v5.h
+ create mode 100644 drivers/phy/qualcomm/phy-qcom-qmp-dp-phy-v6.h
+ create mode 100644 drivers/phy/qualcomm/phy-qcom-qmp-dp-phy.h
+ create mode 100644 drivers/phy/qualcomm/phy-qcom-qmp-pcs-sgmii.h
+ create mode 100644 drivers/phy/qualcomm/phy-qcom-qmp-usbc.c
+Merging soundwire/next (0707496ff4e4 soundwire: stream: add missing const to Documentation)
+$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/vkoul/soundwire.git soundwire/next
+Merge made by the 'ort' strategy.
+ Documentation/driver-api/soundwire/stream.rst | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+Merging extcon/extcon-next (7803680964c0 extcon: qcom-spmi-misc: don't use kernel-doc marker for comment)
+$ git merge -m Merge branch 'extcon-next' of git://git.kernel.org/pub/scm/linux/kernel/git/chanwoo/extcon.git extcon/extcon-next
+Already up to date.
+Merging gnss/gnss-next (41bccc98fb79 Linux 6.8-rc2)
+$ git merge -m Merge branch 'gnss-next' of git://git.kernel.org/pub/scm/linux/kernel/git/johan/gnss.git gnss/gnss-next
+Already up to date.
+Merging vfio/next (78f70c02bdbc vfio/virtio: fix virtio-pci dependency)
+$ git merge -m Merge branch 'next' of git://github.com/awilliam/linux-vfio.git vfio/next
+Already up to date.
+Merging w1/for-next (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-w1.git w1/for-next
+Already up to date.
+Merging spmi/spmi-next (b85ea95d0864 Linux 6.7-rc1)
+$ git merge -m Merge branch 'spmi-next' of git://git.kernel.org/pub/scm/linux/kernel/git/sboyd/spmi.git spmi/spmi-next
+Already up to date.
+Merging staging/staging-next (ce54e9342124 staging: Remove board staging code)
+$ git merge -m Merge branch 'staging-next' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/staging.git staging/staging-next
+Merge made by the 'ort' strategy.
+ drivers/staging/Kconfig                           |    4 -
+ drivers/staging/Makefile                          |    2 -
+ drivers/staging/board/Kconfig                     |   12 -
+ drivers/staging/board/Makefile                    |    4 -
+ drivers/staging/board/TODO                        |    2 -
+ drivers/staging/board/armadillo800eva.c           |   88 -
+ drivers/staging/board/board.c                     |  204 --
+ drivers/staging/board/board.h                     |   46 -
+ drivers/staging/board/kzm9d.c                     |   26 -
+ drivers/staging/emxx_udc/Kconfig                  |   11 -
+ drivers/staging/emxx_udc/Makefile                 |    2 -
+ drivers/staging/emxx_udc/TODO                     |    6 -
+ drivers/staging/emxx_udc/emxx_udc.c               | 3223 ---------------------
+ drivers/staging/emxx_udc/emxx_udc.h               |  554 ----
+ drivers/staging/fieldbus/anybuss/arcx-anybus.c    |    6 +-
+ drivers/staging/fieldbus/dev_core.c               |    6 +-
+ drivers/staging/greybus/audio_manager.c           |    8 +-
+ drivers/staging/greybus/authentication.c          |    6 +-
+ drivers/staging/greybus/fw-download.c             |    7 +-
+ drivers/staging/greybus/fw-management.c           |   20 +-
+ drivers/staging/greybus/gbphy.c                   |    8 +-
+ drivers/staging/greybus/loopback.c                |    6 +-
+ drivers/staging/greybus/raw.c                     |    6 +-
+ drivers/staging/greybus/vibrator.c                |    6 +-
+ drivers/staging/rtl8192e/rtl8192e/r8192E_dev.c    |    8 +-
+ drivers/staging/rtl8192e/rtl8192e/rtl_core.c      |   33 +-
+ drivers/staging/rtl8192e/rtl8192e/rtl_wx.c        |    2 +-
+ drivers/staging/rtl8192e/rtl819x_TSProc.c         |    6 +-
+ drivers/staging/rtl8192e/rtllib.h                 |   26 +-
+ drivers/staging/rtl8192e/rtllib_rx.c              |    6 +-
+ drivers/staging/rtl8192e/rtllib_softmac.c         |   86 +-
+ drivers/staging/rtl8192e/rtllib_tx.c              |   10 +-
+ drivers/staging/rtl8192e/rtllib_wx.c              |    2 +-
+ drivers/staging/rtl8723bs/core/rtw_ieee80211.c    |    4 +-
+ drivers/staging/rtl8723bs/core/rtw_sta_mgt.c      |    3 +-
+ drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c |    3 +-
+ drivers/staging/vt6655/card.c                     |   74 +-
+ drivers/staging/vt6655/rxtx.h                     |    1 -
+ 38 files changed, 168 insertions(+), 4359 deletions(-)
+ delete mode 100644 drivers/staging/board/Kconfig
+ delete mode 100644 drivers/staging/board/Makefile
+ delete mode 100644 drivers/staging/board/TODO
+ delete mode 100644 drivers/staging/board/armadillo800eva.c
+ delete mode 100644 drivers/staging/board/board.c
+ delete mode 100644 drivers/staging/board/board.h
+ delete mode 100644 drivers/staging/board/kzm9d.c
+ delete mode 100644 drivers/staging/emxx_udc/Kconfig
+ delete mode 100644 drivers/staging/emxx_udc/Makefile
+ delete mode 100644 drivers/staging/emxx_udc/TODO
+ delete mode 100644 drivers/staging/emxx_udc/emxx_udc.c
+ delete mode 100644 drivers/staging/emxx_udc/emxx_udc.h
+Merging counter-next/counter-next (0b3bbd8f9baf counter: linux/counter.h: fix Excess kernel-doc description warning)
+$ git merge -m Merge branch 'counter-next' of git://git.kernel.org/pub/scm/linux/kernel/git/wbg/counter.git counter-next/counter-next
+Merge made by the 'ort' strategy.
+ include/linux/counter.h | 1 -
+ 1 file changed, 1 deletion(-)
+Merging mux/for-next (44c026a73be8 Linux 6.4-rc3)
+$ git merge -m Merge branch 'for-next' of https://gitlab.com/peda-linux/mux.git mux/for-next
+Already up to date.
+Merging dmaengine/next (93bdff7bb83a dmaengine: ti: k3-psil-j721s2: Add entry for CSI2RX)
+$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/vkoul/dmaengine.git dmaengine/next
+Merge made by the 'ort' strategy.
+ .../bindings/dma/allwinner,sun50i-a64-dma.yaml     |  12 +-
+ drivers/dma/Kconfig                                |  14 +-
+ drivers/dma/bestcomm/sram.c                        |   5 -
+ drivers/dma/pl330.c                                |   1 +
+ drivers/dma/ti/k3-psil-j721s2.c                    |  73 +++++
+ drivers/dma/ti/k3-udma-glue.c                      | 314 +++++++++++++++------
+ drivers/dma/xilinx/xilinx_dma.c                    |   6 +
+ include/linux/dma/k3-udma-glue.h                   |  10 +
+ 8 files changed, 328 insertions(+), 107 deletions(-)
+Merging cgroup/for-next (8d4c171f451d docs: cgroup-v1: add missing code-block tags)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git cgroup/for-next
+Merge made by the 'ort' strategy.
+ Documentation/admin-guide/cgroup-v1/hugetlb.rst | 20 ++++++++++++--------
+ 1 file changed, 12 insertions(+), 8 deletions(-)
+Merging scsi/for-next (890d900e7fec Merge branch 'misc' into for-next)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi.git scsi/for-next
+Merge made by the 'ort' strategy.
+ drivers/message/fusion/mptfc.c           |   4 +-
+ drivers/scsi/fnic/fnic_scsi.c            |   4 +-
+ drivers/scsi/hisi_sas/hisi_sas_main.c    |  26 +++++--
+ drivers/scsi/hisi_sas/hisi_sas_v3_hw.c   |   8 ++-
+ drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c |  18 ++---
+ drivers/scsi/megaraid.c                  |   2 +-
+ drivers/scsi/mpi3mr/mpi3mr_os.c          |  12 +++-
+ drivers/scsi/mpt3sas/mpt3sas_base.c      | 113 ++++++++++++++++++++-----------
+ drivers/scsi/mpt3sas/mpt3sas_base.h      |   8 ++-
+ drivers/scsi/mpt3sas/mpt3sas_ctl.c       |  54 +++++++++++++++
+ drivers/scsi/mpt3sas/mpt3sas_ctl.h       |  10 +++
+ drivers/scsi/mpt3sas/mpt3sas_scsih.c     |   1 +
+ drivers/scsi/scsi_devinfo.c              |   6 +-
+ drivers/scsi/sd.c                        |   2 +-
+ drivers/ufs/core/ufs-mcq.c               |  12 ++--
+ drivers/ufs/core/ufs-sysfs.c             |  49 ++++++++++++++
+ drivers/ufs/core/ufshcd.c                |  68 ++++++++++++++++---
+ drivers/ufs/host/ufs-mediatek.c          |  90 ++++++++++++++++--------
+ drivers/ufs/host/ufs-mediatek.h          |   7 +-
+ drivers/ufs/host/ufs-qcom.c              |  28 ++++++--
+ include/scsi/scsi_host.h                 |   6 +-
+ include/ufs/ufshcd.h                     |   7 ++
+ include/ufs/ufshci.h                     |   3 +
+ 23 files changed, 412 insertions(+), 126 deletions(-)
+Merging scsi-mkp/for-next (3f90ac7138ed Merge patch series "scsi: Allow scsi_execute users to request retries")
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/mkp/scsi.git scsi-mkp/for-next
+Auto-merging drivers/scsi/scsi_lib.c
+Merge made by the 'ort' strategy.
+ drivers/scsi/3w-9xxx.c                      |  44 ++--
+ drivers/scsi/3w-sas.c                       |  36 +--
+ drivers/scsi/3w-xxxx.c                      |  44 ++--
+ drivers/scsi/53c700.c                       |   2 +-
+ drivers/scsi/Kconfig                        |   9 +
+ drivers/scsi/aacraid/aachba.c               |   6 +-
+ drivers/scsi/ch.c                           |  27 ++-
+ drivers/scsi/device_handler/scsi_dh_hp_sw.c |  49 +++--
+ drivers/scsi/device_handler/scsi_dh_rdac.c  |  84 +++----
+ drivers/scsi/fnic/fnic_attrs.c              |   7 +-
+ drivers/scsi/ibmvscsi/ibmvfc.c              |  22 +-
+ drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c    |   6 +-
+ drivers/scsi/isci/init.c                    |   2 +-
+ drivers/scsi/pm8001/pm8001_ctl.c            |   6 +-
+ drivers/scsi/scsi_lib.c                     | 124 ++++++++++-
+ drivers/scsi/scsi_lib_test.c                | 330 ++++++++++++++++++++++++++++
+ drivers/scsi/scsi_scan.c                    | 105 +++++----
+ drivers/scsi/scsi_transport_spi.c           |  35 +--
+ drivers/scsi/sd.c                           | 218 +++++++++++-------
+ drivers/scsi/ses.c                          |  66 ++++--
+ drivers/scsi/sr.c                           |  38 ++--
+ drivers/ufs/core/ufshcd.c                   |  22 +-
+ include/scsi/scsi_device.h                  |  48 ++++
+ 23 files changed, 979 insertions(+), 351 deletions(-)
+ create mode 100644 drivers/scsi/scsi_lib_test.c
+Merging vhost/linux-next (f16d65124380 vdpa/mlx5: Add mkey leak detection)
+$ git merge -m Merge branch 'linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git vhost/linux-next
+Already up to date.
+Merging rpmsg/for-next (99f59b148871 Merge branches 'rpmsg-next' and 'rproc-next' into for-next)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/remoteproc/linux.git rpmsg/for-next
+Merge made by the 'ort' strategy.
+ drivers/remoteproc/remoteproc_virtio.c |  6 +++---
+ drivers/remoteproc/stm32_rproc.c       |  6 +++---
+ drivers/rpmsg/rpmsg_char.c             | 12 ++++++------
+ drivers/rpmsg/rpmsg_ctrl.c             | 12 ++++++------
+ 4 files changed, 18 insertions(+), 18 deletions(-)
+Merging gpio/for-next (0bb80ecc33a8 Linux 6.6-rc1)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-gpio.git gpio/for-next
+Already up to date.
+Merging gpio-brgl/gpio/for-next (6933ba529d06 gpio: improve the API contract for setting direction)
+$ git merge -m Merge branch 'gpio/for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git gpio-brgl/gpio/for-next
+Auto-merging Documentation/userspace-api/index.rst
+CONFLICT (content): Merge conflict in Documentation/userspace-api/index.rst
+Auto-merging MAINTAINERS
+Auto-merging drivers/gpio/gpio-eic-sprd.c
+Resolved 'Documentation/userspace-api/index.rst' using previous resolution.
+Automatic merge failed; fix conflicts and then commit the result.
+$ git commit --no-edit -v -a
+[master 8b887b8afdb2] Merge branch 'gpio/for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git
+$ git diff -M --stat --summary HEAD^..
+ Documentation/ABI/obsolete/sysfs-gpio              |   4 +-
+ Documentation/ABI/testing/gpio-cdev                |   9 +-
+ Documentation/admin-guide/gpio/gpio-mockup.rst     |   8 ++
+ Documentation/admin-guide/gpio/index.rst           |   6 +-
+ Documentation/admin-guide/gpio/obsolete.rst        |  13 ++
+ Documentation/userspace-api/gpio/chardev.rst       | 116 ++++++++++++++++
+ Documentation/userspace-api/gpio/chardev_v1.rst    | 131 ++++++++++++++++++
+ Documentation/userspace-api/gpio/error-codes.rst   |  79 +++++++++++
+ .../userspace-api/gpio/gpio-get-chipinfo-ioctl.rst |  41 ++++++
+ .../gpio/gpio-get-lineevent-ioctl.rst              |  84 ++++++++++++
+ .../gpio/gpio-get-linehandle-ioctl.rst             | 125 +++++++++++++++++
+ .../userspace-api/gpio/gpio-get-lineinfo-ioctl.rst |  54 ++++++++
+ .../gpio/gpio-get-lineinfo-unwatch-ioctl.rst       |  49 +++++++
+ .../gpio/gpio-get-lineinfo-watch-ioctl.rst         |  74 ++++++++++
+ .../gpio/gpio-handle-get-line-values-ioctl.rst     |  56 ++++++++
+ .../gpio/gpio-handle-set-config-ioctl.rst          |  63 +++++++++
+ .../gpio/gpio-handle-set-line-values-ioctl.rst     |  48 +++++++
+ .../gpio/gpio-lineevent-data-read.rst              |  84 ++++++++++++
+ .../gpio/gpio-lineinfo-changed-read.rst            |  87 ++++++++++++
+ .../userspace-api/gpio/gpio-v2-get-line-ioctl.rst  | 152 +++++++++++++++++++++
+ .../gpio/gpio-v2-get-lineinfo-ioctl.rst            |  50 +++++++
+ .../gpio/gpio-v2-get-lineinfo-watch-ioctl.rst      |  67 +++++++++
+ .../userspace-api/gpio/gpio-v2-line-event-read.rst |  83 +++++++++++
+ .../gpio/gpio-v2-line-get-values-ioctl.rst         |  51 +++++++
+ .../gpio/gpio-v2-line-set-config-ioctl.rst         |  58 ++++++++
+ .../gpio/gpio-v2-line-set-values-ioctl.rst         |  47 +++++++
+ .../gpio/gpio-v2-lineinfo-changed-read.rst         |  81 +++++++++++
+ Documentation/userspace-api/gpio/index.rst         |  18 +++
+ Documentation/userspace-api/gpio/obsolete.rst      |  11 ++
+ .../{admin-guide => userspace-api}/gpio/sysfs.rst  |  27 ++--
+ Documentation/userspace-api/index.rst              |   1 +
+ MAINTAINERS                                        |   1 +
+ drivers/gpio/gpio-eic-sprd.c                       |  10 +-
+ drivers/gpio/gpiolib-cdev.c                        |   5 -
+ drivers/gpio/gpiolib-legacy.c                      |  12 ++
+ drivers/gpio/gpiolib-of.c                          |   2 +
+ drivers/gpio/gpiolib.c                             |  93 ++++++-------
+ include/linux/gpio/driver.h                        |  22 +--
+ include/uapi/linux/gpio.h                          |  52 +++----
+ 39 files changed, 1850 insertions(+), 124 deletions(-)
+ create mode 100644 Documentation/admin-guide/gpio/obsolete.rst
+ create mode 100644 Documentation/userspace-api/gpio/chardev.rst
+ create mode 100644 Documentation/userspace-api/gpio/chardev_v1.rst
+ create mode 100644 Documentation/userspace-api/gpio/error-codes.rst
+ create mode 100644 Documentation/userspace-api/gpio/gpio-get-chipinfo-ioctl.rst
+ create mode 100644 Documentation/userspace-api/gpio/gpio-get-lineevent-ioctl.rst
+ create mode 100644 Documentation/userspace-api/gpio/gpio-get-linehandle-ioctl.rst
+ create mode 100644 Documentation/userspace-api/gpio/gpio-get-lineinfo-ioctl.rst
+ create mode 100644 Documentation/userspace-api/gpio/gpio-get-lineinfo-unwatch-ioctl.rst
+ create mode 100644 Documentation/userspace-api/gpio/gpio-get-lineinfo-watch-ioctl.rst
+ create mode 100644 Documentation/userspace-api/gpio/gpio-handle-get-line-values-ioctl.rst
+ create mode 100644 Documentation/userspace-api/gpio/gpio-handle-set-config-ioctl.rst
+ create mode 100644 Documentation/userspace-api/gpio/gpio-handle-set-line-values-ioctl.rst
+ create mode 100644 Documentation/userspace-api/gpio/gpio-lineevent-data-read.rst
+ create mode 100644 Documentation/userspace-api/gpio/gpio-lineinfo-changed-read.rst
+ create mode 100644 Documentation/userspace-api/gpio/gpio-v2-get-line-ioctl.rst
+ create mode 100644 Documentation/userspace-api/gpio/gpio-v2-get-lineinfo-ioctl.rst
+ create mode 100644 Documentation/userspace-api/gpio/gpio-v2-get-lineinfo-watch-ioctl.rst
+ create mode 100644 Documentation/userspace-api/gpio/gpio-v2-line-event-read.rst
+ create mode 100644 Documentation/userspace-api/gpio/gpio-v2-line-get-values-ioctl.rst
+ create mode 100644 Documentation/userspace-api/gpio/gpio-v2-line-set-config-ioctl.rst
+ create mode 100644 Documentation/userspace-api/gpio/gpio-v2-line-set-values-ioctl.rst
+ create mode 100644 Documentation/userspace-api/gpio/gpio-v2-lineinfo-changed-read.rst
+ create mode 100644 Documentation/userspace-api/gpio/index.rst
+ create mode 100644 Documentation/userspace-api/gpio/obsolete.rst
+ rename Documentation/{admin-guide => userspace-api}/gpio/sysfs.rst (89%)
+Merging gpio-intel/for-next (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/andy/linux-gpio-intel.git gpio-intel/for-next
+Already up to date.
+Merging pinctrl/for-next (47eed1127d2a dt-bindings: pinctrl: amlogic: narrow regex for unit address to hex numbers)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-pinctrl.git pinctrl/for-next
+Merge made by the 'ort' strategy.
+ .../bindings/pinctrl/amlogic,meson-pinctrl-a1.yaml |  2 +-
+ .../pinctrl/amlogic,meson-pinctrl-g12a-aobus.yaml  |  2 +-
+ .../amlogic,meson-pinctrl-g12a-periphs.yaml        |  2 +-
+ .../pinctrl/amlogic,meson8-pinctrl-aobus.yaml      |  2 +-
+ .../pinctrl/amlogic,meson8-pinctrl-cbus.yaml       |  2 +-
+ drivers/pinctrl/mediatek/pinctrl-mt7981.c          | 24 ++++++++++++++++++++--
+ drivers/pinctrl/mediatek/pinctrl-mt7986.c          |  2 +-
+ drivers/pinctrl/pinctrl-st.c                       |  3 +--
+ drivers/pinctrl/pinctrl-zynqmp.c                   |  8 ++++----
+ 9 files changed, 33 insertions(+), 14 deletions(-)
+Merging pinctrl-intel/for-next (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/pinctrl/intel.git pinctrl-intel/for-next
+Already up to date.
+Merging pinctrl-renesas/renesas-pinctrl (fea58424e252 pinctrl: renesas: pinctrl-rzg2l: Add the missing port pins P19 to P28)
+$ git merge -m Merge branch 'renesas-pinctrl' of git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-drivers.git pinctrl-renesas/renesas-pinctrl
+Merge made by the 'ort' strategy.
+ arch/riscv/boot/dts/renesas/r9a07g043f.dtsi |   4 +
+ drivers/pinctrl/renesas/core.c              |   4 +-
+ drivers/pinctrl/renesas/pfc-r8a779g0.c      |  14 ++
+ drivers/pinctrl/renesas/pinctrl-rzg2l.c     | 309 +++++++++++++++++++++++-----
+ 4 files changed, 277 insertions(+), 54 deletions(-)
+Merging pinctrl-samsung/for-next (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/pinctrl/samsung.git pinctrl-samsung/for-next
+Already up to date.
+Merging pwm/pwm/for-next (979c6fe7e799 dt-bindings: pxa-pwm: Convert to YAML)
+$ git merge -m Merge branch 'pwm/for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/ukleinek/linux.git pwm/pwm/for-next
+Merge made by the 'ort' strategy.
+ .../devicetree/bindings/pwm/marvell,pxa-pwm.yaml   | 51 ++++++++++++++++++++++
+ Documentation/devicetree/bindings/pwm/pxa-pwm.txt  | 30 -------------
+ drivers/gpu/drm/bridge/ti-sn65dsi86.c              |  1 -
+ drivers/pwm/core.c                                 | 45 +++++--------------
+ drivers/pwm/pwm-atmel-hlcdc.c                      |  2 +-
+ drivers/pwm/pwm-clps711x.c                         | 11 -----
+ drivers/pwm/pwm-cros-ec.c                          |  1 -
+ drivers/pwm/pwm-mediatek.c                         |  1 -
+ drivers/pwm/pwm-pxa.c                              |  4 +-
+ include/linux/pwm.h                                |  2 -
+ 10 files changed, 65 insertions(+), 83 deletions(-)
+ create mode 100644 Documentation/devicetree/bindings/pwm/marvell,pxa-pwm.yaml
+ delete mode 100644 Documentation/devicetree/bindings/pwm/pxa-pwm.txt
+Merging ktest/for-next (7dc8e24f0e09 ktest: Restore stty setting at first in dodie)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-ktest.git ktest/for-next
+Already up to date.
+Merging kselftest/next (6a71770442b5 selftests: livepatch: Test livepatching a heavily called syscall)
+$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git kselftest/next
+Auto-merging Documentation/dev-tools/kselftest.rst
+Auto-merging MAINTAINERS
+Auto-merging lib/Kconfig.debug
+Auto-merging tools/testing/selftests/lib.mk
+Auto-merging tools/testing/selftests/livepatch/functions.sh
+Merge made by the 'ort' strategy.
+ Documentation/dev-tools/kselftest.rst              |   4 +
+ MAINTAINERS                                        |   1 -
+ arch/s390/configs/debug_defconfig                  |   1 -
+ arch/s390/configs/defconfig                        |   1 -
+ lib/Kconfig.debug                                  |  22 ----
+ lib/Makefile                                       |   2 -
+ lib/livepatch/Makefile                             |  14 ---
+ tools/testing/selftests/lib.mk                     |  26 ++++-
+ tools/testing/selftests/livepatch/Makefile         |   5 +-
+ tools/testing/selftests/livepatch/README           |  25 +++--
+ tools/testing/selftests/livepatch/config           |   1 -
+ tools/testing/selftests/livepatch/functions.sh     |  34 +++---
+ .../testing/selftests/livepatch/test-callbacks.sh  |  50 ++++-----
+ tools/testing/selftests/livepatch/test-ftrace.sh   |   6 +-
+ .../testing/selftests/livepatch/test-livepatch.sh  |  10 +-
+ .../selftests/livepatch/test-shadow-vars.sh        |   2 +-
+ tools/testing/selftests/livepatch/test-state.sh    |  18 ++--
+ tools/testing/selftests/livepatch/test-syscall.sh  |  53 ++++++++++
+ tools/testing/selftests/livepatch/test-sysfs.sh    |   6 +-
+ .../selftests/livepatch/test_klp-call_getpid.c     |  44 ++++++++
+ .../selftests/livepatch/test_modules/Makefile      |  20 ++++
+ .../test_modules}/test_klp_atomic_replace.c        |   0
+ .../test_modules}/test_klp_callbacks_busy.c        |   0
+ .../test_modules}/test_klp_callbacks_demo.c        |   0
+ .../test_modules}/test_klp_callbacks_demo2.c       |   0
+ .../test_modules}/test_klp_callbacks_mod.c         |   0
+ .../livepatch/test_modules}/test_klp_livepatch.c   |   0
+ .../livepatch/test_modules}/test_klp_shadow_vars.c |   0
+ .../livepatch/test_modules}/test_klp_state.c       |   0
+ .../livepatch/test_modules}/test_klp_state2.c      |   0
+ .../livepatch/test_modules}/test_klp_state3.c      |   0
+ .../livepatch/test_modules/test_klp_syscall.c      | 116 +++++++++++++++++++++
+ 32 files changed, 340 insertions(+), 121 deletions(-)
+ delete mode 100644 lib/livepatch/Makefile
+ create mode 100755 tools/testing/selftests/livepatch/test-syscall.sh
+ create mode 100644 tools/testing/selftests/livepatch/test_klp-call_getpid.c
+ create mode 100644 tools/testing/selftests/livepatch/test_modules/Makefile
+ rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_atomic_replace.c (100%)
+ rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_callbacks_busy.c (100%)
+ rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_callbacks_demo.c (100%)
+ rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_callbacks_demo2.c (100%)
+ rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_callbacks_mod.c (100%)
+ rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_livepatch.c (100%)
+ rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_shadow_vars.c (100%)
+ rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_state.c (100%)
+ rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_state2.c (100%)
+ rename {lib/livepatch => tools/testing/selftests/livepatch/test_modules}/test_klp_state3.c (100%)
+ create mode 100644 tools/testing/selftests/livepatch/test_modules/test_klp_syscall.c
+Merging kunit/test (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'test' of git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git kunit/test
+Already up to date.
+Merging kunit-next/kunit (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'kunit' of git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git kunit-next/kunit
+Already up to date.
+Merging livepatching/for-next (602bf1830798 Merge branch 'for-6.7' into for-next)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/livepatching/livepatching livepatching/for-next
+Merge made by the 'ort' strategy.
+Merging rtc/rtc-next (14688f1a91e1 rtc: nuvoton: Compatible with NCT3015Y-R and NCT3018Y-R)
+$ git merge -m Merge branch 'rtc-next' of git://git.kernel.org/pub/scm/linux/kernel/git/abelloni/linux.git rtc/rtc-next
+Already up to date.
+Merging nvdimm/libnvdimm-for-next (a085a5eb6594 acpi/nfit: Use sysfs_emit() for all attributes)
+$ git merge -m Merge branch 'libnvdimm-for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm.git nvdimm/libnvdimm-for-next
+Already up to date.
+Merging at24/at24/for-next (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'at24/for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git at24/at24/for-next
+Already up to date.
+Merging ntb/ntb-next (9341b37ec17a ntb_perf: Fix printk format)
+$ git merge -m Merge branch 'ntb-next' of https://github.com/jonmason/ntb.git ntb/ntb-next
+Merge made by the 'ort' strategy.
+ drivers/ntb/hw/intel/ntb_hw_gen1.c | 2 +-
+ drivers/ntb/test/ntb_perf.c        | 2 +-
+ 2 files changed, 2 insertions(+), 2 deletions(-)
+Merging seccomp/for-next/seccomp (0c6f28a84431 selftests/seccomp: user_notification_addfd check nextfd is available)
+$ git merge -m Merge branch 'for-next/seccomp' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git seccomp/for-next/seccomp
+Merge made by the 'ort' strategy.
+ tools/testing/selftests/seccomp/seccomp_bpf.c | 41 ++++++++++++++++++++-------
+ 1 file changed, 31 insertions(+), 10 deletions(-)
+Merging fsi/next (c5eeb63edac9 fsi: Fix panic on scom file read)
+$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/joel/fsi.git fsi/next
+Merge made by the 'ort' strategy.
+ drivers/fsi/fsi-sbefifo.c |  9 ++++++++-
+ drivers/fsi/i2cr-scom.c   | 11 ++++++++++-
+ 2 files changed, 18 insertions(+), 2 deletions(-)
+Merging slimbus/for-next (04b945e4cf81 slimbus: qcom-ngd-ctrl: Make QMI message rules const)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/srini/slimbus.git slimbus/for-next
+Merge made by the 'ort' strategy.
+ drivers/slimbus/qcom-ngd-ctrl.c | 8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+Merging nvmem/for-next (a0cfd5e99782 dt-bindings: nvmem: Convert xlnx,zynqmp-nvmem.txt to yaml)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/srini/nvmem.git nvmem/for-next
+Merge made by the 'ort' strategy.
+ Documentation/ABI/testing/sysfs-nvmem-cells        | 16 ++++----
+ .../bindings/nvmem/layouts/fixed-cell.yaml         | 22 +++++------
+ .../bindings/nvmem/xlnx,zynqmp-nvmem.txt           | 46 ----------------------
+ .../bindings/nvmem/xlnx,zynqmp-nvmem.yaml          | 42 ++++++++++++++++++++
+ drivers/nvmem/core.c                               |  5 ++-
+ drivers/nvmem/mtk-efuse.c                          | 21 +++++++++-
+ 6 files changed, 83 insertions(+), 69 deletions(-)
+ delete mode 100644 Documentation/devicetree/bindings/nvmem/xlnx,zynqmp-nvmem.txt
+ create mode 100644 Documentation/devicetree/bindings/nvmem/xlnx,zynqmp-nvmem.yaml
+Merging xarray/main (2a15de80dd0f idr: fix param name in idr_alloc_cyclic() doc)
+$ git merge -m Merge branch 'main' of git://git.infradead.org/users/willy/xarray.git xarray/main
+Already up to date.
+Merging hyperv/hyperv-next (ce9ecca0238b Linux 6.6-rc2)
+$ git merge -m Merge branch 'hyperv-next' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux.git hyperv/hyperv-next
+Already up to date.
+Merging auxdisplay/auxdisplay (c52391fafcef auxdisplay: img-ascii-lcd: Use device_get_match_data())
+$ git merge -m Merge branch 'auxdisplay' of https://github.com/ojeda/linux.git auxdisplay/auxdisplay
+Already up to date.
+Merging kgdb/kgdb/for-next (4f41d30cd6dc kdb: Fix a potential buffer overflow in kdb_local())
+$ git merge -m Merge branch 'kgdb/for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/danielt/linux.git kgdb/kgdb/for-next
+Already up to date.
+Merging hmm/hmm (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git hmm/hmm
+Already up to date.
+Merging cfi/cfi/next (06c2afb862f9 Linux 6.5-rc1)
+$ git merge -m Merge branch 'cfi/next' of git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git cfi/cfi/next
+Already up to date.
+Merging mhi/mhi-next (8ddf54a32111 bus: mhi: host: Read PK HASH dynamically)
+$ git merge -m Merge branch 'mhi-next' of git://git.kernel.org/pub/scm/linux/kernel/git/mani/mhi.git mhi/mhi-next
+Merge made by the 'ort' strategy.
+ drivers/bus/mhi/host/boot.c     | 11 +----------
+ drivers/bus/mhi/host/init.c     | 17 +++++++++++++----
+ drivers/bus/mhi/host/internal.h |  9 ++++++---
+ drivers/bus/mhi/host/pm.c       | 20 +++++++++++++++++---
+ include/linux/mhi.h             |  2 --
+ 5 files changed, 37 insertions(+), 22 deletions(-)
+Merging memblock/for-next (2159bd4e9057 memblock: Return NUMA_NO_NODE instead of -1 to improve code readability)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/rppt/memblock.git memblock/for-next
+Already up to date.
+Merging cxl/next (73bf93edeeea cxl/core: use sysfs_emit() for attr's _show())
+$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl.git cxl/next
+Already up to date.
+Merging zstd/zstd-next (3f832dfb8a8e zstd: fix g_debuglevel export warning)
+$ git merge -m Merge branch 'zstd-next' of https://github.com/terrelln/linux.git zstd/zstd-next
+Merge made by the 'ort' strategy.
+ include/linux/zstd.h                           |    2 +-
+ include/linux/zstd_errors.h                    |   23 +-
+ include/linux/zstd_lib.h                       |  697 ++++++++--
+ lib/zstd/Makefile                              |    2 +-
+ lib/zstd/common/allocations.h                  |   56 +
+ lib/zstd/common/bits.h                         |  149 ++
+ lib/zstd/common/bitstream.h                    |   53 +-
+ lib/zstd/common/compiler.h                     |   14 +-
+ lib/zstd/common/cpu.h                          |    3 +-
+ lib/zstd/common/debug.c                        |    5 +-
+ lib/zstd/common/debug.h                        |    3 +-
+ lib/zstd/common/entropy_common.c               |   42 +-
+ lib/zstd/common/error_private.c                |   12 +-
+ lib/zstd/common/error_private.h                |    3 +-
+ lib/zstd/common/fse.h                          |   89 +-
+ lib/zstd/common/fse_decompress.c               |   94 +-
+ lib/zstd/common/huf.h                          |  234 +---
+ lib/zstd/common/mem.h                          |    2 +-
+ lib/zstd/common/portability_macros.h           |   26 +-
+ lib/zstd/common/zstd_common.c                  |   38 +-
+ lib/zstd/common/zstd_deps.h                    |   16 +-
+ lib/zstd/common/zstd_internal.h                |   99 +-
+ lib/zstd/compress/clevels.h                    |    3 +-
+ lib/zstd/compress/fse_compress.c               |   59 +-
+ lib/zstd/compress/hist.c                       |    3 +-
+ lib/zstd/compress/hist.h                       |    3 +-
+ lib/zstd/compress/huf_compress.c               |  372 +++--
+ lib/zstd/compress/zstd_compress.c              | 1758 +++++++++++++++++-------
+ lib/zstd/compress/zstd_compress_internal.h     |  333 +++--
+ lib/zstd/compress/zstd_compress_literals.c     |  155 ++-
+ lib/zstd/compress/zstd_compress_literals.h     |   25 +-
+ lib/zstd/compress/zstd_compress_sequences.c    |    7 +-
+ lib/zstd/compress/zstd_compress_sequences.h    |    3 +-
+ lib/zstd/compress/zstd_compress_superblock.c   |   47 +-
+ lib/zstd/compress/zstd_compress_superblock.h   |    3 +-
+ lib/zstd/compress/zstd_cwksp.h                 |  149 +-
+ lib/zstd/compress/zstd_double_fast.c           |  129 +-
+ lib/zstd/compress/zstd_double_fast.h           |    6 +-
+ lib/zstd/compress/zstd_fast.c                  |  578 ++++++--
+ lib/zstd/compress/zstd_fast.h                  |    6 +-
+ lib/zstd/compress/zstd_lazy.c                  |  518 +++----
+ lib/zstd/compress/zstd_lazy.h                  |    7 +-
+ lib/zstd/compress/zstd_ldm.c                   |   11 +-
+ lib/zstd/compress/zstd_ldm.h                   |    3 +-
+ lib/zstd/compress/zstd_ldm_geartab.h           |    3 +-
+ lib/zstd/compress/zstd_opt.c                   |  187 +--
+ lib/zstd/compress/zstd_opt.h                   |    3 +-
+ lib/zstd/decompress/huf_decompress.c           |  772 +++++++----
+ lib/zstd/decompress/zstd_ddict.c               |    9 +-
+ lib/zstd/decompress/zstd_ddict.h               |    3 +-
+ lib/zstd/decompress/zstd_decompress.c          |  259 +++-
+ lib/zstd/decompress/zstd_decompress_block.c    |  283 ++--
+ lib/zstd/decompress/zstd_decompress_block.h    |    8 +-
+ lib/zstd/decompress/zstd_decompress_internal.h |    7 +-
+ lib/zstd/decompress_sources.h                  |    2 +-
+ lib/zstd/zstd_common_module.c                  |    5 +-
+ lib/zstd/zstd_compress_module.c                |    2 +-
+ lib/zstd/zstd_decompress_module.c              |    4 +-
+ 58 files changed, 4791 insertions(+), 2596 deletions(-)
+ create mode 100644 lib/zstd/common/allocations.h
+ create mode 100644 lib/zstd/common/bits.h
+Merging efi/next (4afa688d7141 efi: memmap: fix kernel-doc warnings)
+$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/efi/efi.git efi/next
+Already up to date.
+Merging unicode/for-next (367122c529f3 libfs: Attempt exact-match comparison first during casefolded lookup)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/krisman/unicode.git unicode/for-next
+Auto-merging include/linux/fs.h
+Merge made by the 'ort' strategy.
+ fs/libfs.c            | 40 +++++++++++++++++++++++-----------------
+ fs/overlayfs/params.c | 13 ++++++++++---
+ include/linux/fs.h    |  9 +++++++++
+ 3 files changed, 42 insertions(+), 20 deletions(-)
+Merging slab/slab/for-next (7d2ec24bd8a5 Merge branch 'slab/for-6.9/optimize-get-freelist' into slab/for-next)
+$ git merge -m Merge branch 'slab/for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab.git slab/slab/for-next
+Auto-merging Documentation/admin-guide/kernel-parameters.txt
+Auto-merging mm/slab_common.c
+Merge made by the 'ort' strategy.
+ Documentation/admin-guide/kernel-parameters.txt | 79 +++++++++++--------------
+ Documentation/mm/slub.rst                       | 60 +++++++++----------
+ drivers/misc/lkdtm/heap.c                       |  2 +-
+ mm/Kconfig.debug                                |  6 +-
+ mm/slab.h                                       |  6 +-
+ mm/slab_common.c                                | 17 +++---
+ mm/slub.c                                       | 75 +++++++++++------------
+ 7 files changed, 117 insertions(+), 128 deletions(-)
+Merging random/master (615d30064886 Merge tag 'trace-v6.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace)
+$ git merge -m Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/crng/random.git random/master
+Already up to date.
+Merging landlock/next (2f8bb71d737c landlock: Document IOCTL support)
+$ git merge -m Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/mic/linux.git landlock/next
+Merge made by the 'ort' strategy.
+ Documentation/userspace-api/landlock.rst     | 119 +++++-
+ include/uapi/linux/landlock.h                |  58 ++-
+ samples/landlock/sandboxer.c                 |  13 +-
+ security/landlock/.kunitconfig               |   4 +
+ security/landlock/Kconfig                    |  15 +
+ security/landlock/common.h                   |   2 +
+ security/landlock/fs.c                       | 410 ++++++++++++++++++++-
+ security/landlock/fs.h                       |   2 +
+ security/landlock/limits.h                   |  11 +-
+ security/landlock/ruleset.h                  |   2 +-
+ security/landlock/syscalls.c                 |  19 +-
+ tools/testing/kunit/configs/all_tests.config |   1 +
+ tools/testing/selftests/landlock/base_test.c |   2 +-
+ tools/testing/selftests/landlock/common.h    |  88 +++--
+ tools/testing/selftests/landlock/fs_test.c   | 520 ++++++++++++++++++++++++++-
+ tools/testing/selftests/landlock/net_test.c  |  13 +-
+ 16 files changed, 1187 insertions(+), 92 deletions(-)
+ create mode 100644 security/landlock/.kunitconfig
+Merging rust/rust-next (f090f0d0eea9 rust: sync: update integer types in CondVar)
+$ git merge -m Merge branch 'rust-next' of https://github.com/Rust-for-Linux/linux.git rust/rust-next
+Auto-merging Documentation/process/changes.rst
+CONFLICT (content): Merge conflict in Documentation/process/changes.rst
+Auto-merging scripts/min-tool-version.sh
+Resolved 'Documentation/process/changes.rst' using previous resolution.
+Automatic merge failed; fix conflicts and then commit the result.
+$ git commit --no-edit -v -a
+[master a5b16680f994] Merge branch 'rust-next' of https://github.com/Rust-for-Linux/linux.git
+$ git diff -M --stat --summary HEAD^..
+ Documentation/process/changes.rst |   2 +-
+ rust/alloc/alloc.rs               |   9 +++-
+ rust/alloc/boxed.rs               |  20 +++++---
+ rust/alloc/lib.rs                 |   7 +--
+ rust/alloc/raw_vec.rs             |  19 +++++--
+ rust/alloc/vec/mod.rs             |  16 +++---
+ rust/bindings/bindings_helper.h   |   1 +
+ rust/kernel/lib.rs                |   2 +-
+ rust/kernel/sync.rs               |   2 +-
+ rust/kernel/sync/condvar.rs       | 104 ++++++++++++++++++++++++++++++--------
+ rust/kernel/sync/lock.rs          |   4 +-
+ rust/kernel/task.rs               |  18 ++++++-
+ rust/kernel/time.rs               |  20 ++++++++
+ scripts/min-tool-version.sh       |   2 +-
+ 14 files changed, 177 insertions(+), 49 deletions(-)
+ create mode 100644 rust/kernel/time.rs
+Merging sysctl/sysctl-next (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'sysctl-next' of git://git.kernel.org/pub/scm/linux/kernel/git/sysctl/sysctl.git sysctl/sysctl-next
+Already up to date.
+Merging execve/for-next/execve (90383cc07895 exec: Distinguish in_execve from in_exec)
+$ git merge -m Merge branch 'for-next/execve' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git execve/for-next/execve
+Already up to date.
+Merging bitmap/bitmap-for-next (071ad962baf5 bitmap: Step down as a reviewer)
+$ git merge -m Merge branch 'bitmap-for-next' of https://github.com/norov/linux.git bitmap/bitmap-for-next
+Auto-merging MAINTAINERS
+Auto-merging arch/x86/kvm/hyperv.c
+Auto-merging drivers/block/null_blk/main.c
+Auto-merging drivers/infiniband/ulp/rtrs/rtrs-clt.c
+Auto-merging drivers/iommu/arm/arm-smmu/arm-smmu.h
+Auto-merging drivers/net/ethernet/sfc/rx_common.c
+Auto-merging drivers/net/wireless/realtek/rtw88/pci.c
+Auto-merging drivers/net/wireless/realtek/rtw89/pci.c
+Auto-merging drivers/pci/controller/pci-hyperv.c
+Auto-merging drivers/perf/arm_pmuv3.c
+Auto-merging drivers/scsi/mpi3mr/mpi3mr_os.c
+Auto-merging drivers/scsi/scsi_lib.c
+Auto-merging drivers/tty/nozomi.c
+Auto-merging drivers/tty/serial/sc16is7xx.c
+CONFLICT (content): Merge conflict in drivers/tty/serial/sc16is7xx.c
+Auto-merging drivers/usb/class/cdc-acm.c
+Auto-merging kernel/sched/sched.h
+Auto-merging kernel/watch_queue.c
+Auto-merging lib/sbitmap.c
+Auto-merging sound/pci/hda/hda_codec.c
+Resolved 'drivers/tty/serial/sc16is7xx.c' using previous resolution.
+Automatic merge failed; fix conflicts and then commit the result.
+$ git commit --no-edit -v -a
+[master ac451fd4abca] Merge branch 'bitmap-for-next' of https://github.com/norov/linux.git
+$ git diff -M --stat --summary HEAD^..
+ MAINTAINERS                                  |   1 -
+ arch/m68k/include/asm/mmu_context.h          |  11 +-
+ arch/microblaze/include/asm/mmu_context_mm.h |  11 +-
+ arch/mips/sgi-ip30/ip30-irq.c                |  12 +-
+ arch/powerpc/mm/book3s32/mmu_context.c       |  10 +-
+ arch/powerpc/platforms/pasemi/dma_lib.c      |  41 +---
+ arch/powerpc/platforms/powernv/pci-sriov.c   |  12 +-
+ arch/sh/boards/mach-x3proto/ilsel.c          |   4 +-
+ arch/sparc/kernel/pci_msi.c                  |   9 +-
+ arch/x86/kvm/hyperv.c                        |  36 ++--
+ drivers/block/null_blk/main.c                |  39 ++--
+ drivers/dma/idxd/perfmon.c                   |   8 +-
+ drivers/infiniband/ulp/rtrs/rtrs-clt.c       |  15 +-
+ drivers/iommu/arm/arm-smmu/arm-smmu.h        |  10 +-
+ drivers/iommu/msm_iommu.c                    |  18 +-
+ drivers/isdn/mISDN/core.c                    |   9 +-
+ drivers/media/radio/radio-shark.c            |   5 +-
+ drivers/media/radio/radio-shark2.c           |   5 +-
+ drivers/media/usb/cx231xx/cx231xx-cards.c    |  16 +-
+ drivers/media/usb/em28xx/em28xx-cards.c      |  37 ++--
+ drivers/net/ethernet/rocker/rocker_ofdpa.c   |  11 +-
+ drivers/net/ethernet/sfc/rx_common.c         |   4 +-
+ drivers/net/ethernet/sfc/siena/rx_common.c   |   4 +-
+ drivers/net/ethernet/sfc/siena/siena_sriov.c |  14 +-
+ drivers/net/wireless/ath/ath10k/snoc.c       |   9 +-
+ drivers/net/wireless/realtek/rtw88/pci.c     |   5 +-
+ drivers/net/wireless/realtek/rtw89/pci.c     |   5 +-
+ drivers/pci/controller/pci-hyperv.c          |   7 +-
+ drivers/perf/alibaba_uncore_drw_pmu.c        |  10 +-
+ drivers/perf/arm-cci.c                       |  24 +--
+ drivers/perf/arm-ccn.c                       |  10 +-
+ drivers/perf/arm_dmc620_pmu.c                |   9 +-
+ drivers/perf/arm_pmuv3.c                     |   8 +-
+ drivers/scsi/mpi3mr/mpi3mr_os.c              |  21 +-
+ drivers/scsi/qedi/qedi_main.c                |   9 +-
+ drivers/scsi/scsi_lib.c                      |   7 +-
+ drivers/tty/nozomi.c                         |   5 +-
+ drivers/usb/class/cdc-acm.c                  |   5 +-
+ include/linux/cpumask.h                      |  12 ++
+ include/linux/find.h                         | 301 ++++++++++++++++++++++++++-
+ kernel/sched/sched.h                         |  14 +-
+ kernel/watch_queue.c                         |   6 +-
+ lib/find_bit.c                               |  85 ++++++++
+ lib/sbitmap.c                                |  46 +---
+ lib/test_bitmap.c                            |  61 ++++++
+ net/bluetooth/cmtp/core.c                    |  10 +-
+ net/smc/smc_wr.c                             |  10 +-
+ sound/pci/hda/hda_codec.c                    |   7 +-
+ sound/usb/caiaq/audio.c                      |  13 +-
+ 49 files changed, 629 insertions(+), 412 deletions(-)
+Merging hte/for-next (b85ea95d0864 Linux 6.7-rc1)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/pateldipen1984/linux.git hte/for-next
+Already up to date.
+Merging kspp/for-next/kspp (34b82a2fb747 lkdtm/bugs: In lkdtm_HUNG_TASK() use BUG(), not BUG_ON(1))
+$ git merge -m Merge branch 'for-next/kspp' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git kspp/for-next/kspp
+Merge made by the 'ort' strategy.
+ drivers/misc/lkdtm/bugs.c |  3 ++-
+ drivers/misc/lkdtm/core.c | 22 ++++++++++++++--------
+ 2 files changed, 16 insertions(+), 9 deletions(-)
+Merging kspp-gustavo/for-next/kspp (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'for-next/kspp' of git://git.kernel.org/pub/scm/linux/kernel/git/gustavoars/linux.git kspp-gustavo/for-next/kspp
+Already up to date.
+Merging nolibc/nolibc (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'nolibc' of git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git nolibc/nolibc
+Already up to date.
+Merging tsm/tsm-next (f4738f56d1dc virt: tdx-guest: Add Quote generation support using TSM_REPORTS)
+$ git merge -m Merge branch 'tsm-next' of git://git.kernel.org/pub/scm/linux/kernel/git/djbw/linux tsm/tsm-next
+Already up to date.
+Merging iommufd/for-next (6613476e225e Linux 6.8-rc1)
+$ git merge -m Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd.git iommufd/for-next
+Already up to date.
+Merging header_cleanup/header_cleanup (5f4c01f1e3c7 spinlock: Fix failing build for PREEMPT_RT)
+$ git merge -m Merge branch 'header_cleanup' of https://evilpiepirate.org/git/bcachefs.git header_cleanup/header_cleanup
+Already up to date.
+$ git am -3 ../patches/0001-Revert-kasan-revert-eviction-of-stack-traces-in-gene.patch
+Applying: Revert "kasan: revert eviction of stack traces in generic mode"
+$ git am -3 ../patches/0002-Revert-stackdepot-use-variable-size-records-for-non-.patch
+Applying: Revert "stackdepot: use variable size records for non-evictable entries"
diff --git a/localversion-next b/localversion-next
new file mode 100644
index 00000000000000..6d8c883d7eb82d
--- /dev/null
+++ b/localversion-next
@@ -0,0 +1 @@
+-next-20240201

From bac17ffeaa5629eb9d06933574b3b7fc97358572 Mon Sep 17 00:00:00 2001
From: Rob Clark <robdclark@chromium.org>
Date: Tue, 2 Jan 2024 11:33:45 -0800
Subject: [PATCH 662/707] drm/msm/a7xx: Fix LLC typo

We'd miss actually activating LLC.

Signed-off-by: Rob Clark <robdclark@chromium.org>
---
 drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
index c0bc924cd3025d..4c80f336bbcd4a 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -2427,7 +2427,7 @@ static int a6xx_gmu_pm_resume(struct msm_gpu *gpu)
 
 	msm_devfreq_resume(gpu);
 
-	adreno_is_a7xx(adreno_gpu) ? a7xx_llc_activate : a6xx_llc_activate(a6xx_gpu);
+	adreno_is_a7xx(adreno_gpu) ? a7xx_llc_activate(a6xx_gpu) : a6xx_llc_activate(a6xx_gpu);
 
 	return ret;
 }

From 0c4dfbb410cc840470b981cb666dad4b9c56eb9f Mon Sep 17 00:00:00 2001
From: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Date: Thu, 18 Aug 2022 13:33:07 +0100
Subject: [PATCH 663/707] ASoC: q6apm-dai: fix period size to be algined to 64
 bytes

DSP expects the buffers to be aligned to 64bytes, so fix
the current sizes where there is a possiblity of getting an
unaligned buffers.

Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
(cherry picked from commit 910a758d0340cff90ddb997a94ea269e30180beb)
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
---
 sound/soc/qcom/qdsp6/q6apm-dai.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/sound/soc/qcom/qdsp6/q6apm-dai.c b/sound/soc/qcom/qdsp6/q6apm-dai.c
index 052e40cb38feca..9c6f33cd508a16 100644
--- a/sound/soc/qcom/qdsp6/q6apm-dai.c
+++ b/sound/soc/qcom/qdsp6/q6apm-dai.c
@@ -19,15 +19,17 @@
 #define DRV_NAME "q6apm-dai"
 
 #define PLAYBACK_MIN_NUM_PERIODS	2
-#define PLAYBACK_MAX_NUM_PERIODS	8
-#define PLAYBACK_MAX_PERIOD_SIZE	65536
-#define PLAYBACK_MIN_PERIOD_SIZE	128
+#define PLAYBACK_MAX_NUM_PERIODS	4
+#define PLAYBACK_MAX_PERIOD_SIZE	8192
+#define PLAYBACK_MAX_BUF_SIZE		65536
+#define PLAYBACK_MIN_PERIOD_SIZE	8192
 #define CAPTURE_MIN_NUM_PERIODS		2
-#define CAPTURE_MAX_NUM_PERIODS		8
-#define CAPTURE_MAX_PERIOD_SIZE		4096
-#define CAPTURE_MIN_PERIOD_SIZE		320
-#define BUFFER_BYTES_MAX (PLAYBACK_MAX_NUM_PERIODS * PLAYBACK_MAX_PERIOD_SIZE)
-#define BUFFER_BYTES_MIN (PLAYBACK_MIN_NUM_PERIODS * PLAYBACK_MIN_PERIOD_SIZE)
+#define CAPTURE_MAX_NUM_PERIODS		4
+#define CAPTURE_MAX_PERIOD_SIZE		8192
+#define CAPTURE_MAX_BUF_SIZE		65536
+#define CAPTURE_MIN_PERIOD_SIZE		8192
+#define BUFFER_BYTES_MAX PLAYBACK_MAX_BUF_SIZE
+#define BUFFER_BYTES_MIN (8192)
 #define COMPR_PLAYBACK_MAX_FRAGMENT_SIZE (128 * 1024)
 #define COMPR_PLAYBACK_MAX_NUM_FRAGMENTS (16 * 4)
 #define COMPR_PLAYBACK_MIN_FRAGMENT_SIZE (8 * 1024)

From 081a921f705bd000097fb53efe1db7effaef8fe5 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Thu, 1 Feb 2024 16:55:24 +0100
Subject: [PATCH 664/707] of: provide a cleanup helper for OF nodes

Allow to use __free() to automatically put references to OF nodes.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 include/linux/of.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/include/linux/of.h b/include/linux/of.h
index 331e05918f11f9..5462ed47f25bc1 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -11,6 +11,8 @@
  * Updates for SPARC64 by David S. Miller
  * Derived from PowerPC and Sparc prom.h files by Stephen Rothwell, IBM Corp.
  */
+
+#include <linux/cleanup.h>
 #include <linux/types.h>
 #include <linux/bitops.h>
 #include <linux/errno.h>
@@ -887,6 +889,8 @@ static inline const void *of_device_get_match_data(const struct device *dev)
 #define of_match_node(_matches, _node)	NULL
 #endif /* CONFIG_OF */
 
+DEFINE_FREE(of_node, struct device_node *, if (_T) of_node_put(_T))
+
 /* Default string compare functions, Allow arch asm/prom.h to override */
 #if !defined(of_compat_cmp)
 #define of_compat_cmp(s1, s2, l)	strcasecmp((s1), (s2))

From cf0d1217677bf37ba5be94fc2866be5d1e00eb99 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Thu, 1 Feb 2024 16:55:25 +0100
Subject: [PATCH 665/707] arm64: dts: qcom: qrb5165-rb5: model the PMU of the
 QCA6391

Add a node for the PMU module of the QCA6391 present on the RB5 board.
Assign its LDO power outputs to the existing Bluetooth module. Add a
node for the PCIe port to sm8250.dtsi and define the WLAN node on it in
the board's .dts and also make it consume the power outputs of the PMU.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 arch/arm64/boot/dts/qcom/qrb5165-rb5.dts | 128 +++++++++++++++++++++--
 arch/arm64/boot/dts/qcom/sm8250.dtsi     |  10 ++
 2 files changed, 127 insertions(+), 11 deletions(-)

diff --git a/arch/arm64/boot/dts/qcom/qrb5165-rb5.dts b/arch/arm64/boot/dts/qcom/qrb5165-rb5.dts
index cd0db4f31d4af9..fab5bebafbadc9 100644
--- a/arch/arm64/boot/dts/qcom/qrb5165-rb5.dts
+++ b/arch/arm64/boot/dts/qcom/qrb5165-rb5.dts
@@ -108,6 +108,87 @@
 		regulator-always-on;
 	};
 
+	qca6390_pmu: pmu@0 {
+		compatible = "qcom,qca6390-pmu";
+
+		pinctrl-names = "default";
+		pinctrl-0 = <&bt_en_state>, <&wlan_en_state>;
+
+		vddaon-supply = <&vreg_s6a_0p95>;
+		vddpmu-supply = <&vreg_s2f_0p95>;
+		vddrfa1-supply = <&vreg_s2f_0p95>;
+		vddrfa2-supply = <&vreg_s8c_1p3>;
+		vddrfa3-supply = <&vreg_s5a_1p9>;
+		vddpcie1-supply = <&vreg_s8c_1p3>;
+		vddpcie2-supply = <&vreg_s5a_1p9>;
+		vddio-supply = <&vreg_s4a_1p8>;
+
+		wlan-enable-gpios = <&tlmm 20 GPIO_ACTIVE_HIGH>;
+		bt-enable-gpios = <&tlmm 21 GPIO_ACTIVE_HIGH>;
+
+		regulators {
+			vreg_pmu_rfa_cmn: ldo0 {
+				regulator-name = "vreg_pmu_rfa_cmn";
+				regulator-min-microvolt = <760000>;
+				regulator-max-microvolt = <840000>;
+			};
+
+			vreg_pmu_aon_0p59: ldo1 {
+				regulator-name = "vreg_pmu_aon_0p59";
+				regulator-min-microvolt = <540000>;
+				regulator-max-microvolt = <840000>;
+			};
+
+			vreg_pmu_wlcx_0p8: ldo2 {
+				regulator_name = "vreg_pmu_wlcx_0p8";
+				regulator-min-microvolt = <760000>;
+				regulator-max-microvolt = <840000>;
+			};
+
+			vreg_pmu_wlmx_0p85: ldo3 {
+				regulator-name = "vreg_pmu_wlmx_0p85";
+				regulator-min-microvolt = <810000>;
+				regulator-max-microvolt = <890000>;
+			};
+
+			vreg_pmu_btcmx_0p85: ldo4 {
+				regulator-name = "vreg_pmu_btcmx_0p85";
+				regulator-min-microvolt = <810000>;
+				regulator-max-microvolt = <890000>;
+			};
+
+			vreg_pmu_rfa_0p8: ldo5 {
+				regulator-name = "vreg_pmu_rfa_0p8";
+				regulator-min-microvolt = <760000>;
+				regulator-max-microvolt = <840000>;
+			};
+
+			vreg_pmu_rfa_1p2: ldo6 {
+				regulator-name = "vreg_pmu_rfa_1p2";
+				regulator-min-microvolt = <1187000>;
+				regulator-max-microvolt = <1313000>;
+			};
+
+			vreg_pmu_rfa_1p7: ldo7 {
+				regulator_name = "vreg_pmu_rfa_1p7";
+				regulator-min-microvolt = <1710000>;
+				regulator-max-microvolt = <1890000>;
+			};
+
+			vreg_pmu_pcie_0p9: ldo8 {
+				regulator_name = "vreg_pmu_pcie_0p9";
+				regulator-min-microvolt = <870000>;
+				regulator-max-microvolt = <970000>;
+			};
+
+			vreg_pmu_pcie_1p8: ldo9 {
+				regulator_name = "vreg_pmu_pcie_1p8";
+				regulator-min-microvolt = <1710000>;
+				regulator-max-microvolt = <1890000>;
+			};
+		};
+	};
+
 	thermal-zones {
 		conn-thermal {
 			polling-delay-passive = <0>;
@@ -734,6 +815,24 @@
 	vdda-pll-supply = <&vreg_l9a_1p2>;
 };
 
+&pcieport0 {
+	wifi@0 {
+		compatible = "pci17cb,1101";
+		reg = <0x10000 0x0 0x0 0x0 0x0>;
+
+		vddrfacmn-supply = <&vreg_pmu_rfa_cmn>;
+		vddaon-supply = <&vreg_pmu_aon_0p59>;
+		vddwlcx-supply = <&vreg_pmu_wlcx_0p8>;
+		vddwlmx-supply = <&vreg_pmu_wlmx_0p85>;
+		vddbtcmx-supply = <&vreg_pmu_btcmx_0p85>;
+		vddrfa0-supply = <&vreg_pmu_rfa_0p8>;
+		vddrfa1-supply = <&vreg_pmu_rfa_1p2>;
+		vddrfa2-supply = <&vreg_pmu_rfa_1p7>;
+		vddpcie0-supply = <&vreg_pmu_pcie_0p9>;
+		vddpcie1-supply = <&vreg_pmu_pcie_1p8>;
+	};
+};
+
 &pcie1 {
 	status = "okay";
 };
@@ -1303,6 +1402,14 @@
 		function = "gpio";
 		bias-pull-up;
 	};
+
+	wlan_en_state: wlan-default-state {
+		pins = "gpio20";
+		function = "gpio";
+		drive-strength = <16>;
+		output-low;
+		bias-pull-up;
+	};
 };
 
 &uart6 {
@@ -1311,17 +1418,16 @@
 	bluetooth {
 		compatible = "qcom,qca6390-bt";
 
-		pinctrl-names = "default";
-		pinctrl-0 = <&bt_en_state>;
-
-		enable-gpios = <&tlmm 21 GPIO_ACTIVE_HIGH>;
-
-		vddio-supply = <&vreg_s4a_1p8>;
-		vddpmu-supply = <&vreg_s2f_0p95>;
-		vddaon-supply = <&vreg_s6a_0p95>;
-		vddrfa0p9-supply = <&vreg_s2f_0p95>;
-		vddrfa1p3-supply = <&vreg_s8c_1p3>;
-		vddrfa1p9-supply = <&vreg_s5a_1p9>;
+		vddrfacmn-supply = <&vreg_pmu_rfa_cmn>;
+		vddaon-supply = <&vreg_pmu_aon_0p59>;
+		vddwlcx-supply = <&vreg_pmu_wlcx_0p8>;
+		vddwlmx-supply = <&vreg_pmu_wlmx_0p85>;
+		vddbtcmx-supply = <&vreg_pmu_btcmx_0p85>;
+		vddrfa0-supply = <&vreg_pmu_rfa_0p8>;
+		vddrfa1-supply = <&vreg_pmu_rfa_1p2>;
+		vddrfa2-supply = <&vreg_pmu_rfa_1p7>;
+		vddpcie0-supply = <&vreg_pmu_pcie_0p9>;
+		vddpcie1-supply = <&vreg_pmu_pcie_1p8>;
 	};
 };
 
diff --git a/arch/arm64/boot/dts/qcom/sm8250.dtsi b/arch/arm64/boot/dts/qcom/sm8250.dtsi
index 4d849e98bf9bdc..7cd21d4e727832 100644
--- a/arch/arm64/boot/dts/qcom/sm8250.dtsi
+++ b/arch/arm64/boot/dts/qcom/sm8250.dtsi
@@ -2203,6 +2203,16 @@
 			dma-coherent;
 
 			status = "disabled";
+
+			pcieport0: pcie@0 {
+				device_type = "pci";
+				reg = <0x0 0x0 0x0 0x0 0x0>;
+				#address-cells = <3>;
+				#size-cells = <2>;
+				ranges;
+
+				bus-range = <0x01 0xff>;
+			};
 		};
 
 		pcie0_phy: phy@1c06000 {

From 2b1a70d9fac3fe0aaf86cbd498efc12394e0d719 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Thu, 1 Feb 2024 16:55:26 +0100
Subject: [PATCH 666/707] power: sequencing: new subsystem

Implement the power sequencing subsystem allowing devices to share
complex powering-up and down procedures. It's split into the consumer
and provider parts but does not implement any new DT bindings so that
the actual power sequencing is never revealed in the DT representation.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 drivers/power/Kconfig             |   1 +
 drivers/power/Makefile            |   1 +
 drivers/power/sequencing/Kconfig  |  12 +
 drivers/power/sequencing/Makefile |   4 +
 drivers/power/sequencing/core.c   | 482 ++++++++++++++++++++++++++++++
 include/linux/pwrseq/consumer.h   |  53 ++++
 include/linux/pwrseq/provider.h   |  41 +++
 7 files changed, 594 insertions(+)
 create mode 100644 drivers/power/sequencing/Kconfig
 create mode 100644 drivers/power/sequencing/Makefile
 create mode 100644 drivers/power/sequencing/core.c
 create mode 100644 include/linux/pwrseq/consumer.h
 create mode 100644 include/linux/pwrseq/provider.h

diff --git a/drivers/power/Kconfig b/drivers/power/Kconfig
index 696bf77a70420e..9a8e44ca9ae4a5 100644
--- a/drivers/power/Kconfig
+++ b/drivers/power/Kconfig
@@ -1,3 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
 source "drivers/power/reset/Kconfig"
+source "drivers/power/sequencing/Kconfig"
 source "drivers/power/supply/Kconfig"
diff --git a/drivers/power/Makefile b/drivers/power/Makefile
index effbf0377f3218..962a2cd30a51d9 100644
--- a/drivers/power/Makefile
+++ b/drivers/power/Makefile
@@ -1,3 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_POWER_RESET)	+= reset/
+obj-$(CONFIG_POWER_SEQUENCING)	+= sequencing/
 obj-$(CONFIG_POWER_SUPPLY)	+= supply/
diff --git a/drivers/power/sequencing/Kconfig b/drivers/power/sequencing/Kconfig
new file mode 100644
index 00000000000000..ba5732b1dbf84c
--- /dev/null
+++ b/drivers/power/sequencing/Kconfig
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+menuconfig POWER_SEQUENCING
+	tristate "Power Sequencing support"
+	help
+	  Say Y here to enable the Power Sequencing subsystem.
+
+	  This subsystem is designed to control power to devices that share
+	  complex resources and/or require specific power sequences to be run
+	  during power-up.
+
+	  If unsure, say no.
diff --git a/drivers/power/sequencing/Makefile b/drivers/power/sequencing/Makefile
new file mode 100644
index 00000000000000..dcdf8c0c159e3d
--- /dev/null
+++ b/drivers/power/sequencing/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_POWER_SEQUENCING)		+= pwrseq-core.o
+pwrseq-core-y				:= core.o
diff --git a/drivers/power/sequencing/core.c b/drivers/power/sequencing/core.c
new file mode 100644
index 00000000000000..f035caed0e4ec9
--- /dev/null
+++ b/drivers/power/sequencing/core.c
@@ -0,0 +1,482 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2024 Linaro Ltd.
+ */
+
+#include <linux/bug.h>
+#include <linux/cleanup.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/idr.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/property.h>
+#include <linux/pwrseq/consumer.h>
+#include <linux/pwrseq/provider.h>
+#include <linux/rwsem.h>
+
+static DEFINE_IDA(pwrseq_ida);
+/*
+ * Protects the device list on the bus from concurrent modifications but allows
+ * simultaneous read-only access.
+ */
+static DECLARE_RWSEM(pwrseq_sem);
+
+/**
+ * struct pwrseq_device - Private power sequencing data.
+ * @dev: Device struct associated with this sequencer.
+ * @id: Device ID.
+ * @owner: Prevents removal of active power sequencing providers.
+ * @pwrup_count: Keeps track of power state change requests.
+ * @sem: Protects the device against being unregistered while in use.
+ * @drvdata: Provider driver private data.
+ * @match: Power sequencer matching callback.
+ * @power_on: Power-on callback.
+ * @power_off: Power-off callback.
+ */
+struct pwrseq_device {
+	struct device dev;
+	int id;
+	struct module *owner;
+	unsigned int pwrup_count;
+	struct rw_semaphore dev_sem;
+	struct mutex state_lock;
+	void *drvdata;
+	pwrseq_match_func match;
+	pwrseq_power_state_func power_on;
+	pwrseq_power_state_func power_off;
+};
+
+/**
+ * struct pwrseq_desc - Wraps access to the pwrseq_device and ensures that one
+ *                      user cannot break the reference counting for others.
+ * @pwrseq: Reference to the power sequencing device.
+ * @powered_up: Power state set by the holder of the descriptor (not necessarily
+ * corresponding to the actual power state of the device).
+ */
+struct pwrseq_desc {
+	struct pwrseq_device *pwrseq;
+	bool powered_up;
+};
+
+static struct pwrseq_device *to_pwrseq_device(struct device *dev)
+{
+	return container_of(dev, struct pwrseq_device, dev);
+}
+
+static struct pwrseq_device *pwrseq_device_get(struct pwrseq_device *pwrseq)
+{
+	get_device(&pwrseq->dev);
+
+	return pwrseq;
+}
+
+static void pwrseq_device_put(struct pwrseq_device *pwrseq)
+{
+	put_device(&pwrseq->dev);
+}
+
+static struct bus_type pwrseq_bus = {
+	.name = "pwrseq",
+};
+
+static void pwrseq_release(struct device *dev)
+{
+	struct pwrseq_device *pwrseq = to_pwrseq_device(dev);
+
+	mutex_destroy(&pwrseq->state_lock);
+	ida_free(&pwrseq_ida, pwrseq->id);
+	kfree(pwrseq);
+}
+
+static const struct device_type pwrseq_device_type = {
+	.name = "power_sequencer",
+	.release = pwrseq_release,
+};
+
+/**
+ * pwrseq_device_register() - Register a new power sequencer.
+ * @config: Configuration of the new power sequencing device.
+ *
+ * The config structure is only used during the call and can be freed after
+ * the function returns. The config structure *must* have the parent device
+ * as well as the match(), power_on() and power_off() callbacks registered.
+ *
+ * Returns:
+ * Returns the address of the new pwrseq device or ERR_PTR() on failure.
+ */
+struct pwrseq_device *pwrseq_device_register(struct pwrseq_config *config)
+{
+	struct pwrseq_device *pwrseq;
+	int ret;
+
+	/*
+	 * Power sequencer must have a parent device and at least the power-on,
+	 * power-off and match callbacks.
+	 */
+	if (!config->parent || !config->match || !config->power_on ||
+	    !config->power_off)
+		return ERR_PTR(-EINVAL);
+
+	pwrseq = kzalloc(sizeof(*pwrseq), GFP_KERNEL);
+	if (!pwrseq)
+		return ERR_PTR(-ENOMEM);
+
+	pwrseq->dev.type = &pwrseq_device_type;
+	pwrseq->dev.bus = &pwrseq_bus;
+	pwrseq->dev.parent = config->parent;
+	device_set_node(&pwrseq->dev, dev_fwnode(config->parent));
+
+	pwrseq->id = ida_alloc(&pwrseq_ida, GFP_KERNEL);
+	if (pwrseq->id < 0) {
+		kfree(pwrseq);
+		return ERR_PTR(pwrseq->id);
+	}
+
+	/*
+	 * From this point onwards the device's release() callback is
+	 * responsible for freeing resources.
+	 */
+	device_initialize(&pwrseq->dev);
+
+	ret = dev_set_name(&pwrseq->dev, "pwrseq.%d", pwrseq->id);
+	if (ret)
+		goto err_put_pwrseq;
+
+	pwrseq->owner = config->owner ?: THIS_MODULE;
+	pwrseq->drvdata = config->drvdata;
+	pwrseq->match = config->match;
+	pwrseq->power_on = config->power_on;
+	pwrseq->power_off = config->power_off;
+
+	init_rwsem(&pwrseq->dev_sem);
+	mutex_init(&pwrseq->state_lock);
+
+	scoped_guard(rwsem_write, &pwrseq_sem) {
+		ret = device_add(&pwrseq->dev);
+		if (ret)
+			goto err_put_pwrseq;
+	}
+
+	return pwrseq;
+
+err_put_pwrseq:
+	pwrseq_device_put(pwrseq);
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(pwrseq_device_register);
+
+/**
+ * pwrseq_device_unregister() - Unregister the power sequencer.
+ * @pwrseq: Power sequencer to unregister.
+ */
+void pwrseq_device_unregister(struct pwrseq_device *pwrseq)
+{
+	struct device *dev = &pwrseq->dev;
+
+	scoped_guard(mutex, &pwrseq->state_lock) {
+		WARN_ONCE(pwrseq->pwrup_count > 0,
+			  "%s: UNREGISTERING POWER SEQUENCER WITH ACTIVE USERS\n",
+			  dev_name(&pwrseq->dev));
+
+		scoped_guard(rwsem_write, &pwrseq_sem) {
+			scoped_guard(rwsem_write, &pwrseq->dev_sem)
+				device_del(dev);
+		}
+	}
+
+	pwrseq_device_put(pwrseq);
+}
+EXPORT_SYMBOL_GPL(pwrseq_device_unregister);
+
+static void devm_pwrseq_device_unregister(void *data)
+{
+	struct pwrseq_device *pwrseq = data;
+
+	pwrseq_device_unregister(pwrseq);
+}
+
+/**
+ * devm_pwrseq_device_register() - Managed variant of pwrseq_device_register().
+ * @dev: Managing device.
+ * @config: Configuration of the new power sequencing device.
+ *
+ * Returns:
+ * Returns the address of the new pwrseq device or ERR_PTR() on failure.
+ */
+struct pwrseq_device *
+devm_pwrseq_device_register(struct device *dev, struct pwrseq_config *config)
+{
+	struct pwrseq_device *pwrseq;
+	int ret;
+
+	pwrseq = pwrseq_device_register(config);
+	if (IS_ERR(pwrseq))
+		return pwrseq;
+
+	ret = devm_add_action_or_reset(dev, devm_pwrseq_device_unregister,
+				       pwrseq);
+	if (ret)
+		return ERR_PTR(ret);
+
+	return pwrseq;
+}
+EXPORT_SYMBOL_GPL(devm_pwrseq_device_register);
+
+/**
+ * pwrseq_device_get_data() - Get the driver private data associated with this
+ *                            sequencer.
+ * @pwrseq: Power sequencer object.
+ *
+ * Returns:
+ * Address of the private driver data.
+ */
+void *pwrseq_device_get_data(struct pwrseq_device *pwrseq)
+{
+	return pwrseq->drvdata;
+}
+EXPORT_SYMBOL_GPL(pwrseq_device_get_data);
+
+struct pwrseq_match_data {
+	struct pwrseq_device *matched;
+	struct device *dev;
+};
+
+static int pwrseq_match_device(struct device *pwrseq_dev, void *data)
+{
+	struct pwrseq_device *pwrseq = to_pwrseq_device(pwrseq_dev);
+	struct pwrseq_match_data *match_data = data;
+	int ret;
+
+	guard(rwsem_read)(&pwrseq->dev_sem);
+	if (!device_is_registered(&pwrseq->dev))
+		return 0;
+
+	ret = pwrseq->match(pwrseq, match_data->dev);
+	if (ret <= 0)
+		return ret;
+
+	match_data->matched = pwrseq;
+
+	return 1;
+}
+
+/**
+ * pwrseq_get() - Get the power sequencer associated with this device.
+ * @dev: Device for which to get the sequencer.
+ *
+ * Returns:
+ * New power sequencer descriptor for use by the consumer driver or ERR_PTR()
+ * on failure.
+ */
+struct pwrseq_desc *pwrseq_get(struct device *dev)
+{
+	struct pwrseq_match_data match_data;
+	struct pwrseq_device *pwrseq;
+	int ret;
+
+	struct pwrseq_desc *desc __free(kfree) = kzalloc(sizeof(*desc),
+							 GFP_KERNEL);
+	if (!desc)
+		return ERR_PTR(-ENOMEM);
+
+	match_data.matched = NULL;
+	match_data.dev = dev;
+
+	guard(rwsem_read)(&pwrseq_sem);
+
+	ret = bus_for_each_dev(&pwrseq_bus, NULL, &match_data,
+			       pwrseq_match_device);
+	if (ret < 0)
+		return ERR_PTR(ret);
+	if (ret == 0)
+		/* No device matched. */
+		return ERR_PTR(-EPROBE_DEFER);
+
+	pwrseq = match_data.matched;
+
+	if (!try_module_get(pwrseq->owner))
+		return ERR_PTR(-EPROBE_DEFER);
+
+	desc->pwrseq = pwrseq_device_get(pwrseq);
+
+	return no_free_ptr(desc);
+}
+EXPORT_SYMBOL_GPL(pwrseq_get);
+
+/**
+ * pwrseq_put() - Release the power sequencer descriptor.
+ * @desc: Descriptor to release.
+ */
+void pwrseq_put(struct pwrseq_desc *desc)
+{
+	struct pwrseq_device *pwrseq;
+
+	if (!desc)
+		return;
+
+	pwrseq = desc->pwrseq;
+
+	if (desc->powered_up)
+		pwrseq_power_off(desc);
+
+	kfree(desc);
+	module_put(pwrseq->owner);
+	pwrseq_device_put(pwrseq);
+}
+EXPORT_SYMBOL_GPL(pwrseq_put);
+
+static void devm_pwrseq_put(void *data)
+{
+	struct pwrseq_desc *desc = data;
+
+	pwrseq_put(desc);
+}
+
+/**
+ * devm_pwrseq_get() - Managed variant of pwrseq_get().
+ * @dev: Device for which to get the sequencer and which also manages its
+ *       lifetime.
+ *
+ * Returns:
+ * New power sequencer descriptor for use by the consumer driver or ERR_PTR()
+ * on failure.
+ */
+struct pwrseq_desc *devm_pwrseq_get(struct device *dev)
+{
+	struct pwrseq_desc *desc;
+	int ret;
+
+	desc = pwrseq_get(dev);
+	if (IS_ERR(desc))
+		return desc;
+
+	ret = devm_add_action_or_reset(dev, devm_pwrseq_put, desc);
+	if (ret)
+		return ERR_PTR(ret);
+
+	return desc;
+}
+EXPORT_SYMBOL_GPL(devm_pwrseq_get);
+
+/**
+ * pwrseq_power_on() - Issue a power-on request on behalf of the consumer
+ *                     device.
+ * @desc: Descriptor referencing the power sequencer.
+ *
+ * This function tells the power sequencer that the consumer wants to be
+ * powered-up. The sequencer may already have powered-up the device in which
+ * case the function returns 0. If the power-up sequence is already in
+ * progress, the function will block until it's done and return 0. If this is
+ * the first request, the device will be powered up.
+ *
+ * Returns:
+ * 0 on success, negative error number on failure.
+ */
+int pwrseq_power_on(struct pwrseq_desc *desc)
+{
+	struct pwrseq_device *pwrseq;
+	int ret;
+
+	might_sleep();
+
+	if (!desc || desc->powered_up)
+		return 0;
+
+	pwrseq = desc->pwrseq;
+
+	guard(rwsem_read)(&pwrseq->dev_sem);
+	if (!device_is_registered(&pwrseq->dev))
+		return -ENODEV;
+
+	guard(mutex)(&pwrseq->state_lock);
+
+	pwrseq->pwrup_count++;
+	if (pwrseq->pwrup_count != 1) {
+		desc->powered_up = true;
+		return 0;
+	}
+
+	ret = pwrseq->power_on(pwrseq);
+	if (!ret)
+		desc->powered_up = true;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(pwrseq_power_on);
+
+/**
+ * pwrseq_power_off() - Issue a power-off request on behalf of the consumer
+ *                      device.
+ * @desc: Descriptor referencing the power sequencer.
+ *
+ * This undoes the effects of pwrseq_power_on(). It issues a power-off request
+ * on behalf of the consumer and when the last remaining user does so, the
+ * power-down sequence will be started. If one is in progress, the function
+ * will block until it's complete and then return.
+ *
+ * Returns:
+ * 0 on success, negative error number on failure.
+ */
+int pwrseq_power_off(struct pwrseq_desc *desc)
+{
+	struct pwrseq_device *pwrseq;
+	int ret;
+
+	might_sleep();
+
+	if (!desc || !desc->powered_up)
+		return 0;
+
+	pwrseq = desc->pwrseq;
+
+	guard(rwsem_read)(&pwrseq->dev_sem);
+	if (!device_is_registered(&pwrseq->dev))
+		return -ENODEV;
+
+	guard(mutex)(&pwrseq->state_lock);
+
+	if (pwrseq->pwrup_count == 0) {
+		WARN_ONCE(1, "Unmatched power-off\n");
+		return -EBUSY;
+	}
+
+	pwrseq->pwrup_count--;
+	if (pwrseq->pwrup_count != 0) {
+		desc->powered_up = false;
+		return 0;
+	}
+
+	ret = pwrseq->power_off(pwrseq);
+	if (!ret)
+		desc->powered_up = false;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(pwrseq_power_off);
+
+static int __init pwrseq_init(void)
+{
+	int ret;
+
+	ret = bus_register(&pwrseq_bus);
+	if (ret) {
+		pr_err("Failed to register the power sequencer bus\n");
+		return ret;
+	}
+
+	return 0;
+}
+subsys_initcall(pwrseq_init);
+
+static void __exit pwrseq_exit(void)
+{
+	bus_unregister(&pwrseq_bus);
+}
+module_exit(pwrseq_exit);
+
+MODULE_AUTHOR("Bartosz Golaszewski <bartosz.golaszewski@linaro.org>");
+MODULE_DESCRIPTION("Power Sequencing subsystem core");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/pwrseq/consumer.h b/include/linux/pwrseq/consumer.h
new file mode 100644
index 00000000000000..f207b8b2864dbe
--- /dev/null
+++ b/include/linux/pwrseq/consumer.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2024 Linaro Ltd.
+ */
+
+#ifndef __POWER_SEQUENCING_CONSUMER_H__
+#define __POWER_SEQUENCING_CONSUMER_H__
+
+#include <linux/err.h>
+
+struct device;
+struct pwrseq_desc;
+
+#if IS_ENABLED(CONFIG_POWER_SEQUENCING)
+
+struct pwrseq_desc * __must_check pwrseq_get(struct device *dev);
+void pwrseq_put(struct pwrseq_desc *desc);
+
+struct pwrseq_desc * __must_check devm_pwrseq_get(struct device *dev);
+
+int pwrseq_power_on(struct pwrseq_desc *desc);
+int pwrseq_power_off(struct pwrseq_desc *desc);
+
+#else /* CONFIG_POWER_SEQUENCING */
+
+static inline struct pwrseq_desc * __must_check pwrseq_get(struct device *dev)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+static inline void pwrseq_put(struct pwrseq_desc *desc)
+{
+}
+
+static inline struct pwrseq_desc * __must_check
+devm_pwrseq_get(struct device *dev)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+static inline int pwrseq_power_on(struct pwrseq_desc *desc)
+{
+	return -ENOSYS;
+}
+
+static inline int pwrseq_power_off(struct pwrseq_desc *desc)
+{
+	return -ENOSYS;
+}
+
+#endif /* CONFIG_POWER_SEQUENCING */
+
+#endif /* __POWER_SEQUENCING_CONSUMER_H__ */
diff --git a/include/linux/pwrseq/provider.h b/include/linux/pwrseq/provider.h
new file mode 100644
index 00000000000000..8696a89afa43c0
--- /dev/null
+++ b/include/linux/pwrseq/provider.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2024 Linaro Ltd.
+ */
+
+#ifndef __POWER_SEQUENCING_PROVIDER_H__
+#define __POWER_SEQUENCING_PROVIDER_H__
+
+struct device;
+struct module;
+struct pwrseq_device;
+
+typedef int (*pwrseq_power_state_func)(struct pwrseq_device *);
+typedef int (*pwrseq_match_func)(struct pwrseq_device *, struct device *);
+
+/**
+ * struct pwrseq_config - Configuration used for registering a new provider.
+ * @parent: Parent device for the sequencer.
+ * @owner: Module providing this device.
+ * @drvdata: Private driver data.
+ * @match: Provider callback used to match the consumer device to the sequencer.
+ * @power_on: Callback running the power-on sequence.
+ * @power_off: Callback running the power-off sequence.
+ */
+struct pwrseq_config {
+	struct device *parent;
+	struct module *owner;
+	void *drvdata;
+	pwrseq_match_func match;
+	pwrseq_power_state_func power_on;
+	pwrseq_power_state_func power_off;
+};
+
+struct pwrseq_device *pwrseq_device_register(struct pwrseq_config *config);
+void pwrseq_device_unregister(struct pwrseq_device *pwrseq);
+struct pwrseq_device *
+devm_pwrseq_device_register(struct device *dev, struct pwrseq_config *config);
+
+void *pwrseq_device_get_data(struct pwrseq_device *pwrseq);
+
+#endif /* __POWER_SEQUENCING_PROVIDER_H__ */

From 1f9afdbf94c63d9031b3948807e88e72e1d09a60 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Thu, 1 Feb 2024 16:55:27 +0100
Subject: [PATCH 667/707] power: pwrseq: add a driver for the QCA6390 PMU
 module

This adds the power sequencing driver for the QCA6390's PMU module. It
uses the pwrseq subsystem and knows how to match the sequencer to the
consumer device by verifying the relevant properties and DT layout.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 drivers/power/sequencing/Kconfig          |  16 ++
 drivers/power/sequencing/Makefile         |   2 +
 drivers/power/sequencing/pwrseq-qca6390.c | 232 ++++++++++++++++++++++
 3 files changed, 250 insertions(+)
 create mode 100644 drivers/power/sequencing/pwrseq-qca6390.c

diff --git a/drivers/power/sequencing/Kconfig b/drivers/power/sequencing/Kconfig
index ba5732b1dbf84c..84ddf3b4ae56ad 100644
--- a/drivers/power/sequencing/Kconfig
+++ b/drivers/power/sequencing/Kconfig
@@ -10,3 +10,19 @@ menuconfig POWER_SEQUENCING
 	  during power-up.
 
 	  If unsure, say no.
+
+if POWER_SEQUENCING
+
+config POWER_SEQUENCING_QCA6390
+	tristate "QCA6390 PMU driver"
+	default m if ARCH_QCOM
+	help
+	  Say U here to enable the power sequencing driver for Qualcomm
+	  QCA6390.
+
+	  The QCA6390 package contains the BT and WLAN modules whose power
+	  is controlled by the PMU module. As the former two share the power-up
+	  sequence which is executed by the PMU, this driver is needed for
+	  correct power control.
+
+endif
diff --git a/drivers/power/sequencing/Makefile b/drivers/power/sequencing/Makefile
index dcdf8c0c159e3d..628345c4e7aecc 100644
--- a/drivers/power/sequencing/Makefile
+++ b/drivers/power/sequencing/Makefile
@@ -2,3 +2,5 @@
 
 obj-$(CONFIG_POWER_SEQUENCING)		+= pwrseq-core.o
 pwrseq-core-y				:= core.o
+
+obj-$(CONFIG_POWER_SEQUENCING_QCA6390)	+= pwrseq-qca6390.o
diff --git a/drivers/power/sequencing/pwrseq-qca6390.c b/drivers/power/sequencing/pwrseq-qca6390.c
new file mode 100644
index 00000000000000..6c930e3e88ec9a
--- /dev/null
+++ b/drivers/power/sequencing/pwrseq-qca6390.c
@@ -0,0 +1,232 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2024 Linaro Ltd.
+ */
+
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/gpio/consumer.h>
+#include <linux/mod_devicetable.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/regulator/consumer.h>
+#include <linux/pwrseq/provider.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+struct pwrseq_qca6390_vreg {
+	const char *name;
+	unsigned int load_uA;
+};
+
+struct pwrseq_qca6390_pdata {
+	const struct pwrseq_qca6390_vreg *vregs;
+	size_t num_vregs;
+	unsigned int pwup_delay_msec;
+};
+
+struct pwrseq_qca6390_ctx {
+	struct pwrseq_device *pwrseq;
+	struct device_node *of_node;
+	const struct pwrseq_qca6390_pdata *pdata;
+	struct regulator_bulk_data *regs;
+	struct gpio_desc *bt_gpio;
+	struct gpio_desc *wlan_gpio;
+};
+
+static const struct pwrseq_qca6390_vreg pwrseq_qca6390_vregs[] = {
+	{
+		.name = "vddio",
+		.load_uA = 20000,
+	},
+	{
+		.name = "vddaon",
+		.load_uA = 100000,
+	},
+	{
+		.name = "vddpmu",
+		.load_uA = 1250000,
+	},
+	{
+		.name = "vddpcie1",
+		.load_uA = 35000,
+	},
+	{
+		.name = "vddpcie2",
+		.load_uA = 15000,
+	},
+	{
+		.name = "vddrfa1",
+		.load_uA = 200000,
+	},
+	{
+		.name = "vddrfa2",
+		.load_uA = 400000,
+	},
+	{
+		.name = "vddrfa3",
+		.load_uA = 400000,
+	},
+};
+
+static const struct pwrseq_qca6390_pdata pwrseq_qca6390_of_data = {
+	.vregs = pwrseq_qca6390_vregs,
+	.num_vregs = ARRAY_SIZE(pwrseq_qca6390_vregs),
+	.pwup_delay_msec = 16,
+};
+
+static int pwrseq_qca6390_power_on(struct pwrseq_device *pwrseq)
+{
+	struct pwrseq_qca6390_ctx *ctx = pwrseq_device_get_data(pwrseq);
+	int ret;
+
+	ret = regulator_bulk_enable(ctx->pdata->num_vregs, ctx->regs);
+	if (ret)
+		return ret;
+
+	gpiod_set_value_cansleep(ctx->bt_gpio, 1);
+	gpiod_set_value_cansleep(ctx->wlan_gpio, 1);
+
+	if (ctx->pdata->pwup_delay_msec)
+		msleep(ctx->pdata->pwup_delay_msec);
+
+	return 0;
+}
+
+static int pwrseq_qca6390_power_off(struct pwrseq_device *pwrseq)
+{
+	struct pwrseq_qca6390_ctx *ctx = pwrseq_device_get_data(pwrseq);
+
+	gpiod_set_value_cansleep(ctx->bt_gpio, 0);
+	gpiod_set_value_cansleep(ctx->wlan_gpio, 0);
+
+	return regulator_bulk_disable(ctx->pdata->num_vregs, ctx->regs);
+}
+
+static int pwrseq_qca6390_match(struct pwrseq_device *pwrseq,
+				struct device *dev)
+{
+	struct pwrseq_qca6390_ctx *ctx = pwrseq_device_get_data(pwrseq);
+	struct device_node *dev_node = dev->of_node;
+
+	/*
+	 * The PMU supplies power to the Bluetooth and WLAN modules. both
+	 * consume the PMU AON output so check the presence of the
+	 * 'vddaon-supply' property and whether it leads us to the right
+	 * device.
+	 */
+	if (!of_property_present(dev_node, "vddaon-supply"))
+		return 0;
+
+	struct device_node *reg_node __free(of_node) =
+			of_parse_phandle(dev_node, "vddaon-supply", 0);
+	if (!reg_node)
+		return 0;
+
+	/*
+	 * `reg_node` is the PMU AON regulator, its parent is the `regulators`
+	 * node and finally its grandparent is the PMU device node that we're
+	 * looking for.
+	 */
+	if (!reg_node->parent || !reg_node->parent->parent ||
+	    reg_node->parent->parent != ctx->of_node)
+		return 0;
+
+	return 1;
+}
+
+static int pwrseq_qca6390_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct pwrseq_qca6390_ctx *ctx;
+	struct pwrseq_config config;
+	int ret, i;
+
+	ctx = devm_kzalloc(dev, sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->of_node = dev->of_node;
+
+	ctx->pdata = of_device_get_match_data(dev);
+	if (!ctx->pdata)
+		return dev_err_probe(dev, -ENODEV,
+				     "Failed to obtain platform data\n");
+
+	if (ctx->pdata->vregs) {
+		ctx->regs = devm_kcalloc(dev, ctx->pdata->num_vregs,
+					 sizeof(*ctx->regs), GFP_KERNEL);
+		if (!ctx->regs)
+			return -ENOMEM;
+
+		for (i = 0; i < ctx->pdata->num_vregs; i++)
+			ctx->regs[i].supply = ctx->pdata->vregs[i].name;
+
+		ret = devm_regulator_bulk_get(dev, ctx->pdata->num_vregs,
+					      ctx->regs);
+		if (ret < 0)
+			return dev_err_probe(dev, ret,
+					     "Failed to get all regulators\n");
+
+		for (i = 0; i < ctx->pdata->num_vregs; i++) {
+			if (!ctx->pdata->vregs[1].load_uA)
+				continue;
+
+			ret = regulator_set_load(ctx->regs[i].consumer,
+						 ctx->pdata->vregs[i].load_uA);
+			if (ret)
+				return dev_err_probe(dev, ret,
+						     "Failed to set vreg load\n");
+		}
+	}
+
+	ctx->bt_gpio = devm_gpiod_get_optional(dev, "bt-enable", GPIOD_OUT_LOW);
+	if (IS_ERR(ctx->bt_gpio))
+		return dev_err_probe(dev, PTR_ERR(ctx->bt_gpio),
+				     "Failed to get the Bluetooth enable GPIO\n");
+
+	ctx->wlan_gpio = devm_gpiod_get_optional(dev, "wlan-enable",
+						 GPIOD_OUT_LOW);
+	if (IS_ERR(ctx->wlan_gpio))
+		return dev_err_probe(dev, PTR_ERR(ctx->wlan_gpio),
+				     "Failed to get the WLAN enable GPIO\n");
+
+	memset(&config, 0, sizeof(config));
+
+	config.parent = dev;
+	config.owner = THIS_MODULE;
+	config.drvdata = ctx;
+	config.match = pwrseq_qca6390_match;
+	config.power_on = pwrseq_qca6390_power_on;
+	config.power_off = pwrseq_qca6390_power_off;
+
+	ctx->pwrseq = devm_pwrseq_device_register(dev, &config);
+	if (IS_ERR(ctx->pwrseq))
+		return dev_err_probe(dev, PTR_ERR(ctx->pwrseq),
+				     "Failed to register the power sequencer\n");
+
+	return 0;
+}
+
+static const struct of_device_id pwrseq_qca6390_of_match[] = {
+	{
+		.compatible = "qcom,qca6390-pmu",
+		.data = &pwrseq_qca6390_of_data,
+	},
+	{ }
+};
+MODULE_DEVICE_TABLE(of, pwrseq_qca6390_of_match);
+
+static struct platform_driver pwrseq_qca6390_driver = {
+	.driver = {
+		.name = "pwrseq-qca6390",
+		.of_match_table = pwrseq_qca6390_of_match,
+	},
+	.probe = pwrseq_qca6390_probe,
+};
+module_platform_driver(pwrseq_qca6390_driver);
+
+MODULE_AUTHOR("Bartosz Golaszewski <bartosz.golaszewski@linaro.org>");
+MODULE_DESCRIPTION("QCA6390 PMU power sequencing driver");
+MODULE_LICENSE("GPL");

From b9ed8b5a2c000afaae55811a3a7aacfbf1cf7b16 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Thu, 1 Feb 2024 16:55:28 +0100
Subject: [PATCH 668/707] Bluetooth: qca: use the power sequencer for QCA6390

Use the pwrseq subsystem's consumer API to run the power-up sequence for
the Bluetooth module of the QCA6390 package.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 drivers/bluetooth/hci_qca.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c
index 94b8c406f0c0ed..21c306caafbcb3 100644
--- a/drivers/bluetooth/hci_qca.c
+++ b/drivers/bluetooth/hci_qca.c
@@ -28,6 +28,7 @@
 #include <linux/of.h>
 #include <linux/acpi.h>
 #include <linux/platform_device.h>
+#include <linux/pwrseq/consumer.h>
 #include <linux/regulator/consumer.h>
 #include <linux/serdev.h>
 #include <linux/mutex.h>
@@ -214,6 +215,7 @@ struct qca_power {
 	struct regulator_bulk_data *vreg_bulk;
 	int num_vregs;
 	bool vregs_on;
+	struct pwrseq_desc *pwrseq;
 };
 
 struct qca_serdev {
@@ -1791,6 +1793,11 @@ static int qca_power_on(struct hci_dev *hdev)
 		ret = qca_regulator_init(hu);
 		break;
 
+	case QCA_QCA6390:
+		qcadev = serdev_device_get_drvdata(hu->serdev);
+		ret = pwrseq_power_on(qcadev->bt_power->pwrseq);
+		break;
+
 	default:
 		qcadev = serdev_device_get_drvdata(hu->serdev);
 		if (qcadev->bt_en) {
@@ -2160,6 +2167,10 @@ static void qca_power_shutdown(struct hci_uart *hu)
 		}
 		break;
 
+	case QCA_QCA6390:
+		pwrseq_power_off(qcadev->bt_power->pwrseq);
+		break;
+
 	default:
 		gpiod_set_value_cansleep(qcadev->bt_en, 0);
 	}
@@ -2298,12 +2309,25 @@ static int qca_serdev_probe(struct serdev_device *serdev)
 	case QCA_WCN6750:
 	case QCA_WCN6855:
 	case QCA_WCN7850:
+	case QCA_QCA6390:
 		qcadev->bt_power = devm_kzalloc(&serdev->dev,
 						sizeof(struct qca_power),
 						GFP_KERNEL);
 		if (!qcadev->bt_power)
 			return -ENOMEM;
+		break;
+	default:
+		break;
+	}
 
+	switch (qcadev->btsoc_type) {
+	case QCA_WCN3988:
+	case QCA_WCN3990:
+	case QCA_WCN3991:
+	case QCA_WCN3998:
+	case QCA_WCN6750:
+	case QCA_WCN6855:
+	case QCA_WCN7850:
 		qcadev->bt_power->dev = &serdev->dev;
 		err = qca_init_regulators(qcadev->bt_power, data->vregs,
 					  data->num_vregs);
@@ -2344,6 +2368,12 @@ static int qca_serdev_probe(struct serdev_device *serdev)
 		}
 		break;
 
+	case QCA_QCA6390:
+		qcadev->bt_power->pwrseq = devm_pwrseq_get(&serdev->dev);
+		if (IS_ERR(qcadev->bt_power->pwrseq))
+			return PTR_ERR(qcadev->bt_power->pwrseq);
+		fallthrough;
+
 	default:
 		qcadev->bt_en = devm_gpiod_get_optional(&serdev->dev, "enable",
 					       GPIOD_OUT_LOW);

From f5e49cd2dcec4fad9a61eef8e34f0cb4ce09f0f3 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Thu, 1 Feb 2024 16:55:29 +0100
Subject: [PATCH 669/707] PCI: create platform devices for child OF nodes of
 the port node

In order to introduce PCI power-sequencing, we need to create platform
devices for child nodes of the port node. They will get matched against
the pwrseq drivers (if one exists) and then the actual PCI device will
reuse the node once it's detected on the bus.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 drivers/pci/bus.c    | 9 ++++++++-
 drivers/pci/remove.c | 2 ++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index 826b5016a10102..17ab41094c4e3c 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -12,6 +12,7 @@
 #include <linux/errno.h>
 #include <linux/ioport.h>
 #include <linux/of.h>
+#include <linux/of_platform.h>
 #include <linux/proc_fs.h>
 #include <linux/slab.h>
 
@@ -342,8 +343,14 @@ void pci_bus_add_device(struct pci_dev *dev)
 	 */
 	pcibios_bus_add_device(dev);
 	pci_fixup_device(pci_fixup_final, dev);
-	if (pci_is_bridge(dev))
+	if (pci_is_bridge(dev)) {
 		of_pci_make_dev_node(dev);
+		retval = of_platform_populate(dev->dev.of_node, NULL, NULL,
+					      &dev->dev);
+		if (retval)
+			pci_err(dev, "failed to populate child OF nodes (%d)\n",
+				retval);
+	}
 	pci_create_sysfs_dev_files(dev);
 	pci_proc_attach_device(dev);
 	pci_bridge_d3_update(dev);
diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c
index d749ea8250d656..fc9db2805888ab 100644
--- a/drivers/pci/remove.c
+++ b/drivers/pci/remove.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/pci.h>
 #include <linux/module.h>
+#include <linux/of_platform.h>
 #include "pci.h"
 
 static void pci_free_resources(struct pci_dev *dev)
@@ -22,6 +23,7 @@ static void pci_stop_dev(struct pci_dev *dev)
 		device_release_driver(&dev->dev);
 		pci_proc_detach_device(dev);
 		pci_remove_sysfs_dev_files(dev);
+		of_platform_depopulate(&dev->dev);
 		of_pci_remove_node(dev);
 
 		pci_dev_assign_added(dev, false);

From d3688012fd051930c975956e04cd519bbe8fddca Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Thu, 1 Feb 2024 16:55:30 +0100
Subject: [PATCH 670/707] PCI: hold the rescan mutex when scanning for the
 first time

With the introduction of the power sequencing drivers that will be able
to trigger the port rescan, we need to hold the rescan mutex during the
initial pci_host_probe() too or the two could get in each other's way.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 drivers/pci/probe.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index b7335be56008f7..957f7afee7ba51 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -3122,7 +3122,9 @@ int pci_host_probe(struct pci_host_bridge *bridge)
 	struct pci_bus *bus, *child;
 	int ret;
 
+	pci_lock_rescan_remove();
 	ret = pci_scan_root_bus_bridge(bridge);
+	pci_unlock_rescan_remove();
 	if (ret < 0) {
 		dev_err(bridge->dev.parent, "Scanning root bridge failed");
 		return ret;

From a0c987643e9bfdf48ec9e06640a724a63c5c2096 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Thu, 1 Feb 2024 16:55:31 +0100
Subject: [PATCH 671/707] PCI/pwrctl: add PCI power control core code

Some PCI devices must be powered-on before they can be detected on the
bus. Introduce a simple framework reusing the existing PCI OF
infrastructure.

The way this works is: a DT node representing a PCI device connected to
the port can be matched against its power control platform driver. If
the match succeeds, the driver is responsible for powering-up the device
and calling pcie_pwrctl_device_enable() which will trigger a PCI bus
rescan as well as subscribe to PCI bus notifications.

When the device is detected and created, we'll make it consume the same
DT node that the platform device did. When the device is bound, we'll
create a device link between it and the parent power control device.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 drivers/pci/Kconfig         |  1 +
 drivers/pci/Makefile        |  1 +
 drivers/pci/pwrctl/Kconfig  |  8 ++++
 drivers/pci/pwrctl/Makefile |  3 ++
 drivers/pci/pwrctl/core.c   | 82 +++++++++++++++++++++++++++++++++++++
 include/linux/pci-pwrctl.h  | 24 +++++++++++
 6 files changed, 119 insertions(+)
 create mode 100644 drivers/pci/pwrctl/Kconfig
 create mode 100644 drivers/pci/pwrctl/Makefile
 create mode 100644 drivers/pci/pwrctl/core.c
 create mode 100644 include/linux/pci-pwrctl.h

diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 74147262625bc2..5b9b84f8774fb0 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -291,5 +291,6 @@ source "drivers/pci/hotplug/Kconfig"
 source "drivers/pci/controller/Kconfig"
 source "drivers/pci/endpoint/Kconfig"
 source "drivers/pci/switch/Kconfig"
+source "drivers/pci/pwrctl/Kconfig"
 
 endif
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index cc8b4e01e29de5..6ae202d950f8bf 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -9,6 +9,7 @@ obj-$(CONFIG_PCI)		+= access.o bus.o probe.o host-bridge.o \
 
 obj-$(CONFIG_PCI)		+= msi/
 obj-$(CONFIG_PCI)		+= pcie/
+obj-$(CONFIG_PCI)		+= pwrctl/
 
 ifdef CONFIG_PCI
 obj-$(CONFIG_PROC_FS)		+= proc.o
diff --git a/drivers/pci/pwrctl/Kconfig b/drivers/pci/pwrctl/Kconfig
new file mode 100644
index 00000000000000..e2dc5e5d2af1e8
--- /dev/null
+++ b/drivers/pci/pwrctl/Kconfig
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+
+menu "PCI Power control drivers"
+
+config PCI_PWRCTL
+	bool
+
+endmenu
diff --git a/drivers/pci/pwrctl/Makefile b/drivers/pci/pwrctl/Makefile
new file mode 100644
index 00000000000000..4381cfbf3f211a
--- /dev/null
+++ b/drivers/pci/pwrctl/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_PCI_PWRCTL)		+= core.o
diff --git a/drivers/pci/pwrctl/core.c b/drivers/pci/pwrctl/core.c
new file mode 100644
index 00000000000000..312e6fe95c315f
--- /dev/null
+++ b/drivers/pci/pwrctl/core.c
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2024 Linaro Ltd.
+ */
+
+#include <linux/device.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/pci-pwrctl.h>
+#include <linux/property.h>
+#include <linux/slab.h>
+
+static int pci_pwrctl_notify(struct notifier_block *nb, unsigned long action,
+			     void *data)
+{
+	struct pci_pwrctl *pwrctl = container_of(nb, struct pci_pwrctl, nb);
+	struct device *dev = data;
+
+	if (dev_fwnode(dev) != dev_fwnode(pwrctl->dev))
+		return NOTIFY_DONE;
+
+	switch (action) {
+	case BUS_NOTIFY_ADD_DEVICE:
+		device_set_of_node_from_dev(dev, pwrctl->dev);
+		break;
+	case BUS_NOTIFY_BOUND_DRIVER:
+		pwrctl->link = device_link_add(dev, pwrctl->dev,
+					       DL_FLAG_AUTOREMOVE_CONSUMER);
+		if (!pwrctl->link)
+			dev_err(pwrctl->dev, "Failed to add device link\n");
+		break;
+	case BUS_NOTIFY_UNBOUND_DRIVER:
+		device_link_del(pwrctl->link);
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+
+int pci_pwrctl_device_enable(struct pci_pwrctl *pwrctl)
+{
+	if (!pwrctl->dev)
+		return -ENODEV;
+
+	pwrctl->nb.notifier_call = pci_pwrctl_notify;
+	bus_register_notifier(&pci_bus_type, &pwrctl->nb);
+
+	pci_lock_rescan_remove();
+	pci_rescan_bus(to_pci_dev(pwrctl->dev->parent)->bus);
+	pci_unlock_rescan_remove();
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pci_pwrctl_device_enable);
+
+void pci_pwrctl_device_disable(struct pci_pwrctl *pwrctl)
+{
+	bus_unregister_notifier(&pci_bus_type, &pwrctl->nb);
+}
+EXPORT_SYMBOL_GPL(pci_pwrctl_device_disable);
+
+static void devm_pci_pwrctl_device_disable(void *data)
+{
+	struct pci_pwrctl *pwrctl = data;
+
+	pci_pwrctl_device_disable(pwrctl);
+}
+
+int devm_pci_pwrctl_device_enable(struct device *dev,
+				  struct pci_pwrctl *pwrctl)
+{
+	int ret;
+
+	ret = pci_pwrctl_device_enable(pwrctl);
+	if (ret)
+		return ret;
+
+	return devm_add_action_or_reset(dev, devm_pci_pwrctl_device_disable,
+					pwrctl);
+}
+EXPORT_SYMBOL_GPL(devm_pci_pwrctl_device_enable);
diff --git a/include/linux/pci-pwrctl.h b/include/linux/pci-pwrctl.h
new file mode 100644
index 00000000000000..8d16d27cbfebff
--- /dev/null
+++ b/include/linux/pci-pwrctl.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2024 Linaro Ltd.
+ */
+
+#ifndef __PCI_PWRCTL_H__
+#define __PCI_PWRCTL_H__
+
+#include <linux/notifier.h>
+
+struct device;
+
+struct pci_pwrctl {
+	struct notifier_block nb;
+	struct device *dev;
+	struct device_link *link;
+};
+
+int pci_pwrctl_device_enable(struct pci_pwrctl *pwrctl);
+void pci_pwrctl_device_disable(struct pci_pwrctl *pwrctl);
+int devm_pci_pwrctl_device_enable(struct device *dev,
+				  struct pci_pwrctl *pwrctl);
+
+#endif /* __PCI_PWRCTL_H__ */

From ea458457f1f665494a23f3ced6f3fb692dcfb45b Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Thu, 1 Feb 2024 16:55:32 +0100
Subject: [PATCH 672/707] PCI/pwrctl: add a PCI power control driver for power
 sequenced devices

Add a PCI power control driver that's capable of correctly powering up
devices using the power sequencing subsystem. For now we support the
ath11k module on QCA6390.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 drivers/pci/pwrctl/Kconfig             |  9 +++
 drivers/pci/pwrctl/Makefile            |  1 +
 drivers/pci/pwrctl/pci-pwrctl-pwrseq.c | 83 ++++++++++++++++++++++++++
 3 files changed, 93 insertions(+)
 create mode 100644 drivers/pci/pwrctl/pci-pwrctl-pwrseq.c

diff --git a/drivers/pci/pwrctl/Kconfig b/drivers/pci/pwrctl/Kconfig
index e2dc5e5d2af1e8..bca72dc08e79fd 100644
--- a/drivers/pci/pwrctl/Kconfig
+++ b/drivers/pci/pwrctl/Kconfig
@@ -5,4 +5,13 @@ menu "PCI Power control drivers"
 config PCI_PWRCTL
 	bool
 
+config PCI_PWRCTL_PWRSEQ
+	tristate "PCI Power Control driver using the Power Sequencing subsystem"
+	select POWER_SEQUENCING
+	select PCI_PWRCTL
+	default m if (ATH11K_PCI && ARCH_QCOM)
+	help
+	  Enable support for the PCI power control driver for device
+	  drivers using the Power Sequencing subsystem.
+
 endmenu
diff --git a/drivers/pci/pwrctl/Makefile b/drivers/pci/pwrctl/Makefile
index 4381cfbf3f211a..919c0f704ee9ea 100644
--- a/drivers/pci/pwrctl/Makefile
+++ b/drivers/pci/pwrctl/Makefile
@@ -1,3 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
 
 obj-$(CONFIG_PCI_PWRCTL)		+= core.o
+obj-$(CONFIG_PCI_PWRCTL_PWRSEQ)		+= pci-pwrctl-pwrseq.o
diff --git a/drivers/pci/pwrctl/pci-pwrctl-pwrseq.c b/drivers/pci/pwrctl/pci-pwrctl-pwrseq.c
new file mode 100644
index 00000000000000..510598c4edc47b
--- /dev/null
+++ b/drivers/pci/pwrctl/pci-pwrctl-pwrseq.c
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2023-2024 Linaro Ltd.
+ */
+
+#include <linux/device.h>
+#include <linux/mod_devicetable.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/pci-pwrctl.h>
+#include <linux/platform_device.h>
+#include <linux/pwrseq/consumer.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+struct pci_pwrctl_pwrseq_data {
+	struct pci_pwrctl ctx;
+	struct pwrseq_desc *pwrseq;
+};
+
+static void devm_pci_pwrctl_pwrseq_power_off(void *data)
+{
+	struct pwrseq_desc *pwrseq = data;
+
+	pwrseq_power_off(pwrseq);
+}
+
+static int pci_pwrctl_pwrseq_probe(struct platform_device *pdev)
+{
+	struct pci_pwrctl_pwrseq_data *data;
+	struct device *dev = &pdev->dev;
+	int ret;
+
+	data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	data->pwrseq = devm_pwrseq_get(dev);
+	if (IS_ERR(data->pwrseq))
+		return dev_err_probe(dev, PTR_ERR(data->pwrseq),
+				     "Failed to get the power sequencer\n");
+
+	ret = pwrseq_power_on(data->pwrseq);
+	if (ret)
+		return dev_err_probe(dev, ret,
+				     "Failed to power-on the device\n");
+
+	ret = devm_add_action_or_reset(dev, devm_pci_pwrctl_pwrseq_power_off,
+				       data->pwrseq);
+	if (ret)
+		return ret;
+
+	data->ctx.dev = dev;
+
+	ret = devm_pci_pwrctl_device_enable(dev, &data->ctx);
+	if (ret)
+		return dev_err_probe(dev, ret,
+				     "Failed to register the pwrctl wrapper\n");
+
+	return 0;
+}
+
+static const struct of_device_id pci_pwrctl_pwrseq_of_match[] = {
+	{
+		/* ATH11K in QCA6390 package. */
+		.compatible = "pci17cb,1101",
+	},
+	{ }
+};
+MODULE_DEVICE_TABLE(of, pci_pwrctl_pwrseq_of_match);
+
+static struct platform_driver pci_pwrctl_pwrseq_driver = {
+	.driver = {
+		.name = "pci-pwrctl-pwrseq",
+		.of_match_table = pci_pwrctl_pwrseq_of_match,
+	},
+	.probe = pci_pwrctl_pwrseq_probe,
+};
+module_platform_driver(pci_pwrctl_pwrseq_driver);
+
+MODULE_AUTHOR("Bartosz Golaszewski <bartosz.golaszewski@linaro.org>");
+MODULE_DESCRIPTION("Generic PCI Power Control module for power sequenced devices");
+MODULE_LICENSE("GPL");

From 06180b178035251b348a89611a69741ad710ad69 Mon Sep 17 00:00:00 2001
From: Marijn Suijten <marijn.suijten@somainline.org>
Date: Wed, 26 Jul 2023 10:17:38 +0200
Subject: [PATCH 673/707] drm/msm/dpu: Hack in 1:1:1 DSC topology

---
 drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c | 29 +++++++++------------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c
index 83380bc92a00a9..22cb575d5d124a 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c
@@ -504,9 +504,8 @@ bool dpu_encoder_use_dsc_merge(struct drm_encoder *drm_enc)
 		if (dpu_enc->phys_encs[i])
 			intf_count++;
 
-	/* See dpu_encoder_get_topology, we only support 2:2:1 topology */
 	if (dpu_enc->dsc)
-		num_dsc = 2;
+		num_dsc = 1;
 
 	return (num_dsc > 0) && (num_dsc > intf_count);
 }
@@ -567,8 +566,8 @@ static struct msm_display_topology dpu_encoder_get_topology(
 		 * this is power optimal and can drive up to (including) 4k
 		 * screens
 		 */
-		topology.num_dsc = 2;
-		topology.num_lm = 2;
+		topology.num_dsc = 1;
+		topology.num_lm = 1;
 		topology.num_intf = 1;
 	}
 
@@ -1832,46 +1831,45 @@ static void dpu_encoder_dsc_pipe_cfg(struct dpu_hw_ctl *ctl,
 static void dpu_encoder_prep_dsc(struct dpu_encoder_virt *dpu_enc,
 				 struct drm_dsc_config *dsc)
 {
-	/* coding only for 2LM, 2enc, 1 dsc config */
 	struct dpu_encoder_phys *enc_master = dpu_enc->cur_master;
 	struct dpu_hw_ctl *ctl = enc_master->hw_ctl;
 	struct dpu_hw_dsc *hw_dsc[MAX_CHANNELS_PER_ENC];
 	struct dpu_hw_pingpong *hw_pp[MAX_CHANNELS_PER_ENC];
 	int this_frame_slices;
 	int intf_ip_w, enc_ip_w;
-	int dsc_common_mode;
+	int dsc_common_mode = 0;
 	int pic_width;
 	u32 initial_lines;
 	int i;
+	int num_dsc = 0;
 
 	for (i = 0; i < MAX_CHANNELS_PER_ENC; i++) {
 		hw_pp[i] = dpu_enc->hw_pp[i];
 		hw_dsc[i] = dpu_enc->hw_dsc[i];
 
 		if (!hw_pp[i] || !hw_dsc[i]) {
-			DPU_ERROR_ENC(dpu_enc, "invalid params for DSC\n");
-			return;
+			break;
 		}
+		num_dsc++;
 	}
 
 	dsc_common_mode = 0;
 	pic_width = dsc->pic_width;
 
-	dsc_common_mode = DSC_MODE_MULTIPLEX | DSC_MODE_SPLIT_PANEL;
+	if (num_dsc > 1)
+		dsc_common_mode |= DSC_MODE_SPLIT_PANEL;
+	if (dpu_encoder_use_dsc_merge(&dpu_enc->base))
+		dsc_common_mode |= DSC_MODE_MULTIPLEX;
 	if (enc_master->intf_mode == INTF_MODE_VIDEO)
 		dsc_common_mode |= DSC_MODE_VIDEO;
 
 	this_frame_slices = pic_width / dsc->slice_width;
 	intf_ip_w = this_frame_slices * dsc->slice_width;
 
-	/*
-	 * dsc merge case: when using 2 encoders for the same stream,
-	 * no. of slices need to be same on both the encoders.
-	 */
-	enc_ip_w = intf_ip_w / 2;
+	enc_ip_w = intf_ip_w / num_dsc;
 	initial_lines = dpu_encoder_dsc_initial_line_calc(dsc, enc_ip_w);
 
-	for (i = 0; i < MAX_CHANNELS_PER_ENC; i++)
+	for (i = 0; i < num_dsc; i++)
 		dpu_encoder_dsc_pipe_cfg(ctl, hw_dsc[i], hw_pp[i],
 					 dsc, dsc_common_mode, initial_lines);
 }
@@ -2015,7 +2013,6 @@ static void dpu_encoder_dsc_pipe_clr(struct dpu_hw_ctl *ctl,
 
 static void dpu_encoder_unprep_dsc(struct dpu_encoder_virt *dpu_enc)
 {
-	/* coding only for 2LM, 2enc, 1 dsc config */
 	struct dpu_encoder_phys *enc_master = dpu_enc->cur_master;
 	struct dpu_hw_ctl *ctl = enc_master->hw_ctl;
 	struct dpu_hw_dsc *hw_dsc[MAX_CHANNELS_PER_ENC];

From 4beb1292cc46d1410dbebf3d154682d3592eadaf Mon Sep 17 00:00:00 2001
From: Jens Reidel <adrian@travitia.xyz>
Date: Fri, 10 Nov 2023 22:03:37 +0100
Subject: [PATCH 674/707] arm64: dts: qcom: sm8450: Add clock-names to mdss

Signed-off-by: Jens Reidel <adrian@travitia.xyz>
---
 arch/arm64/boot/dts/qcom/sm8450.dtsi | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/arm64/boot/dts/qcom/sm8450.dtsi b/arch/arm64/boot/dts/qcom/sm8450.dtsi
index c60ffe917b44b4..ba9f4a32bcd6c9 100644
--- a/arch/arm64/boot/dts/qcom/sm8450.dtsi
+++ b/arch/arm64/boot/dts/qcom/sm8450.dtsi
@@ -2962,6 +2962,10 @@
 				 <&gcc GCC_DISP_HF_AXI_CLK>,
 				 <&gcc GCC_DISP_SF_AXI_CLK>,
 				 <&dispcc DISP_CC_MDSS_MDP_CLK>;
+			clock-names = "iface_clk",
+			          "gcc_bus",
+			          "gcc_nrt_bus",
+			          "branch_clk";
 
 			interrupts = <GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>;
 			interrupt-controller;

From 84e42363ef18d5a793a373ccf536647252f318b6 Mon Sep 17 00:00:00 2001
From: Jens Reidel <adrian@travitia.xyz>
Date: Thu, 7 Sep 2023 20:55:48 +0200
Subject: [PATCH 675/707] arm64: configs: Add sm8450 defconfig

Signed-off-by: Jens Reidel <adrian@travitia.xyz>
---
 arch/arm64/configs/sm8450.config | 742 +++++++++++++++++++++++++++++++
 1 file changed, 742 insertions(+)
 create mode 100644 arch/arm64/configs/sm8450.config

diff --git a/arch/arm64/configs/sm8450.config b/arch/arm64/configs/sm8450.config
new file mode 100644
index 00000000000000..1f336f1f3b46f9
--- /dev/null
+++ b/arch/arm64/configs/sm8450.config
@@ -0,0 +1,742 @@
+# Qualcomm Snapdragon SM8450 config fragment
+CONFIG_LOCALVERSION="-sm8450"
+# CONFIG_LOCALVERSION_AUTO is not set
+
+# Common for SM8450 devices
+# nothing yet
+
+# Xiaomi 12 (Cupid)
+CONFIG_DRM_PANEL_XIAOMI_42_02_0A=y
+
+# SOC
+CONFIG_NR_CPUS=8
+CONFIG_SCHED_CLUSTER=y
+CONFIG_SCSI_UFS_QCOM=y
+CONFIG_QCOM_LLCC=y
+CONFIG_QCOM_OCMEM=y
+CONFIG_QCOM_RMTFS_MEM=y
+CONFIG_QCOM_SOCINFO=y
+CONFIG_QCOM_APR=y
+CONFIG_POWER_RESET_QCOM_PON=y
+CONFIG_QCOM_SPMI_TEMP_ALARM=y
+CONFIG_QCOM_SPMI_ADC_TM5=y
+CONFIG_QCOM_GPI_DMA=y
+
+# SM8450 SOC
+CONFIG_INTERCONNECT_QCOM_OSM_L3=y
+CONFIG_INTERCONNECT_QCOM_SM8450=y
+CONFIG_PINCTRL_LPASS_LPI=y
+CONFIG_PINCTRL_SM8450=y
+CONFIG_PINCTRL_SM8450_LPASS_LPI=y
+CONFIG_SM_CAMCC_8450=y
+CONFIG_SM_DISPCC_8450=y
+CONFIG_SM_GCC_8450=y
+CONFIG_SM_GPUCC_8450=y
+CONFIG_SM_VIDEOCC_8450=y
+
+# Sound
+CONFIG_SOUNDWIRE=y
+CONFIG_SOUNDWIRE_QCOM=y
+CONFIG_SND_SOC_CS35L41_I2C=y
+CONFIG_SND_SOC_WCD938X_SDW=y
+CONFIG_SND_SOC_LPASS_RX_MACRO=y
+CONFIG_SND_SOC_LPASS_TX_MACRO=y
+CONFIG_SND_SOC_LPASS_VA_MACRO=y
+CONFIG_SND_SOC_SC8280XP=y
+CONFIG_SND_SOC_QCOM=y
+
+# Remoteproc
+CONFIG_SLIMBUS=y
+CONFIG_SLIM_QCOM_CTRL=y
+CONFIG_SLIM_QCOM_NGD_CTRL=y
+CONFIG_REMOTEPROC_CDEV=y
+
+# Graphics
+CONFIG_DRM=y
+CONFIG_DRM_DISPLAY_HELPER=y
+CONFIG_DRM_MSM=y
+CONFIG_BACKLIGHT_CLASS_DEVICE=y
+
+# Power management
+CONFIG_PM_AUTOSLEEP=y
+CONFIG_PM_WAKELOCKS=y
+
+# Misc useful things
+CONFIG_SCSI_SCAN_ASYNC=y
+
+# Needed for mounting userdata on android
+CONFIG_QFMT_V2=y
+CONFIG_PSTORE=y
+CONFIG_PSTORE_CONSOLE=y
+CONFIG_PSTORE_RAM=y
+
+# HID/Input
+CONFIG_HID_GENERIC=m
+CONFIG_UHID=m
+CONFIG_USB_HID=m
+CONFIG_INPUT_EVDEV=y
+CONFIG_BT_HIDP=m
+CONFIG_INPUT_JOYDEV=m
+
+# USB
+CONFIG_USB=y
+CONFIG_USB_CONFIGFS=y
+CONFIG_USB_CONFIGFS_F_HID=y
+CONFIG_USB_ETH_RNDIS=y
+
+# GENI
+CONFIG_I2C_QCOM_GENI=y
+
+CONFIG_LEDS_TRIGGER_ONESHOT=y
+CONFIG_LEDS_TRIGGER_BACKLIGHT=y
+CONFIG_LEDS_TRIGGER_ACTIVITY=y
+
+CONFIG_RPMSG_CHAR=y
+CONFIG_RPMSG_QCOM_GLINK_SMEM=y
+# Always load RFCOMM and BNEP as modules so they initialize properly
+CONFIG_BT_RFCOMM=m
+CONFIG_BT_RFCOMM_TTY=y
+CONFIG_BT_BNEP=m
+CONFIG_BT_BNEP_MC_FILTER=y
+CONFIG_BT_BNEP_PROTO_FILTER=y
+CONFIG_BT_HS=y
+CONFIG_BT_LE=y
+CONFIG_HID_BATTERY_STRENGTH=y
+CONFIG_HIDRAW=y
+CONFIG_QCOM_FASTRPC=m
+CONFIG_QCOM_PMIC_GLINK=y
+CONFIG_QCOM_SPMI_ADC5=y
+CONFIG_PHY_QCOM_PCIE2=y
+CONFIG_PHY_QCOM_QMP=y
+CONFIG_PHY_QCOM_QMP_COMBO=y
+CONFIG_PHY_QCOM_QMP_UFS=y
+CONFIG_PHY_QCOM_QMP_USB=y
+CONFIG_PHY_QCOM_QMP_PCIE=y
+CONFIG_PHY_QCOM_QMP_PCIE_8996=y
+CONFIG_PHY_QCOM_QUSB2=y
+CONFIG_PHY_QCOM_USB_HS_28NM=y
+CONFIG_PHY_QCOM_USB_SNPS_FEMTO_V2=y
+CONFIG_PHY_QCOM_USB_HS=y
+CONFIG_PHY_QCOM_SNPS_EUSB2=y
+CONFIG_PHY_QCOM_EUSB2_REPEATER=y
+CONFIG_PHY_QCOM_USB_HSIC=y
+CONFIG_PHY_QCOM_USB_SS=y
+CONFIG_LEDS_CLASS_FLASH=y
+CONFIG_TCP_CONG_ADVANCED=y
+CONFIG_TCP_CONG_WESTWOOD=y
+CONFIG_DEFAULT_WESTWOOD=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=8192
+CONFIG_F2FS_FS=y
+CONFIG_NLS_UTF8=y
+CONFIG_NLS_DEFAULT="utf8"
+CONFIG_FAT_DEFAULT_UTF8=y
+CONFIG_CPU_FREQ_GOV_POWERSAVE=y
+CONFIG_SYN_COOKIES=y
+CONFIG_UEVENT_HELPER=y
+CONFIG_INPUT_UINPUT=m
+CONFIG_PSTORE_PMSG=y
+CONFIG_TYPEC=y
+CONFIG_TYPEC_TCPM=y
+CONFIG_TYPEC_QCOM_PMIC=y
+CONFIG_TYPEC_UCSI=y
+CONFIG_UCSI_PMIC_GLINK=y
+CONFIG_TYPEC_MUX_FSA4480=y
+CONFIG_TYPEC_MUX_GPIO_SBU=y
+CONFIG_TYPEC_MUX_NB7VPQ904M=y
+CONFIG_TYPEC_DP_ALTMODE=y
+CONFIG_U_SERIAL_CONSOLE=y
+CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
+
+# Anbox
+CONFIG_BRIDGE_NETFILTER=y
+CONFIG_CHECKPOINT_RESTORE=y
+CONFIG_PACKET_DIAG=y
+CONFIG_UNIX_DIAG=y
+CONFIG_NETLINK_DIAG=m
+CONFIG_NETFILTER_XT_MATCH_COMMENT=m
+CONFIG_ANDROID=y
+CONFIG_ANDROID_BINDER_IPC=y
+CONFIG_SQUASHFS_XATTR=y
+CONFIG_SQUASHFS_LZ4=y
+CONFIG_SQUASHFS_LZO=y
+CONFIG_SQUASHFS_XZ=y
+
+# Waydroid
+CONFIG_PSI=y
+
+# WLAN debugging
+CONFIG_ATH10K_DEBUG=y
+CONFIG_ATH10K_DEBUGFS=y
+CONFIG_ATH10K_SPECTRAL=y
+
+CONFIG_NET_SCH_HTB=y
+CONFIG_NET_SCH_PRIO=y
+CONFIG_NET_SCH_MULTIQ=y
+
+# Debugging stuff
+CONFIG_STACKTRACE=y
+
+#pmOS Related
+CONFIG_VT=y
+CONFIG_CRYPTO_XTS=y
+CONFIG_TMPFS_XATTR=y
+CONFIG_DM_CRYPT=y
+CONFIG_BINFMT_MISC=m
+
+CONFIG_NF_TABLES=m
+CONFIG_NF_TABLES_INET=y
+CONFIG_NFT_CT=m
+CONFIG_NFT_COUNTER=m
+CONFIG_NFT_LOG=m
+CONFIG_NFT_LIMIT=m
+CONFIG_NFT_MASQ=m
+CONFIG_NFT_NAT=m
+CONFIG_NFT_REJECT=m
+CONFIG_NF_TABLES_IPV4=y
+CONFIG_NF_TABLES_IPV6=y
+
+CONFIG_WIREGUARD=m
+CONFIG_DRM_GUD=m
+
+# pmos containers kconfig
+CONFIG_CGROUP_FREEZER=y
+CONFIG_NETFILTER_XT_MATCH_IPVS=m
+CONFIG_NETFILTER_XT_MARK=m
+CONFIG_DUMMY=m
+CONFIG_BLK_DEV_THROTTLING=y
+CONFIG_NET_CLS_CGROUP=m
+CONFIG_RT_GROUP_SCHED=y
+CONFIG_IP_NF_TARGET_REDIRECT=m
+CONFIG_IP_VS=m
+CONFIG_IP_VS_NFCT=y
+CONFIG_IP_VS_PROTO_TCP=y
+CONFIG_IP_VS_PROTO_UDP=y
+CONFIG_IP_VS_RR=y
+CONFIG_EXT4_FS_SECURITY=y
+CONFIG_CFS_BANDWIDTH=y
+CONFIG_DM_THIN_PROVISIONING=y
+CONFIG_VXLAN=m
+CONFIG_CGROUP_NET_PRIO=y
+CONFIG_IPVLAN=m
+
+# pmOS ZRAM kconfig
+CONFIG_ZSMALLOC=m
+CONFIG_ZSMALLOC_STAT=y
+CONFIG_ZRAM=m
+CONFIG_ZRAM_MEMORY_TRACKING=y
+CONFIG_CRYPTO_LZ4=m
+CONFIG_LZ4_COMPRESS=m
+CONFIG_CRYPTO_LZO=m
+CONFIG_CRYPTO_ZSTD=m
+
+# pmOS iwd kconfig
+CONFIG_CRYPTO_USER_API_HASH=m
+CONFIG_CRYPTO_USER_API_SKCIPHER=m
+CONFIG_KEY_DH_OPERATIONS=y
+CONFIG_CRYPTO_KPP=y
+CONFIG_PKCS8_PRIVATE_KEY_PARSER=y
+
+# LEDs
+CONFIG_NEW_LEDS=y
+CONFIG_LEDS_CLASS=y
+CONFIG_LEDS_QCOM_FLASH=y
+
+#Sony PlayStation controllers
+CONFIG_HID_SONY=m
+CONFIG_SONY_FF=y
+
+# Disable all unrelated stuffs afaik
+CONFIG_ACPI=n
+CONFIG_VIRTUALIZATION=n
+CONFIG_PSTORE_DEFLATE_COMPRESS=n
+CONFIG_HIBERNATION=n
+CONFIG_FW_LOADER_USER_HELPER=n
+CONFIG_FW_LOADER_USER_HELPER_FALLBACK=n
+CONFIG_BLK_DEV_NVME=n
+CONFIG_ATA=n
+CONFIG_MTD=n
+CONFIG_SRAM=n
+CONFIG_MEGARAID_SAS=n
+CONFIG_EEPROM_AT25=n
+CONFIG_SCSI_MPT3SAS=n
+CONFIG_BLK_DEV_MD=n
+CONFIG_DM_MIRROR=n
+CONFIG_DM_ZERO=n
+CONFIG_EXT2_FS=n
+CONFIG_EXT3_FS=n
+CONFIG_BTRFS_FS=n
+CONFIG_USB_DWC2=n
+CONFIG_USB_CHIPIDEA=n
+CONFIG_USB_MUSB_HDRC=n
+CONFIG_USB_ISP1760=n
+CONFIG_USB_HSIC_USB3503=n
+CONFIG_USB_NET_PLUSB=n
+CONFIG_TYPEC_FUSB302=n
+CONFIG_EXTCON_PTN5150=n
+CONFIG_REALTEK_PHY=n
+CONFIG_NET_VENDOR_NI=n
+CONFIG_NET_9P=n
+CONFIG_CAN=n
+CONFIG_BNX2X=n
+CONFIG_MACB=n
+CONFIG_IGB=n
+CONFIG_IGBVF=n
+CONFIG_SMC91X=n
+CONFIG_MLX4_EN=n
+CONFIG_MLX5_CORE=n
+CONFIG_STMMAC_ETH=n
+CONFIG_ATL1C=n
+CONFIG_BRCMFMAC=n
+CONFIG_WL18XX=n
+CONFIG_WLCORE=n
+CONFIG_ATH10K_PCI=n
+CONFIG_NET_SCH_CBS=n
+CONFIG_NET_SCH_ETF=n
+CONFIG_NET_SCH_TAPRIO=n
+CONFIG_NET_SCH_MQPRIO=n
+CONFIG_NET_CLS_BASIC=n
+CONFIG_NET_CLS_FLOWER=n
+CONFIG_NET_CLS_ACT=n
+CONFIG_NET_ACT_GACT=n
+CONFIG_NET_ACT_MIRRED=n
+CONFIG_NET_ACT_GATE=n
+CONFIG_MDIO_BUS_MUX_MMIOREG=n
+CONFIG_MDIO_BUS_MUX_MULTIPLEXER=n
+CONFIG_GPIO_DWAPB=n
+CONFIG_COMMON_CLK_XGENE=n
+CONFIG_SENSORS_ARM_SCPI=n
+CONFIG_TCG_TPM=n
+CONFIG_BATTERY_SBS=n
+CONFIG_REGULATOR_VCTRL=n
+CONFIG_THUNDER_NIC_BGX=n
+CONFIG_THUNDER_NIC_RGX=n
+CONFIG_MDIO_THUNDER=n
+CONFIG_HW_RANDOM_CAVIUM=n
+CONFIG_EEPROM_AT24=n
+CONFIG_NET_DSA=n
+CONFIG_VITESSE_PHY=n
+CONFIG_SENSORS_LM90=n
+CONFIG_SENSORS_INA2XX=n
+CONFIG_SENSORS_ISL29018=n
+CONFIG_PCI_PASID=n
+CONFIG_UACCE=n
+CONFIG_NOP_USB_XCEIV=n
+CONFIG_SURFACE_PLATFORMS=n
+CONFIG_SENSORS_LM75=n
+CONFIG_SENSORS_PWM_FAN=n
+CONFIG_SENSORS_INA3221=n
+CONFIG_USB_CONN_GPIO=n
+CONFIG_MICREL_PHY=n
+CONFIG_COMMON_CLK_VC5=n
+CONFIG_CRYPTO_DEV_CCREE=n
+CONFIG_SND_SIMPLE_CARD=n
+CONFIG_SND_SIMPLE_CARD_UTILS=n
+CONFIG_SND_AUDIO_GRAPH_CARD=n
+CONFIG_TYPEC_HD3SS3220=n
+CONFIG_COMMON_CLK_CS2000_CP=n
+CONFIG_PL330_DMA=n
+CONFIG_NET_VENDOR_SOCIONEXT=n
+
+# Disable MII PHY device drivers
+CONFIG_AQUANTIA_PHY=n
+CONFIG_MICROSEMI_PHY=n
+
+# Disable Multiplexer I2C Chip support
+CONFIG_I2C_MUX_PCA954x=n
+
+# Disable pressure sensors
+CONFIG_MPL3115=n
+
+# Disable Display Panels
+CONFIG_DRM_PANEL_BOE_TV101WUM_NL6=n
+CONFIG_DRM_PANEL_LVDS=n
+CONFIG_DRM_PANEL_MANTIX_MLAF057WE51=n
+CONFIG_DRM_PANEL_RAYDIUM_RM67191=n
+CONFIG_DRM_PANEL_SITRONIX_ST7703=n
+CONFIG_DRM_PANEL_TRULY_NT35597_WQXGA=n
+CONFIG_DRM_PANEL_VISIONOX_VTDR6130=n
+
+# Disable I2C GPIO expanders
+CONFIG_GPIO_MAX732X=n
+CONFIG_GPIO_PCA953X=n
+
+# Disable Backlight & LCD device support
+CONFIG_BACKLIGHT_LP855X=n
+
+# Disable IR I2C driver auto-selected by 'Autoselect ancillary drivers'
+CONFIG_VIDEO_IMX219=n
+CONFIG_VIDEO_IMX412=n
+CONFIG_VIDEO_OV5640=n
+CONFIG_VIDEO_OV5645=n
+
+# Disable Common SoC Audio options for Freescale CPUs:
+CONFIG_SND_SOC_FSL_ASRC=n
+CONFIG_SND_SOC_FSL_SAI=n
+CONFIG_SND_SOC_FSL_AUDMIX=n
+CONFIG_SND_SOC_FSL_SPDIF=n
+CONFIG_SND_SOC_FSL_MICFIL=n
+
+# Disable Input Device Drivers
+CONFIG_KEYBOARD_ADC=n
+CONFIG_MOUSE_ELAN_I2C=n
+CONFIG_TOUCHSCREEN_ATMEL_MXT=n
+CONFIG_TOUCHSCREEN_GOODIX=n
+CONFIG_TOUCHSCREEN_ELAN=n
+CONFIG_TOUCHSCREEN_EDT_FT5X06=n
+
+# Disable I2C RTC drivers
+CONFIG_RTC_DRV_DS1307=n
+CONFIG_RTC_DRV_RK808=n
+CONFIG_RTC_DRV_HYM8563=n
+CONFIG_RTC_DRV_ISL1208=n
+CONFIG_RTC_DRV_PCF85063=n
+CONFIG_RTC_DRV_PCF85363=n
+CONFIG_RTC_DRV_M41T80=n
+CONFIG_RTC_DRV_BQ32K=n
+CONFIG_RTC_DRV_RX8581=n
+CONFIG_RTC_DRV_RV3028=n
+CONFIG_RTC_DRV_RV8803=n
+
+# Disable SPI and I2C RTC drivers
+CONFIG_RTC_DRV_DS3232=n
+CONFIG_RTC_DRV_PCF2127=n
+
+# Disable on-CPU RTC drivers
+CONFIG_RTC_DRV_PL031=n
+
+# Disable ARM errata workarounds via the alternatives framework
+CONFIG_CAVIUM_ERRATUM_22375=n
+CONFIG_CAVIUM_ERRATUM_23144=n
+CONFIG_CAVIUM_ERRATUM_23154=n
+CONFIG_CAVIUM_ERRATUM_27456=n
+CONFIG_CAVIUM_ERRATUM_30115=n
+CONFIG_CAVIUM_TX2_ERRATUM_219=n
+CONFIG_FUJITSU_ERRATUM_010001=n
+CONFIG_HISILICON_ERRATUM_161600802=n
+CONFIG_NVIDIA_CARMEL_CNP_ERRATUM=n
+CONFIG_ROCKCHIP_ERRATUM_3588001=n
+CONFIG_SOCIONEXT_SYNQUACER_PREITS=n
+
+# Disable platforms
+CONFIG_ARCH_ACTIONS=n
+CONFIG_ARCH_SUNXI=n
+CONFIG_ARCH_ALPINE=n
+CONFIG_ARCH_APPLE=n
+CONFIG_ARCH_BCM=n
+CONFIG_ARCH_BCM2835=n
+CONFIG_ARCH_BCM_IPROC=n
+CONFIG_ARCH_BCMBCA=n
+CONFIG_ARCH_BRCMSTB=n
+CONFIG_ARCH_BERLIN=n
+CONFIG_ARCH_BITMAIN=n
+CONFIG_ARCH_EXYNOS=n
+CONFIG_ARCH_SPARX5=n
+CONFIG_ARCH_K3=n
+CONFIG_ARCH_LG1K=n
+CONFIG_ARCH_HISI=n
+CONFIG_ARCH_KEEMBAY=n
+CONFIG_ARCH_MEDIATEK=n
+CONFIG_ARCH_MESON=n
+CONFIG_ARCH_MVEBU=n
+CONFIG_ARCH_NXP=n
+CONFIG_ARCH_LAYERSCAPE=n
+CONFIG_ARCH_MXC=n
+CONFIG_ARCH_S32=n
+CONFIG_ARCH_MA35=n
+CONFIG_ARCH_NPCM=n
+CONFIG_ARCH_REALTEK=n
+CONFIG_ARCH_RENESAS=n
+CONFIG_ARCH_ROCKCHIP=n
+CONFIG_ARCH_SEATTLE=n
+CONFIG_ARCH_INTEL_SOCFPGA=n
+CONFIG_ARCH_STM32=n
+CONFIG_ARCH_SYNQUACER=n
+CONFIG_ARCH_TEGRA=n
+CONFIG_ARCH_SPRD=n
+CONFIG_ARCH_THUNDER=n
+CONFIG_ARCH_THUNDER2=n
+CONFIG_ARCH_UNIPHIER=n
+CONFIG_ARCH_VEXPRESS=n
+CONFIG_ARCH_VISCONTI=n
+CONFIG_ARCH_XGENE=n
+CONFIG_ARCH_ZYNQMP=n
+
+# Disable PCI controller drivers
+CONFIG_PCIE_ALTERA=n
+CONFIG_PCI_HOST_THUNDER_PEM=n
+CONFIG_PCI_HOST_THUNDER_ECAM=n
+CONFIG_PCI_XGENE=n
+
+# Disable DesignWare-based PCIe controllers
+CONFIG_PCI_MESON=n
+CONFIG_PCI_HISI=n
+CONFIG_PCIE_KIRIN=n
+
+CONFIG_HIX5HD2_GMAC=n
+CONFIG_HNS_DSAF=n
+CONFIG_HNS_ENET=n
+CONFIG_HNS3=n
+
+# Disable serial drivers
+CONFIG_SERIAL_XILINX_PS_UART=n
+CONFIG_SERIAL_FSL_LPUART=n
+CONFIG_SERIAL_FSL_LINFLEXUART=n
+
+# Disable I2C system bus drivers (mostly embedded / system-on-chip)
+CONFIG_I2C_DESIGNWARE_PLATFORM=n
+CONFIG_I2C_RK3X=n
+
+# Disable SPI Master Controller Drivers
+CONFIG_SPI_CADENCE_QUADSPI=n
+CONFIG_SPI_DESIGNWARE=n
+CONFIG_SPI_PL022=n
+
+# Disable pinctrls
+CONFIG_PINCTRL_RK805=n
+CONFIG_PINCTRL_IPQ5018=n
+CONFIG_PINCTRL_IPQ5332=n
+CONFIG_PINCTRL_IPQ8074=n
+CONFIG_PINCTRL_IPQ6018=n
+CONFIG_PINCTRL_IPQ9574=n
+CONFIG_PINCTRL_MSM8916=n
+CONFIG_PINCTRL_MSM8953=n
+CONFIG_PINCTRL_MSM8976=n
+CONFIG_PINCTRL_MSM8994=n
+CONFIG_PINCTRL_MSM8996=n
+CONFIG_PINCTRL_MSM8998=n
+CONFIG_PINCTRL_QCM2290=n
+CONFIG_PINCTRL_QCS404=n
+CONFIG_PINCTRL_QDU1000=n
+CONFIG_PINCTRL_SA8775P=n
+CONFIG_PINCTRL_SC7180=n
+CONFIG_PINCTRL_SC7280=n
+CONFIG_PINCTRL_SC8180X=n
+CONFIG_PINCTRL_SC8280XP=n
+CONFIG_PINCTRL_SDM660=n
+CONFIG_PINCTRL_SDM670=n
+CONFIG_PINCTRL_SDM845=n
+CONFIG_PINCTRL_SM6115=n
+CONFIG_PINCTRL_SM6125=n
+CONFIG_PINCTRL_SM6350=n
+CONFIG_PINCTRL_SM6375=n
+CONFIG_PINCTRL_SM8150=n
+CONFIG_PINCTRL_SM8250=n
+CONFIG_PINCTRL_SM8350=n
+CONFIG_PINCTRL_SM8550=n
+CONFIG_PINCTRL_SC7280_LPASS_LPI=n
+CONFIG_PINCTRL_SM8250_LPASS_LPI=n
+CONFIG_PINCTRL_SC8280XP_LPASS_LPI=n
+CONFIG_PINCTRL_SM8550_LPASS_LPI=n
+
+# Disable clock drivers
+CONFIG_QCOM_CLK_APCS_MSM8916=n
+CONFIG_QCOM_CLK_APCC_MSM8996=n
+CONFIG_IPQ_GCC_5018=n
+CONFIG_IPQ_GCC_5332=n
+CONFIG_IPQ_GCC_6018=n
+CONFIG_IPQ_GCC_8074=n
+CONFIG_IPQ_GCC_9574=n
+CONFIG_MSM_GCC_8916=n
+CONFIG_MSM_MMCC_8994=n
+CONFIG_MSM_GCC_8994=n
+CONFIG_MSM_GCC_8996=n
+CONFIG_MSM_MMCC_8996=n
+CONFIG_MSM_GCC_8998=n
+CONFIG_MSM_MMCC_8998=n
+CONFIG_QCM_GCC_2290=n
+CONFIG_QCM_DISPCC_2290=n
+CONFIG_QCS_GCC_404=n
+CONFIG_SC_DISPCC_8280XP=n
+CONFIG_SA_GCC_8775P=n
+CONFIG_SA_GPUCC_8775P=n
+CONFIG_SC_GCC_7180=n
+CONFIG_SC_GCC_7280=n
+CONFIG_SC_GCC_8180X=n
+CONFIG_SC_GCC_8280XP=n
+CONFIG_SC_GPUCC_7180=n
+CONFIG_SC_GPUCC_8280XP=n
+CONFIG_SC_LPASSCC_8280XP=n
+CONFIG_SDM_CAMCC_845=n
+CONFIG_SDM_GCC_845=n
+CONFIG_SDM_GPUCC_845=n
+CONFIG_SDM_VIDEOCC_845=n
+CONFIG_SDM_DISPCC_845=n
+CONFIG_SDM_LPASSCC_845=n
+CONFIG_SM_CAMCC_8250=n
+CONFIG_SM_DISPCC_6115=n
+CONFIG_SM_DISPCC_8250=n
+CONFIG_SM_DISPCC_8550=n
+CONFIG_SM_GCC_6115=n
+CONFIG_SM_GCC_8150=n
+CONFIG_SM_GCC_8250=n
+CONFIG_SM_GCC_8350=n
+CONFIG_SM_GCC_8550=n
+CONFIG_SM_GPUCC_6115=n
+CONFIG_SM_GPUCC_8150=n
+CONFIG_SM_GPUCC_8250=n
+CONFIG_SM_TCSRCC_8550=n
+CONFIG_SM_VIDEOCC_8250=n
+CONFIG_CLK_GFM_LPASS_SM8250=n
+
+# Disable interconnect drivers
+CONFIG_INTERCONNECT_QCOM_MSM8916=n
+CONFIG_INTERCONNECT_QCOM_MSM8996=n
+CONFIG_INTERCONNECT_QCOM_QCM2290=n
+CONFIG_INTERCONNECT_QCOM_QCS404=n
+CONFIG_INTERCONNECT_QCOM_SA8775P=n
+CONFIG_INTERCONNECT_QCOM_SC7180=n
+CONFIG_INTERCONNECT_QCOM_SC7280=n
+CONFIG_INTERCONNECT_QCOM_SC8180X=n
+CONFIG_INTERCONNECT_QCOM_SC8280XP=n
+CONFIG_INTERCONNECT_QCOM_SDM845=n
+CONFIG_INTERCONNECT_QCOM_SM8150=n
+CONFIG_INTERCONNECT_QCOM_SM8250=n
+CONFIG_INTERCONNECT_QCOM_SM8350=n
+CONFIG_INTERCONNECT_QCOM_SM8550=n
+
+# Disable memory mapped GPIO drivers
+CONFIG_GPIO_ALTERA=n
+CONFIG_GPIO_MB86S7X=n
+CONFIG_GPIO_PL061=n
+CONFIG_GPIO_WCD934X=n
+CONFIG_GPIO_XGENE=n
+
+# Disable Virtual GPIO drivers
+CONFIG_POWER_RESET_XGENE=n
+CONFIG_POWER_RESET_SYSCON=n
+CONFIG_GNSS_MTK_SERIAL=n
+
+# Disable Watchdog Device Drivers
+CONFIG_ARM_SP805_WATCHDOG=n
+CONFIG_ARM_SBSA_WATCHDOG=n
+CONFIG_DW_WATCHDOG=n
+CONFIG_ARM_SMC_WATCHDOG=n
+
+# Disable Multifunction device drivers
+CONFIG_MFD_BD9571MWV=n
+CONFIG_MFD_AXP20X_I2C=n
+CONFIG_MFD_HI6421_PMIC=n
+CONFIG_MFD_MAX77620=n
+CONFIG_MFD_MT6360=n
+CONFIG_MFD_MT6397=n
+CONFIG_MFD_RK8XX=n
+CONFIG_MFD_SEC_CORE=n
+CONFIG_MFD_TI_AM335X_TSCADC=n
+CONFIG_MFD_TPS65219=n
+CONFIG_MFD_WM8994=n
+CONFIG_MFD_ROHM_BD718XX=n
+CONFIG_MFD_WCD934X=n
+
+CONFIG_REGULATOR_FAN53555=n
+CONFIG_REGULATOR_MAX8973=n
+CONFIG_REGULATOR_MP8859=n
+CONFIG_REGULATOR_MT6315=n
+CONFIG_REGULATOR_MT6360=n
+CONFIG_REGULATOR_PCA9450=n
+CONFIG_REGULATOR_PF8X00=n
+CONFIG_REGULATOR_PFUZE100=n
+CONFIG_REGULATOR_RAA215300=n
+CONFIG_REGULATOR_RK808=n
+CONFIG_REGULATOR_TPS65132=n
+CONFIG_REGULATOR_TPS65219=n
+
+# Disable media device types
+CONFIG_MEDIA_ANALOG_TV_SUPPORT=n
+CONFIG_MEDIA_DIGITAL_TV_SUPPORT=n
+CONFIG_MEDIA_SDR_SUPPORT=n
+
+# Disable I2C encoder or helper chips
+CONFIG_DRM_I2C_NXP_TDA998X=n
+
+# Disable ARM devices
+CONFIG_DRM_MALI_DISPLAY=n
+CONFIG_DRM_KOMEDA=n
+
+# Disable DRM Drivers
+CONFIG_DRM_NOUVEAU=n
+
+# Disable display Interface Bridges
+CONFIG_DRM_LONTIUM_LT8912B=n
+CONFIG_DRM_LONTIUM_LT9611=n
+CONFIG_DRM_LONTIUM_LT9611UXC=n
+CONFIG_DRM_ITE_IT66121=n
+CONFIG_DRM_NWL_MIPI_DSI=n
+CONFIG_DRM_PARADE_PS8640=n
+CONFIG_DRM_SAMSUNG_DSIM=n
+CONFIG_DRM_SII902X=n
+CONFIG_DRM_THINE_THC63LVD1024=n
+CONFIG_DRM_TOSHIBA_TC358768=n
+CONFIG_DRM_TI_TFP410=n
+CONFIG_DRM_TI_SN65DSI83=n
+CONFIG_DRM_TI_SN65DSI86=n
+CONFIG_DRM_ANALOGIX_ANX7625=n
+CONFIG_DRM_I2C_ADV7511=n
+CONFIG_DRM_CDNS_MHDP8546=n
+
+CONFIG_DRM_ETNAVIV=n
+CONFIG_DRM_HISI_HIBMC=n
+CONFIG_DRM_HISI_KIRIN=n
+CONFIG_DRM_PL111=n
+CONFIG_DRM_LIMA=n
+CONFIG_DRM_PANFROST=n
+
+# Disable CODEC drivers
+CONFIG_SND_SOC_ADAU7002=n
+CONFIG_SND_SOC_AK4613=n
+CONFIG_SND_SOC_DA7213=n
+CONFIG_SND_SOC_ES7134=n
+CONFIG_SND_SOC_ES7241=n
+CONFIG_SND_SOC_ES8316=n
+CONFIG_SND_SOC_GTM601=n
+CONFIG_SND_SOC_MAX98357A=n
+CONFIG_SND_SOC_MAX98927=n
+CONFIG_SND_SOC_MSM8916_WCD_ANALOG=n
+CONFIG_SND_SOC_MSM8916_WCD_DIGITAL=n
+CONFIG_SND_SOC_PCM3168A_I2C=n
+CONFIG_SND_SOC_RK817=n
+CONFIG_SND_SOC_RL6231=n
+CONFIG_SND_SOC_RT5640=n
+CONFIG_SND_SOC_RT5659=n
+CONFIG_SND_SOC_RT5663=n
+CONFIG_SND_SOC_RT5682=n
+CONFIG_SND_SOC_SGTL5000=n
+CONFIG_SND_SOC_TAS2552=n
+CONFIG_SND_SOC_TAS571X=n
+CONFIG_SND_SOC_TLV320AIC31XX=n
+CONFIG_SND_SOC_TLV320AIC32X4=n
+CONFIG_SND_SOC_TLV320AIC3X=n
+CONFIG_SND_SOC_TS3A227E=n
+CONFIG_SND_SOC_WCD9335=n
+CONFIG_SND_SOC_WCD934X=n
+CONFIG_SND_SOC_WM8524=n
+CONFIG_SND_SOC_WM8904=n
+CONFIG_SND_SOC_WM8960=n
+CONFIG_SND_SOC_WM8962=n
+CONFIG_SND_SOC_WM8978=n
+CONFIG_SND_SOC_WSA881X=n
+CONFIG_SND_SOC_MT6358=n
+CONFIG_SND_SOC_NAU8822=n
+
+# Disable USB Host Controller Drivers
+CONFIG_USB_XHCI_PCI_RENESAS=n
+
+# Disable MMC/SD/SDIO Host Controller Drivers
+CONFIG_MMC_SDHCI_OF_ARASAN=n
+CONFIG_MMC_SDHCI_CADENCE=n
+CONFIG_MMC_SDHCI_F_SDH30=n
+CONFIG_MMC_DW_EXYNOS=n
+CONFIG_MMC_DW_HI3798CV200=n
+CONFIG_MMC_DW_K3=n
+CONFIG_MMC_MTK=n
+CONFIG_MMC_SDHCI_XENON=n
+CONFIG_MMC_SDHCI_AM654=n
+
+CONFIG_FSL_EDMA=n
+CONFIG_MV_XOR_V2=n
+CONFIG_COMMON_CLK_RK808=n
+CONFIG_FSL_RCPM=n
+CONFIG_ARCH_R9A07G044=n
+CONFIG_MAX9611=n
+CONFIG_NVMEM_RMEM=n
+CONFIG_FPGA=n

From f4f0cc4d3a17be9d335f71919cfd7dde5d1aa779 Mon Sep 17 00:00:00 2001
From: Jens Reidel <adrian@travitia.xyz>
Date: Sun, 5 Nov 2023 18:09:10 +0100
Subject: [PATCH 676/707] drm: panel: Generate panel-l3-42-02-0a driver

Signed-off-by: Jens Reidel <adrian@travitia.xyz>
---
 drivers/gpu/drm/panel/Kconfig                 |  10 +
 drivers/gpu/drm/panel/Makefile                |   1 +
 drivers/gpu/drm/panel/panel-l3-42-02-0a-dsc.c | 453 ++++++++++++++++++
 3 files changed, 464 insertions(+)
 create mode 100644 drivers/gpu/drm/panel/panel-l3-42-02-0a-dsc.c

diff --git a/drivers/gpu/drm/panel/Kconfig b/drivers/gpu/drm/panel/Kconfig
index 8f3783742208b6..250841246ae881 100644
--- a/drivers/gpu/drm/panel/Kconfig
+++ b/drivers/gpu/drm/panel/Kconfig
@@ -875,6 +875,16 @@ config DRM_PANEL_WIDECHIPS_WS2401
 	  480x800 display controller used in panels such as Samsung LMS380KF01.
 	  This display is used in the Samsung Galaxy Ace 2 GT-I8160 (Codina).
 
+config DRM_PANEL_XIAOMI_42_02_0A
+	tristate "Xiaomi 42_02_0A panel driver"
+	depends on OF
+	depends on DRM_MIPI_DSI
+	depends on BACKLIGHT_CLASS_DEVICE
+	select VIDEOMODE_HELPERS
+	help
+		Say Y or M here if you want to enable support for the Xiaomi FHD
+		(2400x1080@120Hz) cmd mode panel.
+
 config DRM_PANEL_XINPENG_XPP055C272
 	tristate "Xinpeng XPP055C272 panel driver"
 	depends on OF
diff --git a/drivers/gpu/drm/panel/Makefile b/drivers/gpu/drm/panel/Makefile
index d94a644d0a6cea..471bfcdf590fc7 100644
--- a/drivers/gpu/drm/panel/Makefile
+++ b/drivers/gpu/drm/panel/Makefile
@@ -89,4 +89,5 @@ obj-$(CONFIG_DRM_PANEL_VISIONOX_RM69299) += panel-visionox-rm69299.o
 obj-$(CONFIG_DRM_PANEL_VISIONOX_VTDR6130) += panel-visionox-vtdr6130.o
 obj-$(CONFIG_DRM_PANEL_VISIONOX_R66451) += panel-visionox-r66451.o
 obj-$(CONFIG_DRM_PANEL_WIDECHIPS_WS2401) += panel-widechips-ws2401.o
+obj-$(CONFIG_DRM_PANEL_XIAOMI_42_02_0A) += panel-l3-42-02-0a-dsc.o
 obj-$(CONFIG_DRM_PANEL_XINPENG_XPP055C272) += panel-xinpeng-xpp055c272.o
diff --git a/drivers/gpu/drm/panel/panel-l3-42-02-0a-dsc.c b/drivers/gpu/drm/panel/panel-l3-42-02-0a-dsc.c
new file mode 100644
index 00000000000000..1553e20a7be32e
--- /dev/null
+++ b/drivers/gpu/drm/panel/panel-l3-42-02-0a-dsc.c
@@ -0,0 +1,453 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (c) 2023 FIXME
+// Generated with linux-mdss-dsi-panel-driver-generator from vendor device tree:
+//   Copyright (c) 2013, The Linux Foundation. All rights reserved. (FIXME)
+
+#include <linux/backlight.h>
+#include <linux/delay.h>
+#include <linux/gpio/consumer.h>
+#include <linux/module.h>
+#include <linux/of.h>
+
+#include <video/mipi_display.h>
+
+#include <drm/display/drm_dsc.h>
+#include <drm/display/drm_dsc_helper.h>
+#include <drm/drm_mipi_dsi.h>
+#include <drm/drm_modes.h>
+#include <drm/drm_panel.h>
+#include <drm/drm_probe_helper.h>
+
+struct l3_42_02_0a_dsc {
+	struct drm_panel panel;
+	struct mipi_dsi_device *dsi;
+	struct regulator_bulk_data supplies[3];
+	struct drm_dsc_config dsc;
+	struct gpio_desc *reset_gpio;
+};
+
+static inline
+struct l3_42_02_0a_dsc *to_l3_42_02_0a_dsc(struct drm_panel *panel)
+{
+	return container_of(panel, struct l3_42_02_0a_dsc, panel);
+}
+
+static void l3_42_02_0a_dsc_reset(struct l3_42_02_0a_dsc *ctx)
+{
+	gpiod_set_value_cansleep(ctx->reset_gpio, 0);
+	usleep_range(11000, 12000);
+	gpiod_set_value_cansleep(ctx->reset_gpio, 1);
+	usleep_range(1000, 2000);
+	gpiod_set_value_cansleep(ctx->reset_gpio, 0);
+	usleep_range(11000, 12000);
+}
+
+static int l3_42_02_0a_dsc_on(struct l3_42_02_0a_dsc *ctx)
+{
+	struct mipi_dsi_device *dsi = ctx->dsi;
+	struct device *dev = &dsi->dev;
+	int ret;
+
+	/* VESA ON */
+	mipi_dsi_dcs_write_seq(dsi, 0x90, 0x01);
+	/* VESA Edition */
+	mipi_dsi_dcs_write_seq(dsi, 0x91,
+			       0xab, 0x28, 0x00, 0x0c, 0xc2, 0x00, 0x03, 0x1c,
+			       0x01, 0x7e, 0x00, 0x0f, 0x08, 0xbb, 0x04, 0x3d,
+			       0x10, 0xf0);
+	mipi_dsi_dcs_write_seq(dsi, MIPI_DCS_GET_COMPRESSION_MODE, 0x01);
+	mipi_dsi_dcs_write_seq(dsi, MIPI_DCS_WRITE_MEMORY_START, 0x00);
+
+	ret = mipi_dsi_dcs_set_tear_on(dsi, MIPI_DSI_DCS_TEAR_MODE_VBLANK);
+	if (ret < 0) {
+		dev_err(dev, "Failed to set tear on: %d\n", ret);
+		return ret;
+	}
+
+	/* BACKLIGHT ON */
+	mipi_dsi_dcs_write_seq(dsi, MIPI_DCS_WRITE_CONTROL_DISPLAY, 0x20);
+	mipi_dsi_dcs_write_seq(dsi, MIPI_DCS_SET_DISPLAY_BRIGHTNESS, 0x00, 0x00, 0x00, 0x00);
+
+	ret = mipi_dsi_dcs_set_column_address(dsi, 0x0000, 0x0437);
+	if (ret < 0) {
+		dev_err(dev, "Failed to set column address: %d\n", ret);
+		return ret;
+	}
+
+	ret = mipi_dsi_dcs_set_page_address(dsi, 0x0000, 0x095f);
+	if (ret < 0) {
+		dev_err(dev, "Failed to set page address: %d\n", ret);
+		return ret;
+	}
+
+	/* 60 Hz */
+	mipi_dsi_dcs_write_seq(dsi, 0x2f, 0x01);
+	mipi_dsi_dcs_write_seq(dsi, MIPI_DCS_SET_GAMMA_CURVE, 0x01);
+	/* OSC calibration */
+	mipi_dsi_dcs_write_seq(dsi, 0xf0, 0x55, 0xaa, 0x52, 0x08, 0x01);
+	mipi_dsi_dcs_write_seq(dsi, 0xc3,
+			       0x94, 0x01, 0x8c, 0xd0, 0x22, 0x02, 0x00);
+	/* enter aod with no black */
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x01);
+	mipi_dsi_dcs_write_seq(dsi, 0xd2, 0x00, 0x00, 0x11);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x06);
+	mipi_dsi_dcs_write_seq(dsi, 0xd2, 0x05);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x0f);
+	mipi_dsi_dcs_write_seq(dsi, 0xd2, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x09);
+	mipi_dsi_dcs_write_seq(dsi, 0xd2, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xce, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xff, 0xaa, 0x55, 0xa5, 0x80);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x61);
+	mipi_dsi_dcs_write_seq(dsi, 0xf3, 0x80);
+	mipi_dsi_dcs_write_seq(dsi, 0xf0, 0x55, 0xaa, 0x52, 0x08, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xc0, 0x46);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x0b);
+	mipi_dsi_dcs_write_seq(dsi, 0xb5, 0x23, 0x2b);
+	mipi_dsi_dcs_write_seq(dsi, 0xf0, 0x55, 0xaa, 0x52, 0x08, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x29);
+	mipi_dsi_dcs_write_seq(dsi, 0xd9, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x01);
+	mipi_dsi_dcs_write_seq(dsi, 0xd9, 0x80);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x07);
+	mipi_dsi_dcs_write_seq(dsi, 0xb2, 0x07, 0xff);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x17);
+	mipi_dsi_dcs_write_seq(dsi, 0xb2, 0x07, 0xff);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x1f);
+	mipi_dsi_dcs_write_seq(dsi, 0xb2, 0x00, 0x50);
+	/* round for fpr, corner and cup */
+	mipi_dsi_dcs_write_seq(dsi, 0xf0, 0x55, 0xaa, 0x52, 0x08, 0x07);
+	mipi_dsi_dcs_write_seq(dsi, 0xc9,
+			       0x21, 0x00, 0x27, 0xd9, 0x27, 0xd9, 0x00, 0x00,
+			       0x3f, 0xe0, 0x8e, 0xc6, 0x3f, 0xe0, 0x8e, 0xc6,
+			       0x80, 0x06, 0x33, 0xd5, 0xf1, 0x00, 0x16, 0x13,
+			       0x00, 0x7b, 0x78, 0x33, 0xd0, 0x27, 0xd9, 0x0f,
+			       0x9b, 0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			       0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xca,
+			       0x27, 0x00, 0x27, 0xd9, 0x27, 0xd9, 0x00, 0x00,
+			       0x00, 0x00, 0x00, 0x00, 0x3f, 0xe0, 0x8e, 0xc6,
+			       0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0x13,
+			       0x00, 0xe1, 0x78, 0x33, 0xd0, 0x00, 0x00, 0x03,
+			       0x65, 0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			       0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xcb,
+			       0x2d, 0x00, 0x27, 0xd9, 0x27, 0xd9, 0x00, 0x00,
+			       0x3f, 0xe0, 0x8e, 0xc6, 0x80, 0x00, 0x00, 0x00,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x16, 0x79,
+			       0x00, 0x7b, 0xde, 0x33, 0x70, 0x00, 0x00, 0x0c,
+			       0x9b, 0x65, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			       0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xcc,
+			       0x2b, 0x00, 0x27, 0xd9, 0x27, 0xd9, 0x00, 0x00,
+			       0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
+			       0x7f, 0xf9, 0xcc, 0x2a, 0x0f, 0x00, 0x7c, 0x79,
+			       0x00, 0xe1, 0xde, 0x33, 0x7f, 0xd8, 0x27, 0x00,
+			       0x65, 0x65, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			       0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xc1, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xc2, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xc3, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xc4, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xc5, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xc6, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xc7, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xc8, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xcd, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xce, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xcf, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xd0, 0x00);
+	/* round on */
+	mipi_dsi_dcs_write_seq(dsi, 0xc0, 0x05, 0x02);
+	/* ESD error flag, active high */
+	mipi_dsi_dcs_write_seq(dsi, 0xf0, 0x55, 0xaa, 0x52, 0x08, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x05);
+	mipi_dsi_dcs_write_seq(dsi, 0xbe, 0x0a);
+	/* SRAM not power off at idle mode */
+	mipi_dsi_dcs_write_seq(dsi, 0xf0, 0x55, 0xaa, 0x52, 0x08, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x01);
+	mipi_dsi_dcs_write_seq(dsi, 0xc0, 0xb3);
+	mipi_dsi_dcs_write_seq(dsi, 0xff, 0xaa, 0x55, 0xa5, 0x80);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x2e);
+	mipi_dsi_dcs_write_seq(dsi, 0xfb, 0xd1);
+
+	ret = mipi_dsi_dcs_exit_sleep_mode(dsi);
+	if (ret < 0) {
+		dev_err(dev, "Failed to exit sleep mode: %d\n", ret);
+		return ret;
+	}
+	msleep(80);
+
+	ret = mipi_dsi_dcs_set_display_on(dsi);
+	if (ret < 0) {
+		dev_err(dev, "Failed to set display on: %d\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+static int l3_42_02_0a_dsc_off(struct l3_42_02_0a_dsc *ctx)
+{
+	struct mipi_dsi_device *dsi = ctx->dsi;
+	struct device *dev = &dsi->dev;
+	int ret;
+
+	ret = mipi_dsi_dcs_set_display_off(dsi);
+	if (ret < 0) {
+		dev_err(dev, "Failed to set display off: %d\n", ret);
+		return ret;
+	}
+	msleep(20);
+
+	ret = mipi_dsi_dcs_enter_sleep_mode(dsi);
+	if (ret < 0) {
+		dev_err(dev, "Failed to enter sleep mode: %d\n", ret);
+		return ret;
+	}
+	msleep(100);
+
+	return 0;
+}
+
+static int l3_42_02_0a_dsc_prepare(struct drm_panel *panel)
+{
+	struct l3_42_02_0a_dsc *ctx = to_l3_42_02_0a_dsc(panel);
+	struct device *dev = &ctx->dsi->dev;
+	struct drm_dsc_picture_parameter_set pps;
+	int ret;
+
+	ret = regulator_bulk_enable(ARRAY_SIZE(ctx->supplies), ctx->supplies);
+	if (ret < 0) {
+		dev_err(dev, "Failed to enable regulators: %d\n", ret);
+		return ret;
+	}
+
+	l3_42_02_0a_dsc_reset(ctx);
+
+	ret = l3_42_02_0a_dsc_on(ctx);
+	if (ret < 0) {
+		dev_err(dev, "Failed to initialize panel: %d\n", ret);
+		gpiod_set_value_cansleep(ctx->reset_gpio, 1);
+		regulator_bulk_disable(ARRAY_SIZE(ctx->supplies), ctx->supplies);
+		return ret;
+	}
+
+	drm_dsc_pps_payload_pack(&pps, &ctx->dsc);
+
+	ret = mipi_dsi_picture_parameter_set(ctx->dsi, &pps);
+	if (ret < 0) {
+		dev_err(panel->dev, "failed to transmit PPS: %d\n", ret);
+		return ret;
+	}
+
+	ret = mipi_dsi_compression_mode(ctx->dsi, true);
+	if (ret < 0) {
+		dev_err(dev, "failed to enable compression mode: %d\n", ret);
+		return ret;
+	}
+
+	msleep(28); /* TODO: Is this panel-dependent? */
+
+	return 0;
+}
+
+static int l3_42_02_0a_dsc_unprepare(struct drm_panel *panel)
+{
+	struct l3_42_02_0a_dsc *ctx = to_l3_42_02_0a_dsc(panel);
+	struct device *dev = &ctx->dsi->dev;
+	int ret;
+
+	ret = l3_42_02_0a_dsc_off(ctx);
+	if (ret < 0)
+		dev_err(dev, "Failed to un-initialize panel: %d\n", ret);
+
+	gpiod_set_value_cansleep(ctx->reset_gpio, 1);
+	regulator_bulk_disable(ARRAY_SIZE(ctx->supplies), ctx->supplies);
+
+	return 0;
+}
+
+static const struct drm_display_mode l3_42_02_0a_dsc_mode = {
+	.clock = (1080 + 32 + 16 + 32) * (2400 + 64 + 16 + 32) * 60 / 1000,
+	.hdisplay = 1080,
+	.hsync_start = 1080 + 32,
+	.hsync_end = 1080 + 32 + 16,
+	.htotal = 1080 + 32 + 16 + 32,
+	.vdisplay = 2400,
+	.vsync_start = 2400 + 64,
+	.vsync_end = 2400 + 64 + 16,
+	.vtotal = 2400 + 64 + 16 + 32,
+	.width_mm = 654,
+	.height_mm = 1454,
+	.type = DRM_MODE_TYPE_DRIVER,
+};
+
+static int l3_42_02_0a_dsc_get_modes(struct drm_panel *panel,
+				     struct drm_connector *connector)
+{
+	return drm_connector_helper_get_modes_fixed(connector, &l3_42_02_0a_dsc_mode);
+}
+
+static const struct drm_panel_funcs l3_42_02_0a_dsc_panel_funcs = {
+	.prepare = l3_42_02_0a_dsc_prepare,
+	.unprepare = l3_42_02_0a_dsc_unprepare,
+	.get_modes = l3_42_02_0a_dsc_get_modes,
+};
+
+static int l3_42_02_0a_dsc_bl_update_status(struct backlight_device *bl)
+{
+	struct mipi_dsi_device *dsi = bl_get_data(bl);
+	u16 brightness = backlight_get_brightness(bl);
+	int ret;
+
+	dsi->mode_flags &= ~MIPI_DSI_MODE_LPM;
+
+	ret = mipi_dsi_dcs_set_display_brightness_large(dsi, brightness);
+	if (ret < 0)
+		return ret;
+
+	dsi->mode_flags |= MIPI_DSI_MODE_LPM;
+
+	return 0;
+}
+
+// TODO: Check if /sys/class/backlight/.../actual_brightness actually returns
+// correct values. If not, remove this function.
+static int l3_42_02_0a_dsc_bl_get_brightness(struct backlight_device *bl)
+{
+	struct mipi_dsi_device *dsi = bl_get_data(bl);
+	u16 brightness;
+	int ret;
+
+	dsi->mode_flags &= ~MIPI_DSI_MODE_LPM;
+
+	ret = mipi_dsi_dcs_get_display_brightness_large(dsi, &brightness);
+	if (ret < 0)
+		return ret;
+
+	dsi->mode_flags |= MIPI_DSI_MODE_LPM;
+
+	return brightness;
+}
+
+static const struct backlight_ops l3_42_02_0a_dsc_bl_ops = {
+	.update_status = l3_42_02_0a_dsc_bl_update_status,
+	.get_brightness = l3_42_02_0a_dsc_bl_get_brightness,
+};
+
+static struct backlight_device *
+l3_42_02_0a_dsc_create_backlight(struct mipi_dsi_device *dsi)
+{
+	struct device *dev = &dsi->dev;
+	const struct backlight_properties props = {
+		.type = BACKLIGHT_RAW,
+		.brightness = 4095,
+		.max_brightness = 4095,
+	};
+
+	return devm_backlight_device_register(dev, dev_name(dev), dev, dsi,
+					      &l3_42_02_0a_dsc_bl_ops, &props);
+}
+
+static int l3_42_02_0a_dsc_probe(struct mipi_dsi_device *dsi)
+{
+	struct device *dev = &dsi->dev;
+	struct l3_42_02_0a_dsc *ctx;
+	int ret;
+
+	ctx = devm_kzalloc(dev, sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->supplies[0].supply = "vddd";
+	ctx->supplies[1].supply = "vci";
+	ctx->supplies[2].supply = "vddio";
+	ret = devm_regulator_bulk_get(dev, ARRAY_SIZE(ctx->supplies),
+				      ctx->supplies);
+	if (ret < 0)
+		return dev_err_probe(dev, ret, "Failed to get regulators\n");
+
+	ctx->reset_gpio = devm_gpiod_get(dev, "reset", GPIOD_OUT_HIGH);
+	if (IS_ERR(ctx->reset_gpio))
+		return dev_err_probe(dev, PTR_ERR(ctx->reset_gpio),
+				     "Failed to get reset-gpios\n");
+
+	ctx->dsi = dsi;
+	mipi_dsi_set_drvdata(dsi, ctx);
+
+	dsi->lanes = 4;
+	dsi->format = MIPI_DSI_FMT_RGB888;
+	dsi->mode_flags = MIPI_DSI_MODE_VIDEO_BURST |
+			  MIPI_DSI_CLOCK_NON_CONTINUOUS | MIPI_DSI_MODE_LPM;
+
+	drm_panel_init(&ctx->panel, dev, &l3_42_02_0a_dsc_panel_funcs,
+		       DRM_MODE_CONNECTOR_DSI);
+	ctx->panel.prepare_prev_first = true;
+
+	ctx->panel.backlight = l3_42_02_0a_dsc_create_backlight(dsi);
+	if (IS_ERR(ctx->panel.backlight))
+		return dev_err_probe(dev, PTR_ERR(ctx->panel.backlight),
+				     "Failed to create backlight\n");
+
+	drm_panel_add(&ctx->panel);
+
+	/* This panel only supports DSC; unconditionally enable it */
+	dsi->dsc = &ctx->dsc;
+
+	ctx->dsc.dsc_version_major = 1;
+	ctx->dsc.dsc_version_minor = 1;
+
+	/* TODO: Pass slice_per_pkt = 1 */
+	ctx->dsc.slice_height = 12;
+	ctx->dsc.slice_width = 1080;
+	/*
+	 * TODO: hdisplay should be read from the selected mode once
+	 * it is passed back to drm_panel (in prepare?)
+	 */
+	WARN_ON(1080 % ctx->dsc.slice_width);
+	ctx->dsc.slice_count = 1080 / ctx->dsc.slice_width;
+	ctx->dsc.bits_per_component = 10;
+	ctx->dsc.bits_per_pixel = 8 << 4; /* 4 fractional bits */
+	ctx->dsc.block_pred_enable = true;
+
+	ret = mipi_dsi_attach(dsi);
+	if (ret < 0) {
+		drm_panel_remove(&ctx->panel);
+		return dev_err_probe(dev, ret, "Failed to attach to DSI host\n");
+	}
+
+	return 0;
+}
+
+static void l3_42_02_0a_dsc_remove(struct mipi_dsi_device *dsi)
+{
+	struct l3_42_02_0a_dsc *ctx = mipi_dsi_get_drvdata(dsi);
+	int ret;
+
+	ret = mipi_dsi_detach(dsi);
+	if (ret < 0)
+		dev_err(&dsi->dev, "Failed to detach from DSI host: %d\n", ret);
+
+	drm_panel_remove(&ctx->panel);
+}
+
+static const struct of_device_id l3_42_02_0a_dsc_of_match[] = {
+	{ .compatible = "mdss,l3-42-02-0a-dsc" }, // FIXME
+	{ /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, l3_42_02_0a_dsc_of_match);
+
+static struct mipi_dsi_driver l3_42_02_0a_dsc_driver = {
+	.probe = l3_42_02_0a_dsc_probe,
+	.remove = l3_42_02_0a_dsc_remove,
+	.driver = {
+		.name = "panel-l3-42-02-0a-dsc",
+		.of_match_table = l3_42_02_0a_dsc_of_match,
+	},
+};
+module_mipi_dsi_driver(l3_42_02_0a_dsc_driver);
+
+MODULE_AUTHOR("linux-mdss-dsi-panel-driver-generator <fix@me>"); // FIXME
+MODULE_DESCRIPTION("DRM driver for xiaomi 42 02 0a mp cmd mode dsc dsi panel");
+MODULE_LICENSE("GPL");

From fd5f38d30b317c86519d787ac4f40f4cbbe9438d Mon Sep 17 00:00:00 2001
From: Jens Reidel <adrian@travitia.xyz>
Date: Fri, 5 Jan 2024 21:05:13 +0100
Subject: [PATCH 677/707] drm/msm/dsi: Allow 10 BPC DSC configuration

Signed-off-by: Jens Reidel <adrian@travitia.xyz>
---
 drivers/gpu/drm/msm/dsi/dsi_host.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/drivers/gpu/drm/msm/dsi/dsi_host.c b/drivers/gpu/drm/msm/dsi/dsi_host.c
index deeecdfd6c4e48..fb1aa479f69b0c 100644
--- a/drivers/gpu/drm/msm/dsi/dsi_host.c
+++ b/drivers/gpu/drm/msm/dsi/dsi_host.c
@@ -1771,11 +1771,6 @@ static int dsi_populate_dsc_params(struct msm_dsi_host *msm_host, struct drm_dsc
 		return -EINVAL;
 	}
 
-	if (dsc->bits_per_component != 8) {
-		DRM_DEV_ERROR(&msm_host->pdev->dev, "DSI does not support bits_per_component != 8 yet\n");
-		return -EOPNOTSUPP;
-	}
-
 	dsc->simple_422 = 0;
 	dsc->convert_rgb = 1;
 	dsc->vbr_enable = 0;

From 3638b93172041bc7273e7461418d676043aeab2e Mon Sep 17 00:00:00 2001
From: Jens Reidel <adrian@travitia.xyz>
Date: Thu, 7 Sep 2023 20:41:08 +0200
Subject: [PATCH 678/707] arm64: dts: qcom: Add device tree for Xiaomi 12

Signed-off-by: Jens Reidel <adrian@travitia.xyz>
---
 arch/arm64/boot/dts/qcom/Makefile             |    1 +
 .../boot/dts/qcom/sm8450-xiaomi-cupid.dts     | 1118 +++++++++++++++++
 2 files changed, 1119 insertions(+)
 create mode 100644 arch/arm64/boot/dts/qcom/sm8450-xiaomi-cupid.dts

diff --git a/arch/arm64/boot/dts/qcom/Makefile b/arch/arm64/boot/dts/qcom/Makefile
index 42e50f2b2ec322..4e2fe85a4bb66b 100644
--- a/arch/arm64/boot/dts/qcom/Makefile
+++ b/arch/arm64/boot/dts/qcom/Makefile
@@ -233,6 +233,7 @@ dtb-$(CONFIG_ARCH_QCOM)	+= sm8450-hdk.dtb
 dtb-$(CONFIG_ARCH_QCOM)	+= sm8450-qrd.dtb
 dtb-$(CONFIG_ARCH_QCOM)	+= sm8450-sony-xperia-nagara-pdx223.dtb
 dtb-$(CONFIG_ARCH_QCOM)	+= sm8450-sony-xperia-nagara-pdx224.dtb
+dtb-$(CONFIG_ARCH_QCOM)	+= sm8450-xiaomi-cupid.dtb
 dtb-$(CONFIG_ARCH_QCOM)	+= sm8550-hdk.dtb
 dtb-$(CONFIG_ARCH_QCOM)	+= sm8550-mtp.dtb
 dtb-$(CONFIG_ARCH_QCOM)	+= sm8550-qrd.dtb
diff --git a/arch/arm64/boot/dts/qcom/sm8450-xiaomi-cupid.dts b/arch/arm64/boot/dts/qcom/sm8450-xiaomi-cupid.dts
new file mode 100644
index 00000000000000..7ce2ee99f2644f
--- /dev/null
+++ b/arch/arm64/boot/dts/qcom/sm8450-xiaomi-cupid.dts
@@ -0,0 +1,1118 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/dts-v1/;
+
+#include <dt-bindings/regulator/qcom,rpmh-regulator.h>
+#include <dt-bindings/leds/common.h>
+#include "sm8450.dtsi"
+#include "pm8350.dtsi"
+#include "pm8350b.dtsi"
+#include "pm8350c.dtsi"
+#include "pm8450.dtsi"
+#include "pmk8350.dtsi"
+#include "pmr735a.dtsi"
+
+/delete-node/ &xbl_ramdump_mem;
+/delete-node/ &xbl_sc_mem;
+/delete-node/ &adsp_mem;
+/delete-node/ &rmtfs_mem;
+/delete-node/ &mte_mem;
+/delete-node/ &trusted_apps_mem;
+/delete-node/ &trusted_apps_ext_mem;
+
+/ {
+	model = "Xiaomi 12";
+	compatible = "xiaomi,cupid", "qcom,sm8450";
+	chassis-type = "handset";
+
+	aliases {
+		serial0 = &uart7;
+	};
+
+	wcd938x: audio-codec {
+		compatible = "qcom,wcd9385-codec";
+
+		pinctrl-names = "default";
+		pinctrl-0 = <&wcd_default>;
+
+		qcom,micbias1-microvolt = <2750000>;
+		qcom,micbias2-microvolt = <2750000>;
+		qcom,micbias3-microvolt = <2750000>;
+		qcom,micbias4-microvolt = <2750000>;
+		qcom,mbhc-buttons-vthreshold-microvolt = <75000 150000 237000 500000 500000 500000 500000 500000>;
+		qcom,mbhc-headset-vthreshold-microvolt = <1700000>;
+		qcom,mbhc-headphone-vthreshold-microvolt = <50000>;
+		qcom,rx-device = <&wcd_rx>;
+		qcom,tx-device = <&wcd_tx>;
+
+		reset-gpios = <&tlmm 43 GPIO_ACTIVE_LOW>;
+
+		vdd-buck-supply = <&pm8350_s10>;
+		vdd-rxtx-supply = <&pm8350_s10>;
+		vdd-io-supply = <&pm8350_s10>;
+		vdd-mic-bias-supply = <&vreg_bob>;
+
+		#sound-dai-cells = <1>;
+	};
+
+	chosen {
+		bootargs = "PMOS_NOSPLASH console=tty0";
+		stdout-path = "serial0:115200n8";
+	};
+
+	gpio-keys {
+		compatible = "gpio-keys";
+
+		pinctrl-0 = <&vol_up_n>;
+		pinctrl-names = "default";
+
+		key-volume-up {
+			label = "Volume Up";
+			linux,code = <KEY_VOLUMEUP>;
+			gpios = <&pm8350_gpios 6 GPIO_ACTIVE_LOW>;
+			debounce-interval = <15>;
+			linux,can-disable;
+			wakeup-source;
+		};
+	};
+
+	/*
+	pmic-glink {
+		compatible = "qcom,sm8450-pmic-glink", "qcom,pmic-glink";
+		#address-cells = <1>;
+		#size-cells = <0>;
+
+		connector@0 {
+			compatible = "usb-c-connector";
+			reg = <0>;
+			power-role = "dual";
+			data-role = "dual";
+
+			ports {
+				#address-cells = <1>;
+				#size-cells = <0>;
+
+				port@0 {
+					reg = <0>;
+
+					pmic_glink_hs_in: endpoint {
+						remote-endpoint = <&usb_1_dwc3_hs>;
+					};
+				};
+
+				// USB3 not present, so no port@1
+
+				port@2 {
+					reg = <2>;
+
+					pmic_glink_sbu: endpoint {
+						remote-endpoint = <&fsa4480_sbu_mux>;
+					};
+				};
+			};
+		};
+	};
+	*/
+
+	reserved-memory {
+		xbl_ramdump_mem: memory@a6b80000 {
+			reg = <0x0 0xa6b80000 0x0 0x280000>;
+			no-map;
+		};
+
+		xbl_sc_mem: memory@a6e00000 {
+			reg = <0x0 0xa6e00000 0x0 0x40000>;
+			no-map;
+		};
+
+		adsp_mem: memory@9fd00000 {
+			reg = <0x0 0x9fd00000 0x0 0x3100000>;
+			no-map;
+		};
+
+		/*
+		 * The rmtfs memory region in downstream is 'dynamically allocated'
+		 * but given the same address every time. Hard code it as this address is
+		 * where the modem firmware expects it to be.
+		 */
+		rmtfs_mem: memory@fe200000 {
+			compatible = "qcom,rmtfs-mem";
+			reg = <0x0 0xfe200000 0x0 0x280000>;
+			no-map;
+
+			qcom,client-id = <1>;
+			qcom,vmid = <15>;
+		};
+
+		/* The bootloader will enable mdss clocks if this code is present.
+		splash_memory: splash_region {
+			reg = <0x00 0xb8000000 0x0 0x2b00000>;
+			no-map;
+			label = "cont_splash_region";
+		};
+		*/
+
+		ramoops@a7000000 {
+			compatible = "ramoops";
+			reg = <0 0xa7000000 0x0 0x400000>;
+			console-size = <0x200000>;
+			pmsg-size = <0x200000>;
+			ecc-size = <16>;
+			no-map;
+		};
+	};
+
+	vph_pwr: vph-pwr-regulator {
+		compatible = "regulator-fixed";
+		regulator-name = "vph_pwr";
+		regulator-min-microvolt = <3700000>;
+		regulator-max-microvolt = <3700000>;
+
+		regulator-always-on;
+		regulator-boot-on;
+	};
+
+	/* This is a hack and taken from qcom,cnss-qca6490 in downstream */
+	wlan_regulator: wlan-regulator {
+		compatible = "regulator-wlan";
+		regulator-name = "wlan";
+
+		enable-gpio = <&tlmm 80 GPIO_ACTIVE_HIGH>;
+
+		supply-count = <6>;
+
+		vin0-supply = <&pmr735a_s2>; // vdd-wlan-aon
+		vin1-supply = <&pm8350_s11>; // vdd-wlan-dig
+		vin2-supply = <&pm8350_s10>; // vdd-wlan-io
+		vin3-supply = <&pm8350c_s1>; // vdd-wlan-rfa1
+		vin4-supply = <&pm8350_s12>; // vdd-wlan-rfa2
+		vin5-supply = <&pmr735a_l7>; // wlan-ant-switch
+	};
+};
+
+
+&apps_rsc {
+	regulators-0 {
+		compatible = "qcom,pm8350-rpmh-regulators";
+		qcom,pmic-id = "b";
+
+		/*
+		vdd-s1-supply = <&vph_pwr>;
+		vdd-s2-supply = <&vph_pwr>;
+		vdd-s3-supply = <&vph_pwr>;
+		vdd-s4-supply = <&vph_pwr>;
+		vdd-s5-supply = <&vph_pwr>;
+		vdd-s6-supply = <&vph_pwr>;
+		vdd-s7-supply = <&vph_pwr>;
+		vdd-s8-supply = <&vph_pwr>;
+		vdd-s9-supply = <&vph_pwr>;
+		vdd-s10-supply = <&vph_pwr>;
+		vdd-s11-supply = <&vph_pwr>;
+		vdd-s12-supply = <&vph_pwr>;
+
+		vdd-l1-l4-supply = <&pm8350_s11>;
+		vdd-l2-l7-supply = <&vreg_bob>;
+		vdd-l3-l5-supply = <&pm8350_s11>;
+		vdd-l6-l9-l10-supply = <&pm8350_s12>;
+		*/
+
+		/*
+		 * ARC regulators:
+		 * s5 - gfx.lvl
+		 * l8 - lcx.lvl
+		 */
+
+		pm8350_s10: smps10 {
+			regulator-name = "pm8350_s10";
+			regulator-min-microvolt = <1800000>;
+			regulator-max-microvolt = <1800000>;
+		};
+
+		pm8350_s11: smps11 {
+			regulator-name = "pm8350_s11";
+			regulator-min-microvolt = <848000>;
+			regulator-max-microvolt = <1104000>;
+		};
+
+		pm8350_s12: smps12 {
+			regulator-name = "pm8350_s12";
+			regulator-min-microvolt = <1224000>;
+			regulator-max-microvolt = <1400000>;
+		};
+
+		pm8350_l1: ldo1 {
+			regulator-name = "pm8350_l1";
+			regulator-min-microvolt = <912000>;
+			regulator-max-microvolt = <920000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350_l2: ldo2 {
+			regulator-name = "pm8350_l2";
+			regulator-min-microvolt = <3072000>;
+			regulator-max-microvolt = <3072000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350_l3: ldo3 {
+			regulator-name = "pm8350_l3";
+			regulator-min-microvolt = <904000>;
+			regulator-max-microvolt = <904000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350_l5: ldo5 {
+			regulator-name = "pm8350_l5";
+			regulator-min-microvolt = <880000>;
+			regulator-max-microvolt = <912000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350_l6: ldo6 {
+			regulator-name = "pm8350_l6";
+			regulator-min-microvolt = <1200000>;
+			regulator-max-microvolt = <1200000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350_l7: ldo7 {
+			regulator-name = "pm8350_l7";
+			regulator-min-microvolt = <2504000>;
+			regulator-max-microvolt = <2504000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350_l9: ldo9 {
+			regulator-name = "pm8350_l9";
+			regulator-min-microvolt = <1200000>;
+			regulator-max-microvolt = <1200000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+	};
+
+	regulators-1 {
+		compatible = "qcom,pm8350c-rpmh-regulators";
+		qcom,pmic-id = "c";
+
+		/*
+		vdd-s1-supply = <&vph_pwr>;
+		vdd-s2-supply = <&vph_pwr>;
+		vdd-s3-supply = <&vph_pwr>;
+		vdd-s4-supply = <&vph_pwr>;
+		vdd-s5-supply = <&vph_pwr>;
+		vdd-s6-supply = <&vph_pwr>;
+		vdd-s7-supply = <&vph_pwr>;
+		vdd-s8-supply = <&vph_pwr>;
+		vdd-s9-supply = <&vph_pwr>;
+		vdd-s10-supply = <&vph_pwr>;
+
+		vdd-l1-l12-supply = <&pm8350c_s1>;
+		vdd-l2-l8-supply = <&pm8350c_s1>;
+		vdd-l3-l4-l5-l7-l13-supply = <&vreg_bob>;
+		vdd-l6-l9-l11-supply = <&vreg_bob>;
+		vdd-l10-supply = <&pm8350_s12>;
+
+		vdd-bob-supply = <&vph_pwr>;
+		*/
+
+		/*
+		 * ARC regulators:
+		 * s2 - mxc.lvl
+		 * s4 - mss.lvl
+		 * s6 - cx.lvl
+		 */
+
+		pm8350c_s1: smps1 {
+			regulator-name = "pm8350c_s1";
+			regulator-min-microvolt = <1800000>;
+			regulator-max-microvolt = <2024000>;
+		};
+
+		pm8350c_s10: smps10 {
+			regulator-name = "pm8350c_s10";
+			regulator-min-microvolt = <1000000>;
+			regulator-max-microvolt = <1100000>;
+		};
+
+		vreg_bob: bob {
+			regulator-name = "vreg_bob";
+			regulator-min-microvolt = <3080000>;
+			regulator-max-microvolt = <3960000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_AUTO>;
+		};
+
+		pm8350c_l1: ldo1 {
+			regulator-name = "pm8350c_l1";
+			regulator-min-microvolt = <1800000>;
+			regulator-max-microvolt = <1800000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l2: ldo2 {
+			regulator-name = "pm8350c_l2";
+			regulator-min-microvolt = <1800000>;
+			regulator-max-microvolt = <1800000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l3: ldo3 {
+			regulator-name = "pm8350c_l3";
+			regulator-min-microvolt = <3296000>;
+			regulator-max-microvolt = <3304000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l4: ldo4 {
+			regulator-name = "pm8350c_l4";
+			regulator-min-microvolt = <1704000>;
+			regulator-max-microvolt = <3000000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l5: ldo5 {
+			regulator-name = "pm8350c_l5";
+			regulator-min-microvolt = <1704000>;
+			regulator-max-microvolt = <3000000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l6: ldo6 {
+			regulator-name = "pm8350c_l6";
+			regulator-min-microvolt = <1800000>;
+			/* Originally max = 3008000 but SDHCI expects 2960000 */
+			regulator-max-microvolt = <2960000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l7: ldo7 {
+			regulator-name = "pm8350c_l7";
+			regulator-min-microvolt = <3008000>;
+			regulator-max-microvolt = <3304000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l8: ldo8 {
+			regulator-name = "pm8350c_l8";
+			regulator-min-microvolt = <1800000>;
+			regulator-max-microvolt = <1800000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l9: ldo9 {
+			regulator-name = "pm8350c_l9";
+			regulator-min-microvolt = <2960000>;
+			/* Originally max = 3008000 but SDHCI expects 2960000 */
+			regulator-max-microvolt = <2960000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l10: ldo10 {
+			regulator-name = "pm8350c_l10";
+			regulator-min-microvolt = <1200000>;
+			regulator-max-microvolt = <1220000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		/* https://github.com/Kenvyra/android_kernel_xiaomi_sm8450-devicetrees/blob/kenvyra-13.0/qcom/cupid-sm8450.dtsi#L57 */
+		pm8350c_l11: ldo11 {
+			regulator-name = "pm8350c_l11";
+			regulator-min-microvolt = <3200000>;
+			regulator-max-microvolt = <3200000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+
+			regulator-always-on;
+			regulator-boot-on;
+		};
+
+		pm8350c_l12: ldo12 {
+			regulator-name = "pm8350c_l12";
+			regulator-min-microvolt = <1800000>;
+			regulator-max-microvolt = <1968000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l13: ldo13 {
+			regulator-name = "pm8350c_l13";
+			regulator-min-microvolt = <3000000>;
+			regulator-max-microvolt = <3000000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+	};
+
+	regulators-2 {
+		compatible = "qcom,pm8450-rpmh-regulators";
+		qcom,pmic-id = "h";
+
+		/*
+		vdd-s1-supply = <&vph_pwr>;
+		vdd-s2-supply = <&vph_pwr>;
+		vdd-s3-supply = <&vph_pwr>;
+		vdd-s4-supply = <&vph_pwr>;
+		vdd-s5-supply = <&vph_pwr>;
+		vdd-s6-supply = <&vph_pwr>;
+
+		vdd-l2-supply = <&vreg_bob>;
+		vdd-l3-supply = <&vreg_bob>;
+		vdd-l4-supply = <&vreg_bob>;
+		*/
+
+		/*
+		 * ARC regulators:
+		 * S2 - ebi.lvl
+		 * S4 - mmcx.lvl
+		 * S6 - mx.lvl
+		 * L1 - lmx.lvl
+		 */
+
+		pm8450_s3: smps3 {
+			regulator-name = "pm8450_s3";
+			regulator-min-microvolt = <500000>;
+			regulator-max-microvolt = <600000>;
+		};
+
+		pm8450_l2: ldo2 {
+			regulator-name = "pm8450_l2";
+			regulator-min-microvolt = <880000>;
+			regulator-max-microvolt = <912000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8450_l3: ldo3 {
+			regulator-name = "pm8450_l3";
+			regulator-min-microvolt = <912000>;
+			regulator-max-microvolt = <912000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+	};
+
+	regulators-3 {
+		compatible = "qcom,pmr735a-rpmh-regulators";
+		qcom,pmic-id = "e";
+
+		/*
+		vdd-s1-supply = <&vph_pwr>;
+		vdd-s2-supply = <&vph_pwr>;
+		vdd-s3-supply = <&vph_pwr>;
+
+		vdd-l1-l2-supply = <&pmr735a_s2>;
+		vdd-l3-supply = <&pmr735a_s1>;
+		vdd-l4-supply = <&pm8350c_s1>;
+		vdd-l5-l6-supply = <&pm8350c_s1>;
+		vdd-l7-bob-supply = <&vreg_bob>;
+		*/
+
+		pmr735a_s1: smps1 {
+			regulator-name = "pmr735a_s1";
+			regulator-min-microvolt = <1200000>;
+			regulator-max-microvolt = <1296000>;
+		};
+
+		pmr735a_s2: smps2 {
+			regulator-name = "pmr735a_s2";
+			regulator-min-microvolt = <500000>;
+			regulator-max-microvolt = <1040000>;
+		};
+
+		pmr735a_s3: smps3 {
+			regulator-name = "pmr735a_s3";
+			regulator-min-microvolt = <435000>;
+			regulator-max-microvolt = <2352000>;
+		};
+
+		pmr735a_l1: ldo1 {
+			regulator-name = "pmr735a_l1";
+			regulator-min-microvolt = <800000>;
+			regulator-max-microvolt = <800000>;
+		};
+
+		pmr735a_l2: ldo2 {
+			regulator-name = "pmr735a_l2";
+			regulator-min-microvolt = <480000>;
+			regulator-max-microvolt = <912000>;
+		};
+
+		pmr735a_l3: ldo3 {
+			regulator-name = "pmr735a_l3";
+			regulator-min-microvolt = <1200000>;
+			regulator-max-microvolt = <1200000>;
+		};
+
+		pmr735a_l4: ldo4 {
+			regulator-name = "pmr735a_l4";
+			regulator-min-microvolt = <1776000>;
+			regulator-max-microvolt = <1776000>;
+		};
+
+		pmr735a_l5: ldo5 {
+			regulator-name = "pmr735a_l5";
+			regulator-min-microvolt = <880000>;
+			regulator-max-microvolt = <880000>;
+		};
+
+		pmr735a_l6: ldo6 {
+			regulator-name = "pmr735a_l6";
+			regulator-min-microvolt = <1200000>;
+			regulator-max-microvolt = <1200000>;
+		};
+
+		pmr735a_l7: ldo7 {
+			regulator-name = "pmr735a_l7";
+			regulator-min-microvolt = <2800000>;
+			regulator-max-microvolt = <2800000>;
+		};
+	};
+};
+
+&dispcc {
+	status = "okay";
+};
+
+&gpi_dma0 {
+	status = "okay";
+};
+
+&gpi_dma1 {
+	status = "okay";
+};
+
+&gpi_dma2 {
+	status = "okay";
+};
+
+&gpu {
+	status = "okay";
+
+	zap-shader {
+		firmware-name = "qcom/a730_zap.mbn";
+	};
+};
+
+// i2c0 in downstream
+&i2c5 {
+	clock-frequency = <100000>;
+	status = "okay";
+
+	typec-mux@42 {
+		compatible = "fcs,fsa4480";
+		reg = <0x42>;
+
+		interrupts-extended = <&tlmm 2 IRQ_TYPE_LEVEL_LOW>;
+
+		vcc-supply = <&vreg_bob>;
+		mode-switch;
+		orientation-switch;
+
+		/*
+		port {
+			fsa4480_sbu_mux: endpoint {
+				remote-endpoint = <&pmic_glink_sbu>;
+			};
+		};
+		*/
+	};
+
+	/* nq @ 64 */
+	/* pm8008i @ 8 */
+	/* pm8008i @ 9 */
+	/* pm8008j @ c */
+	/* pm8008j @ d */
+};
+
+// i2c1 in downstream
+&i2c9 {
+	clock-frequency = <100000>;
+	status = "okay";
+
+	/* nq @ 28 */
+};
+
+// i2c2 in downstream
+&i2c15 {
+	clock-frequency = <100000>;
+	status = "okay";
+
+	cs35l41_b: speaker-amp@40 {
+		compatible = "cirrus,cs35l41";
+		reg = <0x40>;
+		interrupt-parent = <&tlmm>;
+		interrupts = <118 0x2008>;
+		pinctrl-names = "default";
+		pinctrl-0 = <&cirrus_reset_default_0 &cirrus_irq_default_0>;
+		reset-gpios = <&tlmm 1 GPIO_ACTIVE_HIGH>;
+		cirrus,boost-peak-milliamp = <4000>;
+		cirrus,boost-ind-nanohenry = <1000>;
+		cirrus,boost-cap-microfarad = <15>;
+		cirrus,gpio2-src-select = <4>;
+		cirrus,gpio2-output-enable;
+		cirrus,asp-sdout-hiz = <3>;
+		sound-name-prefix = "SpkrRight";
+		#sound-dai-cells = <1>;
+	};
+
+	cs35l41_t: speaker-amp@42 {
+		compatible = "cirrus,cs35l41";
+		reg = <0x42>;
+		interrupt-parent = <&tlmm>;
+		interrupts = <63 0x2008>;
+		pinctrl-names = "default";
+		pinctrl-0 = <&cirrus_irq_default_1>;
+		reset-gpios = <&tlmm 1 GPIO_ACTIVE_HIGH>;
+		cirrus,boost-peak-milliamp = <4000>;
+		cirrus,boost-ind-nanohenry = <1000>;
+		cirrus,boost-cap-microfarad = <15>;
+		cirrus,gpio2-src-select = <4>;
+		cirrus,gpio2-output-enable;
+		cirrus,asp-sdout-hiz = <3>;
+		sound-name-prefix = "SpkrLeft";
+		#sound-dai-cells = <1>;
+	};
+};
+
+// i2c3 in downstream
+&i2c16 {
+	clock-frequency = <100000>;
+	status = "okay";
+
+	/* aw8697_haptic @ 5a */
+};
+
+&mdss {
+	status = "okay";
+};
+
+&mdss_dsi0 {
+	vdda-supply = <&pm8350_l6>;
+	status = "okay";
+
+	panel@0 {
+		compatible = "mdss,l3-42-02-0a-dsc";
+		reg = <0>;
+
+		reset-gpios = <&tlmm 0 GPIO_ACTIVE_HIGH>;
+		te-gpios = <&tlmm 86 GPIO_ACTIVE_HIGH>;
+
+		pinctrl-0 = <&sde_dsi_active &sde_te_active>;
+		pinctrl-1 = <&sde_dsi_suspend &sde_te_suspend>;
+		pinctrl-names = "default", "sleep";
+
+		vddd-supply = <&pm8350c_l10>;
+		vci-supply = <&pm8350c_l13>;
+		vddio-supply = <&pm8350c_l12>;
+
+		port {
+			panel_in: endpoint {
+				remote-endpoint = <&mdss_dsi0_out>;
+			};
+		};
+	};
+};
+
+&mdss_dsi0_out {
+	data-lanes = <0 1 2 3>;
+	remote-endpoint = <&panel_in>;
+};
+
+&mdss_dsi0_phy {
+	vdds-supply = <&pm8350_l5>;
+	status = "okay";
+};
+
+&pcie0 {
+	/*
+	vdda-supply = <&wlan_regulator>;
+
+	wake-gpios = <&tlmm 96 GPIO_ACTIVE_HIGH>;
+	perst-gpios = <&tlmm 94 GPIO_ACTIVE_LOW>;
+
+	pinctrl-0 = <&pcie0_default_state>, <&pmk8550_sleep_clk>;
+	pinctrl-names = "default";
+	*/
+
+	status = "okay";
+};
+
+&pcie0_phy {
+	vdda-phy-supply = <&pm8350_l5>;
+	vdda-pll-supply = <&pm8350_l6>;
+	status = "okay";
+};
+
+&pmk8350_rtc {
+	nvmem-cells = <&rtc_offset>;
+	nvmem-cell-names = "offset";
+
+	status = "okay";
+};
+
+&pmk8350_sdam_2 {
+	status = "okay";
+
+	rtc_offset: rtc-offset@bc {
+		reg = <0xbc 0x4>;
+	};
+};
+
+&pon_pwrkey {
+	status = "okay";
+};
+
+&pon_resin {
+	linux,code = <KEY_VOLUMEDOWN>;
+	status = "okay";
+};
+
+&remoteproc_adsp {
+	status = "okay";
+	firmware-name = "qcom/sm8450/cupid/adsp.mbn";
+};
+
+&remoteproc_cdsp {
+	status = "okay";
+	firmware-name = "qcom/sm8450/cupid/cdsp.mbn";
+};
+
+&remoteproc_mpss {
+	status = "okay";
+	firmware-name = "qcom/sm8450/cupid/modem.mbn";
+};
+
+&remoteproc_slpi {
+	status = "okay";
+	firmware-name = "qcom/sm8450/cupid/slpi.mbn";
+};
+
+&tlmm {
+	gpio-reserved-ranges = <28 4>, <36 4>;
+
+	bt_default: bt-default-state {
+		bt-en-pins {
+			pins = "gpio81";
+			function = "gpio";
+			drive-strength = <2>;
+			bias-pull-down;
+			output-low;
+		};
+
+		/*
+		sw-ctrl-pins {
+			pins = "gpio82";
+			function = "gpio";
+			bias-pull-down;
+		};
+		*/
+	};
+
+	cirrus_reset_default_0: cirrus-reset-default-0 {
+		pins = "gpio1";
+		function = "gpio";
+		drive-strength = <16>;
+		bias-disable;
+		output-high;
+	};
+
+	cirrus_irq_default_0: cirrus-irq-default-0 {
+		pins = "gpio118";
+		function = "gpio";
+		drive-strength = <2>;
+		input-enable;
+		bias-pull-up;
+	};
+
+	cirrus_irq_default_1: cirrus-irq-default-1 {
+		pins = "gpio63";
+		function = "gpio";
+		drive-strength = <2>;
+		input-enable;
+		bias-pull-up;
+	};
+
+	sde_dsi_active: sde-dsi-active-state {
+		pins = "gpio0";
+		function = "gpio";
+		drive-strength = <8>;
+		bias-disable;
+	};
+
+	sde_dsi_suspend: sde-dsi-sleep-state {
+		pins = "gpio0";
+		function = "gpio";
+		drive-strength = <2>;
+		bias-pull-down;
+	};
+
+	sde_te_active: sde-te-active-state {
+		pins = "gpio86";
+		function = "mdp_vsync";
+		drive-strength = <2>;
+		bias-pull-down;
+	};
+
+	sde_te_suspend: sde-te-sleep-state {
+		pins = "gpio86";
+		function = "mdp_vsync";
+		drive-strength = <2>;
+		bias-pull-down;
+	};
+
+	wcd_default: wcd-reset-n-active-state {
+		pins = "gpio43";
+		function = "gpio";
+		drive-strength = <16>;
+		bias-disable;
+		output-low;
+	};
+};
+
+&pm8350_gpios {
+	vol_up_n: vol-up-n-state {
+		pins = "gpio6";
+		function = "normal";
+		power-source = <1>;
+		bias-pull-up;
+		input-enable;
+	};
+};
+
+// Pretty broken, need to look into this again later
+&pm8350c_flash {
+	status = "okay";
+
+	led-0 {
+		function = LED_FUNCTION_FLASH;
+		color = <LED_COLOR_ID_WHITE>;
+		led-sources = <1>, <4>;
+		led-max-microamp = <500000>;
+		flash-max-microamp = <1500000>;
+		flash-max-timeout-us = <1280000>;
+		default-state = "on";
+	};
+};
+
+&qupv3_id_0 {
+	status = "okay";
+};
+
+&qupv3_id_1 {
+	status = "okay";
+};
+
+&qupv3_id_2 {
+	status = "okay";
+};
+
+&sound {
+	compatible = "qcom,sm8450-sndcard";
+	model = "Xiaomi 12";
+	/*
+	audio-routing = "SpkrLeft IN", "WSA_SPK1 OUT",
+			"SpkrRight IN", "WSA_SPK2 OUT",
+			"IN1_HPHL", "HPHL_OUT",
+			"IN2_HPHR", "HPHR_OUT",
+			"AMIC1", "MIC BIAS1",
+			"AMIC2", "MIC BIAS2",
+			"AMIC3", "MIC BIAS3",
+			"AMIC4", "MIC BIAS3",
+			"AMIC5", "MIC BIAS4",
+			"VA DMIC0", "MIC BIAS3",
+			"VA DMIC1", "MIC BIAS3",
+			"VA DMIC2", "MIC BIAS1",
+			"VA DMIC3", "MIC BIAS1",
+			"TX DMIC0", "MIC BIAS3",
+			"TX DMIC1", "MIC BIAS3",
+			"TX DMIC2", "MIC BIAS1",
+			"TX DMIC3", "MIC BIAS1",
+			"TX SWR_INPUT0", "ADC1_OUTPUT",
+			"TX SWR_INPUT1", "ADC2_OUTPUT",
+			"TX SWR_INPUT2", "ADC3_OUTPUT",
+			"TX SWR_INPUT3", "ADC4_OUTPUT";
+	*/
+
+	wcd-playback-dai-link {
+		link-name = "WCD Playback";
+
+		cpu {
+			sound-dai = <&q6apmbedai RX_CODEC_DMA_RX_0>;
+		};
+
+		codec {
+			sound-dai = <&wcd938x 0>, <&swr1 0>, <&rxmacro 0>;
+		};
+
+		platform {
+			sound-dai = <&q6apm>;
+		};
+	};
+
+	wcd-capture-dai-link {
+		link-name = "WCD Capture";
+
+		cpu {
+			sound-dai = <&q6apmbedai TX_CODEC_DMA_TX_3>;
+		};
+
+		codec {
+			sound-dai = <&wcd938x 1>, <&swr2 0>, <&txmacro 0>;
+		};
+
+		platform {
+			sound-dai = <&q6apm>;
+		};
+	};
+
+	/*
+	wsa-dai-link {
+		link-name = "WSA Playback";
+
+		cpu {
+			sound-dai = <&q6apmbedai WSA_CODEC_DMA_RX_0>;
+		};
+
+		codec {
+			sound-dai = <&cs35l41_t>, <&cs35l41_b>, <&wsamacro 0>; //, <&swr0 0>, <&wsamacro 0>;
+		};
+
+		platform {
+			sound-dai = <&q6apm>;
+		};
+	};
+	*/
+
+	va-dai-link {
+		link-name = "VA Capture";
+
+		cpu {
+			sound-dai = <&q6apmbedai TX_CODEC_DMA_TX_3>;
+		};
+
+		codec {
+			sound-dai = <&vamacro 0>;
+		};
+
+		platform {
+			sound-dai = <&q6apm>;
+		};
+	};
+};
+
+&swr1 {
+	status = "okay";
+
+	/* WCD9380 RX */
+	wcd_rx: codec@0,4 {
+		compatible = "sdw20217010d00";
+		reg = <0 4>;
+		qcom,rx-port-mapping = <1 2 3 4 5>;
+	};
+};
+
+&swr2 {
+	status = "okay";
+
+	/* WCD9380 TX */
+	wcd_tx: codec@0,3 {
+		compatible = "sdw20217010d00";
+		reg = <0 3>;
+		qcom,tx-port-mapping = <1 1 2 3>;
+	};
+};
+
+&uart7 {
+	status = "okay";
+};
+
+/*
+&uart20 {
+	status = "okay";
+
+	bluetooth {
+		compatible = "qcom,wcn6855-bt";
+
+		// TODO: vddio-supply?
+		vddaon-supply = <&pm8350_s11>;
+		vdddig-supply = <&pm8350_s11>;
+		vddrfa1-supply = <&pm8350c_s1>; // TODO: Which rfa1?
+		vddrfa2p2-supply = <&pm8350_s12>; // TODO: Which rfa1?
+		vddasd-supply= <&pmr735a_l7>;
+
+		max-speed = <3200000>;
+
+		enable-gpios = <&tlmm 81 GPIO_ACTIVE_HIGH>;
+		swctrl-gpios = <&tlmm 82 GPIO_ACTIVE_HIGH>;
+
+		pinctrl-0 = <&bt_default>;
+		pinctrl-names = "default";
+	};
+};
+*/
+
+&ufs_mem_hc {
+	status = "okay";
+
+	reset-gpios = <&tlmm 210 GPIO_ACTIVE_LOW>;
+
+	vcc-supply = <&pm8350_l7>;
+	vcc-max-microamp = <1100000>;
+	vccq-supply = <&pm8350_l9>;
+	vccq-max-microamp = <1200000>;
+	vdd-hba-supply = <&pm8350_l9>;
+};
+
+&ufs_mem_phy {
+	status = "okay";
+
+	vdda-phy-supply = <&pm8350_l5>;
+	vdda-pll-supply = <&pm8350_l6>;
+};
+
+&usb_1 {
+	/* USB 2.0 only */
+	qcom,select-utmi-as-pipe-clk;
+	status = "okay";
+};
+
+&usb_1_dwc3 {
+	dr_mode = "otg";
+	//usb-role-switch;
+	maximum-speed = "high-speed";
+	/* Remove USB3 phy */
+	phys = <&usb_1_hsphy>;
+	phy-names = "usb2-phy";
+};
+
+/*
+&usb_1_dwc3_hs {
+	remote-endpoint = <&pmic_glink_hs_in>;
+};
+*/
+
+&usb_1_hsphy {
+	vdda-pll-supply = <&pm8350_l5>;
+	vdda18-supply = <&pm8350c_l1>;
+	vdda33-supply = <&pm8350_l2>;
+
+	status = "okay";
+};
+
+&vamacro {
+	//pinctrl-0 = <&dmic01_default>, <&dmic02_default>;
+	//pinctrl-names = "default";
+	vdd-micb-supply = <&pm8350_s10>;
+	qcom,dmic-sample-rate = <600000>;
+
+	status = "okay";
+};
+
+&wsamacro {
+	status = "disabled";
+};
+
+&wsa2macro {
+	status = "disabled";
+};
+
+&wsa_swr_active {
+	status = "disabled";
+};
+
+&wsa2_swr_active {
+	status = "disabled";
+};

From 24fb298ba14126ad79240c9ca1a9ddcf7aca2318 Mon Sep 17 00:00:00 2001
From: Danila Tikhonov <danila@jiaxyga.com>
Date: Sun, 28 Jan 2024 00:33:29 +0300
Subject: [PATCH 679/707] dt-bindings: arm: qcom,ids: Add IDs for SM8475 family

Add Qualcomm SM8475/SM8475P/SM8475_2 (cape) SoC IDs.

Signed-off-by: Danila Tikhonov <danila@jiaxyga.com>
---
 include/dt-bindings/arm/qcom,ids.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/dt-bindings/arm/qcom,ids.h b/include/dt-bindings/arm/qcom,ids.h
index dc7ba87b50d7bd..19ac7b36f608ea 100644
--- a/include/dt-bindings/arm/qcom,ids.h
+++ b/include/dt-bindings/arm/qcom,ids.h
@@ -252,8 +252,11 @@
 #define QCOM_ID_IPQ9510			521
 #define QCOM_ID_QRB4210			523
 #define QCOM_ID_QRB2210			524
+#define QCOM_ID_SM8475			530
+#define QCOM_ID_SM8475P			531
 #define QCOM_ID_SA8775P			534
 #define QCOM_ID_QRU1000			539
+#define QCOM_ID_SM8475_2		540
 #define QCOM_ID_QDU1000			545
 #define QCOM_ID_SM8650			557
 #define QCOM_ID_SM4450			568

From 58feaf1c2c67416298bb8f8bcc2b9682654db4b2 Mon Sep 17 00:00:00 2001
From: Danila Tikhonov <danila@jiaxyga.com>
Date: Sun, 28 Jan 2024 00:39:52 +0300
Subject: [PATCH 680/707] soc: qcom: socinfo: Add Soc IDs for SM8475 family

Add Soc IDs table entries for Qualcomm SM8475 family.

Signed-off-by: Danila Tikhonov <danila@jiaxyga.com>
---
 drivers/soc/qcom/socinfo.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/soc/qcom/socinfo.c b/drivers/soc/qcom/socinfo.c
index 5a44ad870fb6d1..e8ff9819ac4774 100644
--- a/drivers/soc/qcom/socinfo.c
+++ b/drivers/soc/qcom/socinfo.c
@@ -424,8 +424,11 @@ static const struct soc_id soc_id[] = {
 	{ qcom_board_id(IPQ9510) },
 	{ qcom_board_id(QRB4210) },
 	{ qcom_board_id(QRB2210) },
+	{ qcom_board_id(SM8475) },
+	{ qcom_board_id(SM8475P) },
 	{ qcom_board_id(SA8775P) },
 	{ qcom_board_id(QRU1000) },
+	{ qcom_board_id(SM8475_2) },
 	{ qcom_board_id(QDU1000) },
 	{ qcom_board_id(SM8650) },
 	{ qcom_board_id(SM4450) },

From 62165243f9f46cc104596c90658194c8bb1401f9 Mon Sep 17 00:00:00 2001
From: Danila Tikhonov <danila@jiaxyga.com>
Date: Sun, 28 Jan 2024 01:33:21 +0300
Subject: [PATCH 681/707] HACK! dt-bindings: clock: Add SM8475 support

---
 include/dt-bindings/clock/qcom,gcc-sm8450.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/dt-bindings/clock/qcom,gcc-sm8450.h b/include/dt-bindings/clock/qcom,gcc-sm8450.h
index 9679410843a0d0..5f1f9ab71a2203 100644
--- a/include/dt-bindings/clock/qcom,gcc-sm8450.h
+++ b/include/dt-bindings/clock/qcom,gcc-sm8450.h
@@ -194,6 +194,8 @@
 #define GCC_VIDEO_AXI0_CLK					182
 #define GCC_VIDEO_AXI1_CLK					183
 #define GCC_VIDEO_XO_CLK					184
+#define GCC_GPLL2						185
+#define GCC_GPLL3						186
 
 /* GCC resets */
 #define GCC_CAMERA_BCR						0

From 6bbf3e84156694758ebcf55961d8da6bf7fea3a9 Mon Sep 17 00:00:00 2001
From: Danila Tikhonov <danila@jiaxyga.com>
Date: Sun, 28 Jan 2024 01:32:46 +0300
Subject: [PATCH 682/707] HACK! clk: qcom: gcc-sm8450: Add SM8475 support

---
 drivers/clk/qcom/gcc-sm8450.c | 180 ++++++++++++++++++++++++++++++++++
 1 file changed, 180 insertions(+)

diff --git a/drivers/clk/qcom/gcc-sm8450.c b/drivers/clk/qcom/gcc-sm8450.c
index 56354298255160..ed61f40830a088 100644
--- a/drivers/clk/qcom/gcc-sm8450.c
+++ b/drivers/clk/qcom/gcc-sm8450.c
@@ -26,6 +26,8 @@ enum {
 	P_BI_TCXO,
 	P_GCC_GPLL0_OUT_EVEN,
 	P_GCC_GPLL0_OUT_MAIN,
+	P_GCC_GPLL2_OUT_EVEN,
+	P_GCC_GPLL3_OUT_EVEN,
 	P_GCC_GPLL4_OUT_MAIN,
 	P_GCC_GPLL9_OUT_MAIN,
 	P_PCIE_1_PHY_AUX_CLK,
@@ -36,6 +38,15 @@ enum {
 	P_USB3_PHY_WRAPPER_GCC_USB30_PIPE_CLK,
 };
 
+static struct clk_init_data gcc_gpll0_sm8475_init = {
+	.name = "gcc_gpll0",
+	.parent_data = &(const struct clk_parent_data){
+		.fw_name = "bi_tcxo",
+	},
+	.num_parents = 1,
+	.ops = &clk_alpha_pll_fixed_lucid_ole_ops,
+};
+
 static struct clk_alpha_pll gcc_gpll0 = {
 	.offset = 0x0,
 	.regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_LUCID_EVO],
@@ -53,6 +64,15 @@ static struct clk_alpha_pll gcc_gpll0 = {
 	},
 };
 
+static struct clk_init_data gcc_gpll0_out_even_sm8475_init = {
+	.name = "gcc_gpll0_out_even",
+	.parent_hws = (const struct clk_hw*[]) {
+		&gcc_gpll0.clkr.hw,
+	},
+	.num_parents = 1,
+	.ops = &clk_alpha_pll_postdiv_lucid_ole_ops,
+};
+
 static const struct clk_div_table post_div_table_gcc_gpll0_out_even[] = {
 	{ 0x1, 2 },
 	{ }
@@ -75,6 +95,49 @@ static struct clk_alpha_pll_postdiv gcc_gpll0_out_even = {
 	},
 };
 
+static struct clk_alpha_pll gcc_gpll2 = {
+	.offset = 0x2000,
+	.regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_LUCID_OLE],
+	.clkr = {
+		.enable_reg = 0x62018,
+		.enable_mask = BIT(2),
+		.hw.init = &(struct clk_init_data){
+			.name = "gcc_gpll2",
+			.parent_data = &(const struct clk_parent_data){
+				.fw_name = "bi_tcxo",
+			},
+			.num_parents = 1,
+			.ops = &clk_alpha_pll_fixed_lucid_ole_ops,
+		},
+	},
+};
+
+static struct clk_alpha_pll gcc_gpll3 = {
+	.offset = 0x3000,
+	.regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_LUCID_OLE],
+	.clkr = {
+		.enable_reg = 0x62018,
+		.enable_mask = BIT(3),
+		.hw.init = &(struct clk_init_data){
+			.name = "gcc_gpll3",
+			.parent_data = &(const struct clk_parent_data){
+				.fw_name = "bi_tcxo",
+			},
+			.num_parents = 1,
+			.ops = &clk_alpha_pll_fixed_lucid_ole_ops,
+		},
+	},
+};
+
+static struct clk_init_data gcc_gpll4_sm8475_init = {
+	.name = "gcc_gpll4",
+	.parent_data = &(const struct clk_parent_data){
+		.fw_name = "bi_tcxo",
+	},
+	.num_parents = 1,
+	.ops = &clk_alpha_pll_fixed_lucid_ole_ops,
+};
+
 static struct clk_alpha_pll gcc_gpll4 = {
 	.offset = 0x4000,
 	.regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_LUCID_EVO],
@@ -92,6 +155,15 @@ static struct clk_alpha_pll gcc_gpll4 = {
 	},
 };
 
+static struct clk_init_data gcc_gpll9_sm8475_init = {
+	.name = "gcc_gpll9",
+	.parent_data = &(const struct clk_parent_data){
+		.fw_name = "bi_tcxo",
+	},
+	.num_parents = 1,
+	.ops = &clk_alpha_pll_fixed_lucid_ole_ops,
+};
+
 static struct clk_alpha_pll gcc_gpll9 = {
 	.offset = 0x9000,
 	.regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_LUCID_EVO],
@@ -153,6 +225,22 @@ static const struct clk_parent_data gcc_parent_data_3[] = {
 	{ .fw_name = "bi_tcxo" },
 };
 
+static const struct parent_map gcc_parent_map_sm8475_3[] = {
+	{ P_BI_TCXO, 0 },
+	{ P_GCC_GPLL0_OUT_MAIN, 1 },
+	{ P_GCC_GPLL2_OUT_EVEN, 2 },
+	{ P_GCC_GPLL3_OUT_EVEN, 3 },
+	{ P_GCC_GPLL0_OUT_EVEN, 6 },
+};
+
+static const struct clk_parent_data gcc_parent_data_sm8475_3[] = {
+	{ .fw_name = "bi_tcxo" },
+	{ .hw = &gcc_gpll0.clkr.hw },
+	{ .hw = &gcc_gpll2.clkr.hw },
+	{ .hw = &gcc_gpll3.clkr.hw },
+	{ .hw = &gcc_gpll0_out_even.clkr.hw },
+};
+
 static const struct parent_map gcc_parent_map_5[] = {
 	{ P_PCIE_1_PHY_AUX_CLK, 0 },
 	{ P_BI_TCXO, 2 },
@@ -915,6 +1003,16 @@ static struct clk_rcg2 gcc_qupv3_wrap2_s6_clk_src = {
 	.clkr.hw.init = &gcc_qupv3_wrap2_s6_clk_src_init,
 };
 
+static const struct freq_tbl ftbl_gcc_sdcc2_apps_clk_src_sm8475[] = {
+	F(400000, P_BI_TCXO, 12, 1, 4),
+	F(25000000, P_GCC_GPLL0_OUT_EVEN, 12, 0, 0),
+	F(37000000, P_GCC_GPLL9_OUT_MAIN, 16, 0, 0),
+	F(50000000, P_GCC_GPLL0_OUT_EVEN, 6, 0, 0),
+	F(100000000, P_GCC_GPLL0_OUT_EVEN, 3, 0, 0),
+	F(148000000, P_GCC_GPLL9_OUT_MAIN, 4, 0, 0),
+	{ }
+};
+
 static const struct freq_tbl ftbl_gcc_sdcc2_apps_clk_src[] = {
 	F(400000, P_BI_TCXO, 12, 1, 4),
 	F(25000000, P_GCC_GPLL0_OUT_EVEN, 12, 0, 0),
@@ -963,6 +1061,25 @@ static struct clk_rcg2 gcc_sdcc4_apps_clk_src = {
 	},
 };
 
+static const struct freq_tbl ftbl_gcc_ufs_phy_axi_clk_src_sm8475[] = {
+	F(25000000, P_GCC_GPLL0_OUT_EVEN, 12, 0, 0),
+	F(75000000, P_GCC_GPLL0_OUT_EVEN, 4, 0, 0),
+	F(150000000, P_GCC_GPLL0_OUT_MAIN, 4, 0, 0),
+	F(300000000, P_GCC_GPLL0_OUT_MAIN, 2, 0, 0),
+	F(600000000, P_GCC_GPLL0_OUT_MAIN, 1, 0, 0),
+	F(806400000, P_GCC_GPLL2_OUT_EVEN, 1, 0, 0),
+	F(850000000, P_GCC_GPLL2_OUT_EVEN, 1, 0, 0),
+	{ }
+};
+
+static struct clk_init_data gcc_ufs_phy_axi_clk_src_sm8475_init = {
+	.name = "gcc_ufs_phy_axi_clk_src",
+	.parent_data = gcc_parent_data_sm8475_3,
+	.num_parents = ARRAY_SIZE(gcc_parent_map_sm8475_3),
+	.flags = CLK_SET_RATE_PARENT,
+	.ops = &clk_rcg2_ops,
+};
+
 static const struct freq_tbl ftbl_gcc_ufs_phy_axi_clk_src[] = {
 	F(25000000, P_GCC_GPLL0_OUT_EVEN, 12, 0, 0),
 	F(75000000, P_GCC_GPLL0_OUT_EVEN, 4, 0, 0),
@@ -987,6 +1104,24 @@ static struct clk_rcg2 gcc_ufs_phy_axi_clk_src = {
 	},
 };
 
+static const struct freq_tbl ftbl_gcc_ufs_phy_ice_core_clk_src_sm8475[] = {
+	F(75000000, P_GCC_GPLL0_OUT_EVEN, 4, 0, 0),
+	F(150000000, P_GCC_GPLL0_OUT_MAIN, 4, 0, 0),
+	F(300000000, P_GCC_GPLL0_OUT_MAIN, 2, 0, 0),
+	F(600000000, P_GCC_GPLL0_OUT_MAIN, 1, 0, 0),
+	F(806400000, P_GCC_GPLL2_OUT_EVEN, 1, 0, 0),
+	F(850000000, P_GCC_GPLL2_OUT_EVEN, 1, 0, 0),
+	{ }
+};
+
+static struct clk_init_data gcc_ufs_phy_ice_core_clk_src_sm8475_init = {
+	.name = "gcc_ufs_phy_ice_core_clk_src",
+	.parent_data = gcc_parent_data_sm8475_3,
+	.num_parents = ARRAY_SIZE(gcc_parent_map_sm8475_3),
+	.flags = CLK_SET_RATE_PARENT,
+	.ops = &clk_rcg2_ops,
+};
+
 static const struct freq_tbl ftbl_gcc_ufs_phy_ice_core_clk_src[] = {
 	F(75000000, P_GCC_GPLL0_OUT_EVEN, 4, 0, 0),
 	F(150000000, P_GCC_GPLL0_OUT_MAIN, 4, 0, 0),
@@ -1032,6 +1167,14 @@ static struct clk_rcg2 gcc_ufs_phy_phy_aux_clk_src = {
 	},
 };
 
+static struct clk_init_data gcc_ufs_phy_unipro_core_clk_src_sm8475_init = {
+	.name = "gcc_ufs_phy_unipro_core_clk_src",
+	.parent_data = gcc_parent_data_sm8475_3,
+	.num_parents = ARRAY_SIZE(gcc_parent_map_sm8475_3),
+	.flags = CLK_SET_RATE_PARENT,
+	.ops = &clk_rcg2_ops,
+};
+
 static struct clk_rcg2 gcc_ufs_phy_unipro_core_clk_src = {
 	.cmd_rcgr = 0x8708c,
 	.mnd_width = 0,
@@ -3166,6 +3309,8 @@ static struct clk_regmap *gcc_sm8450_clocks[] = {
 	[GCC_USB3_PRIM_PHY_PIPE_CLK_SRC] = &gcc_usb3_prim_phy_pipe_clk_src.clkr,
 	[GCC_VIDEO_AXI0_CLK] = &gcc_video_axi0_clk.clkr,
 	[GCC_VIDEO_AXI1_CLK] = &gcc_video_axi1_clk.clkr,
+	[GCC_GPLL2] = &gcc_gpll2.clkr,
+	[GCC_GPLL3] = &gcc_gpll3.clkr,
 };
 
 static const struct qcom_reset_map gcc_sm8450_resets[] = {
@@ -3259,6 +3404,7 @@ static const struct qcom_cc_desc gcc_sm8450_desc = {
 
 static const struct of_device_id gcc_sm8450_match_table[] = {
 	{ .compatible = "qcom,gcc-sm8450" },
+	{ .compatible = "qcom,gcc-sm8475" },
 	{ }
 };
 MODULE_DEVICE_TABLE(of, gcc_sm8450_match_table);
@@ -3277,6 +3423,40 @@ static int gcc_sm8450_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
+	if (of_device_is_compatible(pdev->dev.of_node, "qcom,gcc-sm8475")) {
+		/* Update GCC PLL0 Config */
+		gcc_gpll0.regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_LUCID_OLE];
+		gcc_gpll0.clkr.hw.init = &gcc_gpll0_sm8475_init;
+
+		gcc_gpll0_out_even.regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_LUCID_OLE];
+		gcc_gpll0_out_even.clkr.hw.init = &gcc_gpll0_out_even_sm8475_init;
+
+		/* Update GCC PLL4 Config */
+		gcc_gpll4.regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_LUCID_OLE];
+		gcc_gpll4.clkr.hw.init = &gcc_gpll4_sm8475_init;
+
+		/* Update GCC PLL9 Config */
+		gcc_gpll9.regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_LUCID_OLE];
+		gcc_gpll9.clkr.hw.init = &gcc_gpll9_sm8475_init;
+
+		gcc_sdcc2_apps_clk_src.freq_tbl = ftbl_gcc_sdcc2_apps_clk_src_sm8475;
+
+		gcc_ufs_phy_axi_clk_src.parent_map = gcc_parent_map_sm8475_3;
+		gcc_ufs_phy_axi_clk_src.freq_tbl = ftbl_gcc_ufs_phy_axi_clk_src_sm8475;
+		gcc_ufs_phy_axi_clk_src.clkr.hw.init = &gcc_ufs_phy_axi_clk_src_sm8475_init;
+
+		gcc_ufs_phy_ice_core_clk_src.parent_map = gcc_parent_map_sm8475_3;
+		gcc_ufs_phy_ice_core_clk_src.freq_tbl = ftbl_gcc_ufs_phy_ice_core_clk_src_sm8475;
+		gcc_ufs_phy_ice_core_clk_src.clkr.hw.init = &gcc_ufs_phy_ice_core_clk_src_sm8475_init;
+
+		gcc_ufs_phy_unipro_core_clk_src.parent_map = gcc_parent_map_sm8475_3;
+		gcc_ufs_phy_unipro_core_clk_src.freq_tbl = ftbl_gcc_ufs_phy_ice_core_clk_src_sm8475;
+		gcc_ufs_phy_unipro_core_clk_src.clkr.hw.init = &gcc_ufs_phy_unipro_core_clk_src_sm8475_init;
+	} else {
+		gcc_sm8450_desc.clks[GCC_GPLL2] = NULL;
+		gcc_sm8450_desc.clks[GCC_GPLL3] = NULL;
+	}
+
 	/* FORCE_MEM_CORE_ON for ufs phy ice core clocks */
 	regmap_update_bits(regmap, gcc_ufs_phy_ice_core_clk.halt_reg, BIT(14), BIT(14));
 

From 67ec195fada65be7cfceb604e74f1dc2e51b9a89 Mon Sep 17 00:00:00 2001
From: Danila Tikhonov <danila@jiaxyga.com>
Date: Sun, 28 Jan 2024 20:04:45 +0300
Subject: [PATCH 683/707] HACK! clk: qcom: dispcc-sm8450: Add SM8475 support

Signed-off-by: Danila Tikhonov <danila@jiaxyga.com>
---
 drivers/clk/qcom/dispcc-sm8450.c | 57 ++++++++++++++++++++++++++++++--
 1 file changed, 55 insertions(+), 2 deletions(-)

diff --git a/drivers/clk/qcom/dispcc-sm8450.c b/drivers/clk/qcom/dispcc-sm8450.c
index 2c4aecd75186b0..5273413985b313 100644
--- a/drivers/clk/qcom/dispcc-sm8450.c
+++ b/drivers/clk/qcom/dispcc-sm8450.c
@@ -75,7 +75,7 @@ static struct pll_vco lucid_evo_vco[] = {
 	{ 249600000, 2000000000, 0 },
 };
 
-static const struct alpha_pll_config disp_cc_pll0_config = {
+static struct alpha_pll_config disp_cc_pll0_config = {
 	.l = 0xD,
 	.alpha = 0x6492,
 	.config_ctl_val = 0x20485699,
@@ -85,6 +85,15 @@ static const struct alpha_pll_config disp_cc_pll0_config = {
 	.user_ctl_hi_val = 0x00000805,
 };
 
+static struct clk_init_data disp_cc_pll0_sm8475_init = {
+	.name = "disp_cc_pll0",
+	.parent_data = &(const struct clk_parent_data) {
+		.index = DT_BI_TCXO,
+	},
+	.num_parents = 1,
+	.ops = &clk_alpha_pll_reset_lucid_ole_ops,
+};
+
 static struct clk_alpha_pll disp_cc_pll0 = {
 	.offset = 0x0,
 	.vco_table = lucid_evo_vco,
@@ -102,7 +111,7 @@ static struct clk_alpha_pll disp_cc_pll0 = {
 	},
 };
 
-static const struct alpha_pll_config disp_cc_pll1_config = {
+static struct alpha_pll_config disp_cc_pll1_config = {
 	.l = 0x1F,
 	.alpha = 0x4000,
 	.config_ctl_val = 0x20485699,
@@ -112,6 +121,15 @@ static const struct alpha_pll_config disp_cc_pll1_config = {
 	.user_ctl_hi_val = 0x00000805,
 };
 
+static struct clk_init_data disp_cc_pll1_sm8475_init = {
+	.name = "disp_cc_pll1",
+	.parent_data = &(const struct clk_parent_data) {
+		.index = DT_BI_TCXO,
+	},
+	.num_parents = 1,
+	.ops = &clk_alpha_pll_reset_lucid_ole_ops,
+};
+
 static struct clk_alpha_pll disp_cc_pll1 = {
 	.offset = 0x1000,
 	.vco_table = lucid_evo_vco,
@@ -1758,6 +1776,7 @@ static struct qcom_cc_desc disp_cc_sm8450_desc = {
 
 static const struct of_device_id disp_cc_sm8450_match_table[] = {
 	{ .compatible = "qcom,sm8450-dispcc" },
+	{ .compatible = "qcom,sm8475-dispcc" },
 	{ }
 };
 MODULE_DEVICE_TABLE(of, disp_cc_sm8450_match_table);
@@ -1781,6 +1800,40 @@ static int disp_cc_sm8450_probe(struct platform_device *pdev)
 		goto err_put_rpm;
 	}
 
+	if (of_device_is_compatible(pdev->dev.of_node, "qcom,sm8475-dispcc")) {
+		/* Update DISPCC PLL0 Config */
+		disp_cc_pll0_config.l = 0xD;
+		disp_cc_pll0_config.alpha = 0x6492;
+		disp_cc_pll0_config.config_ctl_val = 0x20485699;
+		disp_cc_pll0_config.config_ctl_hi_val = 0x00182261;
+		disp_cc_pll0_config.config_ctl_hi1_val = 0x82AA299C;
+		disp_cc_pll0_config.test_ctl_val = 0x00000000;
+		disp_cc_pll0_config.test_ctl_hi_val = 0x00000003;
+		disp_cc_pll0_config.test_ctl_hi1_val = 0x00009000;
+		disp_cc_pll0_config.test_ctl_hi2_val = 0x00000034;
+		disp_cc_pll0_config.user_ctl_val = 0x00000000;
+		disp_cc_pll0_config.user_ctl_hi_val = 0x00000005;
+
+		disp_cc_pll0.regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_LUCID_OLE];
+		disp_cc_pll0.clkr.hw.init = &disp_cc_pll0_sm8475_init;
+
+		/* Update DISPCC PLL1 Config */
+		disp_cc_pll1_config.l = 0x1F;
+		disp_cc_pll1_config.alpha = 0x4000;
+		disp_cc_pll1_config.config_ctl_val = 0x20485699;
+		disp_cc_pll1_config.config_ctl_hi_val = 0x00182261;
+		disp_cc_pll1_config.config_ctl_hi1_val = 0x82AA299C;
+		disp_cc_pll1_config.test_ctl_val = 0x00000000;
+		disp_cc_pll1_config.test_ctl_hi_val = 0x00000003;
+		disp_cc_pll1_config.test_ctl_hi1_val = 0x00009000;
+		disp_cc_pll1_config.test_ctl_hi2_val = 0x00000034;
+		disp_cc_pll1_config.user_ctl_val = 0x00000000;
+		disp_cc_pll1_config.user_ctl_hi_val = 0x00000005;
+
+		disp_cc_pll1.regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_LUCID_OLE];
+		disp_cc_pll1.clkr.hw.init = &disp_cc_pll1_sm8475_init;
+	}
+
 	clk_lucid_evo_pll_configure(&disp_cc_pll0, regmap, &disp_cc_pll0_config);
 	clk_lucid_evo_pll_configure(&disp_cc_pll1, regmap, &disp_cc_pll1_config);
 

From aa4673de679ea3327cc7f3c2f2f973421ea903e8 Mon Sep 17 00:00:00 2001
From: Danila Tikhonov <danila@jiaxyga.com>
Date: Sun, 28 Jan 2024 20:19:53 +0300
Subject: [PATCH 684/707] HACK! clk: qcom: gpucc-sm8450: Add SM8475 support

Signed-off-by: Danila Tikhonov <danila@jiaxyga.com>
---
 drivers/clk/qcom/gpucc-sm8450.c | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/drivers/clk/qcom/gpucc-sm8450.c b/drivers/clk/qcom/gpucc-sm8450.c
index 1c4769b646b0ee..b284b4988cdc0d 100644
--- a/drivers/clk/qcom/gpucc-sm8450.c
+++ b/drivers/clk/qcom/gpucc-sm8450.c
@@ -736,6 +736,7 @@ static const struct qcom_cc_desc gpu_cc_sm8450_desc = {
 
 static const struct of_device_id gpu_cc_sm8450_match_table[] = {
 	{ .compatible = "qcom,sm8450-gpucc" },
+	{ .compatible = "qcom,sm8475-gpucc" },
 	{ }
 };
 MODULE_DEVICE_TABLE(of, gpu_cc_sm8450_match_table);
@@ -748,6 +749,38 @@ static int gpu_cc_sm8450_probe(struct platform_device *pdev)
 	if (IS_ERR(regmap))
 		return PTR_ERR(regmap);
 
+	if (of_device_is_compatible(pdev->dev.of_node, "qcom,sm8475-gpucc")) {
+		/* Update GPUCC PLL0 Config */
+		gpu_cc_pll0_config.l = 0x1D;
+		gpu_cc_pll0_config.alpha = 0xB000;
+		gpu_cc_pll0_config.config_ctl_val = 0x20485699;
+		gpu_cc_pll0_config.config_ctl_hi_val = 0x00182261;
+		gpu_cc_pll0_config.config_ctl_hi1_val = 0x82AA299C;
+		gpu_cc_pll0_config.test_ctl_val = 0x00000000;
+		gpu_cc_pll0_config.test_ctl_hi_val = 0x00000003;
+		gpu_cc_pll0_config.test_ctl_hi1_val = 0x00009000;
+		gpu_cc_pll0_config.test_ctl_hi2_val = 0x00000034;
+		gpu_cc_pll0_config.user_ctl_val = 0x00000000;
+		gpu_cc_pll0_config.user_ctl_hi_val = 0x00000005;
+
+		gpu_cc_pll0.regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_LUCID_OLE];
+
+		/* Update GPUCC PLL1 Config */
+		gpu_cc_pll1_config.l = 0x34;
+		gpu_cc_pll1_config.alpha = 0x1555;
+		gpu_cc_pll1_config.config_ctl_val = 0x20485699;
+		gpu_cc_pll1_config.config_ctl_hi_val = 0x00182261;
+		gpu_cc_pll1_config.config_ctl_hi1_val = 0x82AA299C;
+		gpu_cc_pll1_config.test_ctl_val = 0x00000000;
+		gpu_cc_pll1_config.test_ctl_hi_val = 0x00000003;
+		gpu_cc_pll1_config.test_ctl_hi1_val = 0x00009000;
+		gpu_cc_pll1_config.test_ctl_hi2_val = 0x00000034;
+		gpu_cc_pll1_config.user_ctl_val = 0x00000000;
+		gpu_cc_pll1_config.user_ctl_hi_val = 0x00000005;
+
+		gpu_cc_pll1.regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_LUCID_OLE];
+	}
+
 	clk_lucid_evo_pll_configure(&gpu_cc_pll0, regmap, &gpu_cc_pll0_config);
 	clk_lucid_evo_pll_configure(&gpu_cc_pll1, regmap, &gpu_cc_pll1_config);
 

From 74d9d575d7d0131a4a02fae0b326e72864703d62 Mon Sep 17 00:00:00 2001
From: Danila Tikhonov <danila@jiaxyga.com>
Date: Sun, 28 Jan 2024 20:28:15 +0300
Subject: [PATCH 685/707] HACK! clk: qcom: videocc-sm8450: Add SM8475 support

Signed-off-by: Danila Tikhonov <danila@jiaxyga.com>
---
 drivers/clk/qcom/videocc-sm8450.c | 37 +++++++++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)

diff --git a/drivers/clk/qcom/videocc-sm8450.c b/drivers/clk/qcom/videocc-sm8450.c
index 16a61146e61957..540459b593f4bb 100644
--- a/drivers/clk/qcom/videocc-sm8450.c
+++ b/drivers/clk/qcom/videocc-sm8450.c
@@ -35,7 +35,7 @@ static const struct pll_vco lucid_evo_vco[] = {
 	{ 249600000, 2020000000, 0 },
 };
 
-static const struct alpha_pll_config video_cc_pll0_config = {
+static struct alpha_pll_config video_cc_pll0_config = {
 	/* .l includes CAL_L_VAL, L_VAL fields */
 	.l = 0x0044001e,
 	.alpha = 0x0,
@@ -63,7 +63,7 @@ static struct clk_alpha_pll video_cc_pll0 = {
 	},
 };
 
-static const struct alpha_pll_config video_cc_pll1_config = {
+static struct alpha_pll_config video_cc_pll1_config = {
 	/* .l includes CAL_L_VAL, L_VAL fields */
 	.l = 0x0044002b,
 	.alpha = 0xc000,
@@ -397,6 +397,7 @@ static struct qcom_cc_desc video_cc_sm8450_desc = {
 
 static const struct of_device_id video_cc_sm8450_match_table[] = {
 	{ .compatible = "qcom,sm8450-videocc" },
+	{ .compatible = "qcom,sm8475-videocc" },
 	{ }
 };
 MODULE_DEVICE_TABLE(of, video_cc_sm8450_match_table);
@@ -420,6 +421,38 @@ static int video_cc_sm8450_probe(struct platform_device *pdev)
 		return PTR_ERR(regmap);
 	}
 
+	if (of_device_is_compatible(pdev->dev.of_node, "qcom,sm8475-videocc")) {
+		/* Update VideoCC PLL0 Config */
+		video_cc_pll0_config.l = 0x1E;
+		video_cc_pll0_config.alpha = 0x0;
+		video_cc_pll0_config.config_ctl_val = 0x20485699;
+		video_cc_pll0_config.config_ctl_hi_val = 0x00182261;
+		video_cc_pll0_config.config_ctl_hi1_val = 0x82AA299C;
+		video_cc_pll0_config.test_ctl_val = 0x00000000;
+		video_cc_pll0_config.test_ctl_hi_val = 0x00000003;
+		video_cc_pll0_config.test_ctl_hi1_val = 0x00009000;
+		video_cc_pll0_config.test_ctl_hi2_val = 0x00000034;
+		video_cc_pll0_config.user_ctl_val = 0x00000000;
+		video_cc_pll0_config.user_ctl_hi_val = 0x00000005;
+
+		video_cc_pll0.regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_LUCID_OLE];
+
+		/* Update VideoCC PLL1 Config */
+		video_cc_pll1_config.l = 0x2B;
+		video_cc_pll1_config.alpha = 0xC000;
+		video_cc_pll1_config.config_ctl_val = 0x20485699;
+		video_cc_pll1_config.config_ctl_hi_val = 0x00182261;
+		video_cc_pll1_config.config_ctl_hi1_val = 0x82AA299C;
+		video_cc_pll1_config.test_ctl_val = 0x00000000;
+		video_cc_pll1_config.test_ctl_hi_val = 0x00000003;
+		video_cc_pll1_config.test_ctl_hi1_val = 0x00009000;
+		video_cc_pll1_config.test_ctl_hi2_val = 0x00000034;
+		video_cc_pll1_config.user_ctl_val = 0x00000000;
+		video_cc_pll1_config.user_ctl_hi_val = 0x00000005;
+
+		video_cc_pll1.regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_LUCID_OLE];
+	}
+
 	clk_lucid_evo_pll_configure(&video_cc_pll0, regmap, &video_cc_pll0_config);
 	clk_lucid_evo_pll_configure(&video_cc_pll1, regmap, &video_cc_pll1_config);
 

From afd68270cd34b9d2a0c99ce4688e5e5e14057117 Mon Sep 17 00:00:00 2001
From: Danila Tikhonov <danila@jiaxyga.com>
Date: Tue, 30 Jan 2024 18:23:51 +0300
Subject: [PATCH 686/707] HACK! clk: qcom: camcc-sm8450: Add SM8475 support

Signed-off-by: Danila Tikhonov <danila@jiaxyga.com>
---
 drivers/clk/qcom/camcc-sm8450.c | 272 ++++++++++++++++++++++++++++++--
 1 file changed, 263 insertions(+), 9 deletions(-)

diff --git a/drivers/clk/qcom/camcc-sm8450.c b/drivers/clk/qcom/camcc-sm8450.c
index 51338a2884d2e2..43ee961ba5be53 100644
--- a/drivers/clk/qcom/camcc-sm8450.c
+++ b/drivers/clk/qcom/camcc-sm8450.c
@@ -54,9 +54,13 @@ static const struct pll_vco rivian_evo_vco[] = {
 	{ 864000000, 1056000000, 0 },
 };
 
+static const struct pll_vco rivian_ole_vco[] = {
+	{ 864000000, 1075000000, 0 },
+};
+
 static const struct clk_parent_data pll_parent_data_tcxo = { .index = DT_BI_TCXO };
 
-static const struct alpha_pll_config cam_cc_pll0_config = {
+static struct alpha_pll_config cam_cc_pll0_config = {
 	.l = 0x3e,
 	.alpha = 0x8000,
 	.config_ctl_val = 0x20485699,
@@ -86,6 +90,16 @@ static const struct clk_div_table post_div_table_cam_cc_pll0_out_even[] = {
 	{ }
 };
 
+static struct clk_init_data cam_cc_pll0_out_even_sm8475_init = {
+	.name = "cam_cc_pll0_out_even",
+	.parent_hws = (const struct clk_hw*[]) {
+		&cam_cc_pll0.clkr.hw,
+	},
+	.num_parents = 1,
+	.flags = CLK_SET_RATE_PARENT,
+	.ops = &clk_alpha_pll_postdiv_lucid_ole_ops,
+};
+
 static struct clk_alpha_pll_postdiv cam_cc_pll0_out_even = {
 	.offset = 0x0,
 	.post_div_shift = 10,
@@ -109,6 +123,16 @@ static const struct clk_div_table post_div_table_cam_cc_pll0_out_odd[] = {
 	{ }
 };
 
+static struct clk_init_data cam_cc_pll0_out_odd_sm8475_init = {
+	.name = "cam_cc_pll0_out_odd",
+	.parent_hws = (const struct clk_hw*[]) {
+		&cam_cc_pll0.clkr.hw,
+	},
+	.num_parents = 1,
+	.flags = CLK_SET_RATE_PARENT,
+	.ops = &clk_alpha_pll_postdiv_lucid_ole_ops,
+};
+
 static struct clk_alpha_pll_postdiv cam_cc_pll0_out_odd = {
 	.offset = 0x0,
 	.post_div_shift = 14,
@@ -127,7 +151,7 @@ static struct clk_alpha_pll_postdiv cam_cc_pll0_out_odd = {
 	},
 };
 
-static const struct alpha_pll_config cam_cc_pll1_config = {
+static struct alpha_pll_config cam_cc_pll1_config = {
 	.l = 0x25,
 	.alpha = 0xeaaa,
 	.config_ctl_val = 0x20485699,
@@ -157,6 +181,16 @@ static const struct clk_div_table post_div_table_cam_cc_pll1_out_even[] = {
 	{ }
 };
 
+static struct clk_init_data cam_cc_pll1_out_even_sm8475_init = {
+	.name = "cam_cc_pll1_out_even",
+	.parent_hws = (const struct clk_hw*[]) {
+		&cam_cc_pll1.clkr.hw,
+	},
+	.num_parents = 1,
+	.flags = CLK_SET_RATE_PARENT,
+	.ops = &clk_alpha_pll_postdiv_lucid_ole_ops,
+};
+
 static struct clk_alpha_pll_postdiv cam_cc_pll1_out_even = {
 	.offset = 0x1000,
 	.post_div_shift = 10,
@@ -175,7 +209,7 @@ static struct clk_alpha_pll_postdiv cam_cc_pll1_out_even = {
 	},
 };
 
-static const struct alpha_pll_config cam_cc_pll2_config = {
+static struct alpha_pll_config cam_cc_pll2_config = {
 	.l = 0x32,
 	.alpha = 0x0,
 	.config_ctl_val = 0x90008820,
@@ -198,7 +232,7 @@ static struct clk_alpha_pll cam_cc_pll2 = {
 	},
 };
 
-static const struct alpha_pll_config cam_cc_pll3_config = {
+static struct alpha_pll_config cam_cc_pll3_config = {
 	.l = 0x2d,
 	.alpha = 0x0,
 	.config_ctl_val = 0x20485699,
@@ -228,6 +262,16 @@ static const struct clk_div_table post_div_table_cam_cc_pll3_out_even[] = {
 	{ }
 };
 
+static struct clk_init_data cam_cc_pll3_out_even_sm8475_init = {
+	.name = "cam_cc_pll3_out_even",
+	.parent_hws = (const struct clk_hw*[]) {
+		&cam_cc_pll3.clkr.hw,
+	},
+	.num_parents = 1,
+	.flags = CLK_SET_RATE_PARENT,
+	.ops = &clk_alpha_pll_postdiv_lucid_ole_ops,
+};
+
 static struct clk_alpha_pll_postdiv cam_cc_pll3_out_even = {
 	.offset = 0x3000,
 	.post_div_shift = 10,
@@ -246,7 +290,7 @@ static struct clk_alpha_pll_postdiv cam_cc_pll3_out_even = {
 	},
 };
 
-static const struct alpha_pll_config cam_cc_pll4_config = {
+static struct alpha_pll_config cam_cc_pll4_config = {
 	.l = 0x2d,
 	.alpha = 0x0,
 	.config_ctl_val = 0x20485699,
@@ -276,6 +320,16 @@ static const struct clk_div_table post_div_table_cam_cc_pll4_out_even[] = {
 	{ }
 };
 
+static struct clk_init_data cam_cc_pll4_out_even_sm8475_init = {
+	.name = "cam_cc_pll4_out_even",
+	.parent_hws = (const struct clk_hw*[]) {
+		&cam_cc_pll4.clkr.hw,
+	},
+	.num_parents = 1,
+	.flags = CLK_SET_RATE_PARENT,
+	.ops = &clk_alpha_pll_postdiv_lucid_ole_ops,
+};
+
 static struct clk_alpha_pll_postdiv cam_cc_pll4_out_even = {
 	.offset = 0x4000,
 	.post_div_shift = 10,
@@ -294,7 +348,7 @@ static struct clk_alpha_pll_postdiv cam_cc_pll4_out_even = {
 	},
 };
 
-static const struct alpha_pll_config cam_cc_pll5_config = {
+static struct alpha_pll_config cam_cc_pll5_config = {
 	.l = 0x2d,
 	.alpha = 0x0,
 	.config_ctl_val = 0x20485699,
@@ -324,6 +378,16 @@ static const struct clk_div_table post_div_table_cam_cc_pll5_out_even[] = {
 	{ }
 };
 
+static struct clk_init_data cam_cc_pll5_out_even_sm8475_init = {
+	.name = "cam_cc_pll5_out_even",
+	.parent_hws = (const struct clk_hw*[]) {
+		&cam_cc_pll5.clkr.hw,
+	},
+	.num_parents = 1,
+	.flags = CLK_SET_RATE_PARENT,
+	.ops = &clk_alpha_pll_postdiv_lucid_ole_ops,
+};
+
 static struct clk_alpha_pll_postdiv cam_cc_pll5_out_even = {
 	.offset = 0x5000,
 	.post_div_shift = 10,
@@ -342,7 +406,7 @@ static struct clk_alpha_pll_postdiv cam_cc_pll5_out_even = {
 	},
 };
 
-static const struct alpha_pll_config cam_cc_pll6_config = {
+static struct alpha_pll_config cam_cc_pll6_config = {
 	.l = 0x2d,
 	.alpha = 0x0,
 	.config_ctl_val = 0x20485699,
@@ -372,6 +436,16 @@ static const struct clk_div_table post_div_table_cam_cc_pll6_out_even[] = {
 	{ }
 };
 
+static struct clk_init_data cam_cc_pll6_out_even_sm8475_init = {
+	.name = "cam_cc_pll6_out_even",
+	.parent_hws = (const struct clk_hw*[]) {
+		&cam_cc_pll6.clkr.hw,
+	},
+	.num_parents = 1,
+	.flags = CLK_SET_RATE_PARENT,
+	.ops = &clk_alpha_pll_postdiv_lucid_ole_ops,
+};
+
 static struct clk_alpha_pll_postdiv cam_cc_pll6_out_even = {
 	.offset = 0x6000,
 	.post_div_shift = 10,
@@ -390,7 +464,7 @@ static struct clk_alpha_pll_postdiv cam_cc_pll6_out_even = {
 	},
 };
 
-static const struct alpha_pll_config cam_cc_pll7_config = {
+static struct alpha_pll_config cam_cc_pll7_config = {
 	.l = 0x2d,
 	.alpha = 0x0,
 	.config_ctl_val = 0x20485699,
@@ -420,6 +494,16 @@ static const struct clk_div_table post_div_table_cam_cc_pll7_out_even[] = {
 	{ }
 };
 
+static struct clk_init_data cam_cc_pll7_out_even_sm8475_init = {
+	.name = "cam_cc_pll7_out_even",
+	.parent_hws = (const struct clk_hw*[]) {
+		&cam_cc_pll7.clkr.hw,
+	},
+	.num_parents = 1,
+	.flags = CLK_SET_RATE_PARENT,
+	.ops = &clk_alpha_pll_postdiv_lucid_ole_ops,
+};
+
 static struct clk_alpha_pll_postdiv cam_cc_pll7_out_even = {
 	.offset = 0x7000,
 	.post_div_shift = 10,
@@ -438,7 +522,7 @@ static struct clk_alpha_pll_postdiv cam_cc_pll7_out_even = {
 	},
 };
 
-static const struct alpha_pll_config cam_cc_pll8_config = {
+static struct alpha_pll_config cam_cc_pll8_config = {
 	.l = 0x32,
 	.alpha = 0x0,
 	.config_ctl_val = 0x20485699,
@@ -468,6 +552,16 @@ static const struct clk_div_table post_div_table_cam_cc_pll8_out_even[] = {
 	{ }
 };
 
+static struct clk_init_data cam_cc_pll8_out_even_sm8475_init = {
+	.name = "cam_cc_pll8_out_even",
+	.parent_hws = (const struct clk_hw*[]) {
+		&cam_cc_pll8.clkr.hw,
+	},
+	.num_parents = 1,
+	.flags = CLK_SET_RATE_PARENT,
+	.ops = &clk_alpha_pll_postdiv_lucid_ole_ops,
+};
+
 static struct clk_alpha_pll_postdiv cam_cc_pll8_out_even = {
 	.offset = 0x8000,
 	.post_div_shift = 10,
@@ -2817,6 +2911,7 @@ static const struct qcom_cc_desc cam_cc_sm8450_desc = {
 
 static const struct of_device_id cam_cc_sm8450_match_table[] = {
 	{ .compatible = "qcom,sm8450-camcc" },
+	{ .compatible = "qcom,sm8475-camcc" },
 	{ }
 };
 MODULE_DEVICE_TABLE(of, cam_cc_sm8450_match_table);
@@ -2829,6 +2924,165 @@ static int cam_cc_sm8450_probe(struct platform_device *pdev)
 	if (IS_ERR(regmap))
 		return PTR_ERR(regmap);
 
+	if (of_device_is_compatible(pdev->dev.of_node, "qcom,sm8475-camcc")) {
+		/* Update CAMCC PLL0 Config */
+		cam_cc_pll0_config.l = 0x3E;
+		cam_cc_pll0_config.alpha = 0x8000;
+		cam_cc_pll0_config.config_ctl_val = 0x20485699;
+		cam_cc_pll0_config.config_ctl_hi_val = 0x00182261;
+		cam_cc_pll0_config.config_ctl_hi1_val = 0x82AA299C;
+		cam_cc_pll0_config.test_ctl_val = 0x00000000;
+		cam_cc_pll0_config.test_ctl_hi_val = 0x00000003;
+		cam_cc_pll0_config.test_ctl_hi1_val = 0x00009000;
+		cam_cc_pll0_config.test_ctl_hi2_val = 0x00000034;
+		cam_cc_pll0_config.user_ctl_val = 0x00008400;
+		cam_cc_pll0_config.user_ctl_hi_val = 0x00000005;
+
+		cam_cc_pll0.regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_LUCID_OLE];
+
+		cam_cc_pll0_out_even.regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_LUCID_OLE];
+		cam_cc_pll0_out_even.clkr.hw.init = &cam_cc_pll0_out_even_sm8475_init;
+		cam_cc_pll0_out_odd.regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_LUCID_OLE];
+		cam_cc_pll0_out_odd.clkr.hw.init = &cam_cc_pll0_out_odd_sm8475_init;
+
+		/* Update CAMCC PLL1 Config */
+		cam_cc_pll1_config.l = 0x25;
+		cam_cc_pll1_config.alpha = 0xEAAA;
+		cam_cc_pll1_config.config_ctl_val = 0x20485699;
+		cam_cc_pll1_config.config_ctl_hi_val = 0x00182261;
+		cam_cc_pll1_config.config_ctl_hi1_val = 0x82AA299C;
+		cam_cc_pll1_config.test_ctl_val = 0x00000000;
+		cam_cc_pll1_config.test_ctl_hi_val = 0x00000003;
+		cam_cc_pll1_config.test_ctl_hi1_val = 0x00009000;
+		cam_cc_pll1_config.test_ctl_hi2_val = 0x00000034;
+		cam_cc_pll1_config.user_ctl_val = 0x00000400;
+		cam_cc_pll1_config.user_ctl_hi_val = 0x00000005;
+
+		cam_cc_pll1.regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_LUCID_OLE];
+
+		cam_cc_pll1_out_even.regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_LUCID_OLE];
+		cam_cc_pll1_out_even.clkr.hw.init = &cam_cc_pll1_out_even_sm8475_init;
+
+		/* Update CAMCC PLL2 Config */
+		cam_cc_pll2_config.l = 0x32;
+		cam_cc_pll2_config.alpha = 0x0;
+		cam_cc_pll2_config.config_ctl_val = 0x10000030;
+		cam_cc_pll2_config.config_ctl_hi_val = 0x80890263;
+		cam_cc_pll2_config.config_ctl_hi1_val = 0x00000217;
+		cam_cc_pll2_config.user_ctl_val = 0x00000001;
+		cam_cc_pll2_config.user_ctl_hi_val = 0x00000000;
+
+		cam_cc_pll2.vco_table = rivian_ole_vco;
+
+		/* Update CAMCC PLL3 Config */
+		cam_cc_pll3_config.l = 0x2D;
+		cam_cc_pll3_config.alpha = 0x0;
+		cam_cc_pll3_config.config_ctl_val = 0x20485699;
+		cam_cc_pll3_config.config_ctl_hi_val = 0x00182261;
+		cam_cc_pll3_config.config_ctl_hi1_val = 0x82AA299C;
+		cam_cc_pll3_config.test_ctl_val = 0x00000000;
+		cam_cc_pll3_config.test_ctl_hi_val = 0x00000003;
+		cam_cc_pll3_config.test_ctl_hi1_val = 0x00009000;
+		cam_cc_pll3_config.test_ctl_hi2_val = 0x00000034;
+		cam_cc_pll3_config.user_ctl_val = 0x00000400;
+		cam_cc_pll3_config.user_ctl_hi_val = 0x00000005;
+
+		cam_cc_pll3.regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_LUCID_OLE];
+
+		cam_cc_pll3_out_even.regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_LUCID_OLE];
+		cam_cc_pll3_out_even.clkr.hw.init = &cam_cc_pll3_out_even_sm8475_init;
+
+		/* Update CAMCC PLL4 Config */
+		cam_cc_pll4_config.l = 0x2D;
+		cam_cc_pll4_config.alpha = 0x0;
+		cam_cc_pll4_config.config_ctl_val = 0x20485699;
+		cam_cc_pll4_config.config_ctl_hi_val = 0x00182261;
+		cam_cc_pll4_config.config_ctl_hi1_val = 0x82AA299C;
+		cam_cc_pll4_config.test_ctl_val = 0x00000000;
+		cam_cc_pll4_config.test_ctl_hi_val = 0x00000003;
+		cam_cc_pll4_config.test_ctl_hi1_val = 0x00009000;
+		cam_cc_pll4_config.test_ctl_hi2_val = 0x00000034;
+		cam_cc_pll4_config.user_ctl_val = 0x00000400;
+		cam_cc_pll4_config.user_ctl_hi_val = 0x00000005;
+
+		cam_cc_pll4.regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_LUCID_OLE];
+
+		cam_cc_pll4_out_even.regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_LUCID_OLE];
+		cam_cc_pll4_out_even.clkr.hw.init = &cam_cc_pll4_out_even_sm8475_init;
+
+		/* Update CAMCC PLL5 Config */
+		cam_cc_pll5_config.l = 0x2D;
+		cam_cc_pll5_config.alpha = 0x0;
+		cam_cc_pll5_config.config_ctl_val = 0x20485699;
+		cam_cc_pll5_config.config_ctl_hi_val = 0x00182261;
+		cam_cc_pll5_config.config_ctl_hi1_val = 0x82AA299C;
+		cam_cc_pll5_config.test_ctl_val = 0x00000000;
+		cam_cc_pll5_config.test_ctl_hi_val = 0x00000003;
+		cam_cc_pll5_config.test_ctl_hi1_val = 0x00009000;
+		cam_cc_pll5_config.test_ctl_hi2_val = 0x00000034;
+		cam_cc_pll5_config.user_ctl_val = 0x00000400;
+		cam_cc_pll5_config.user_ctl_hi_val = 0x00000005;
+
+		cam_cc_pll5.regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_LUCID_OLE];
+
+		cam_cc_pll5_out_even.regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_LUCID_OLE];
+		cam_cc_pll5_out_even.clkr.hw.init = &cam_cc_pll5_out_even_sm8475_init;
+
+		/* Update CAMCC PLL6 Config */
+		cam_cc_pll6_config.l = 0x2D;
+		cam_cc_pll6_config.alpha = 0x0;
+		cam_cc_pll6_config.config_ctl_val = 0x20485699;
+		cam_cc_pll6_config.config_ctl_hi_val = 0x00182261;
+		cam_cc_pll6_config.config_ctl_hi1_val = 0x82AA299C;
+		cam_cc_pll6_config.test_ctl_val = 0x00000000;
+		cam_cc_pll6_config.test_ctl_hi_val = 0x00000003;
+		cam_cc_pll6_config.test_ctl_hi1_val = 0x00009000;
+		cam_cc_pll6_config.test_ctl_hi2_val = 0x00000034;
+		cam_cc_pll6_config.user_ctl_val = 0x00000400;
+		cam_cc_pll6_config.user_ctl_hi_val = 0x00000005;
+
+		cam_cc_pll6.regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_LUCID_OLE];
+
+		cam_cc_pll6_out_even.regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_LUCID_OLE];
+		cam_cc_pll6_out_even.clkr.hw.init = &cam_cc_pll6_out_even_sm8475_init;
+
+		/* Update CAMCC PLL7 Config */
+		cam_cc_pll7_config.l = 0x2D;
+		cam_cc_pll7_config.alpha = 0x0;
+		cam_cc_pll7_config.config_ctl_val = 0x20485699;
+		cam_cc_pll7_config.config_ctl_hi_val = 0x00182261;
+		cam_cc_pll7_config.config_ctl_hi1_val = 0x82AA299C;
+		cam_cc_pll7_config.test_ctl_val = 0x00000000;
+		cam_cc_pll7_config.test_ctl_hi_val = 0x00000003;
+		cam_cc_pll7_config.test_ctl_hi1_val = 0x00009000;
+		cam_cc_pll7_config.test_ctl_hi2_val = 0x00000034;
+		cam_cc_pll7_config.user_ctl_val = 0x00000400;
+		cam_cc_pll7_config.user_ctl_hi_val = 0x00000005;
+
+		cam_cc_pll7.regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_LUCID_OLE];
+
+		cam_cc_pll7_out_even.regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_LUCID_OLE];
+		cam_cc_pll7_out_even.clkr.hw.init = &cam_cc_pll7_out_even_sm8475_init;
+
+		/* Update CAMCC PLL8 Config */
+		cam_cc_pll8_config.l = 0x32;
+		cam_cc_pll8_config.alpha = 0x0;
+		cam_cc_pll8_config.config_ctl_val = 0x20485699;
+		cam_cc_pll8_config.config_ctl_hi_val = 0x00182261;
+		cam_cc_pll8_config.config_ctl_hi1_val = 0x82AA299C;
+		cam_cc_pll8_config.test_ctl_val = 0x00000000;
+		cam_cc_pll8_config.test_ctl_hi_val = 0x00000003;
+		cam_cc_pll8_config.test_ctl_hi1_val = 0x00009000;
+		cam_cc_pll8_config.test_ctl_hi2_val = 0x00000034;
+		cam_cc_pll8_config.user_ctl_val = 0x00000400;
+		cam_cc_pll8_config.user_ctl_hi_val = 0x00000005;
+
+		cam_cc_pll8.regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_LUCID_OLE];
+
+		cam_cc_pll8_out_even.regs = clk_alpha_pll_regs[CLK_ALPHA_PLL_TYPE_LUCID_OLE];
+		cam_cc_pll8_out_even.clkr.hw.init = &cam_cc_pll8_out_even_sm8475_init;
+	}
+
 	clk_lucid_evo_pll_configure(&cam_cc_pll0, regmap, &cam_cc_pll0_config);
 	clk_lucid_evo_pll_configure(&cam_cc_pll1, regmap, &cam_cc_pll1_config);
 	clk_rivian_evo_pll_configure(&cam_cc_pll2, regmap, &cam_cc_pll2_config);

From f5fa65898cee8c3404f42e13221b57e9062fd0b6 Mon Sep 17 00:00:00 2001
From: Danila Tikhonov <danila@jiaxyga.com>
Date: Sun, 28 Jan 2024 05:47:58 +0300
Subject: [PATCH 687/707] pinctrl: qcom: sm8450: Add pll_clk to pin group 98
 for SM8475

Add pll_clk to pin group 98 for compatibility with SM8475.

Signed-off-by: Danila Tikhonov <danila@jiaxyga.com>
---
 drivers/pinctrl/qcom/pinctrl-sm8450.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/pinctrl/qcom/pinctrl-sm8450.c b/drivers/pinctrl/qcom/pinctrl-sm8450.c
index 61728671169527..45ac8e72c1c753 100644
--- a/drivers/pinctrl/qcom/pinctrl-sm8450.c
+++ b/drivers/pinctrl/qcom/pinctrl-sm8450.c
@@ -957,7 +957,7 @@ static const char * const pll_bist_groups[] = {
 };
 
 static const char * const pll_clk_groups[] = {
-	"gpio107",
+	"gpio98", "gpio107",
 };
 
 static const char * const pri_mi2s_groups[] = {
@@ -1511,7 +1511,7 @@ static const struct msm_pingroup sm8450_groups[] = {
 	[95] = PINGROUP(95, pcie0_clkreqn, cmu_rng, phase_flag, _, _, _, _, _, _),
 	[96] = PINGROUP(96, cmu_rng, phase_flag, _, _, _, _, _, _, _),
 	[97] = PINGROUP(97, cmu_rng, phase_flag, _, _, _, _, _, _, _),
-	[98] = PINGROUP(98, pcie1_clkreqn, phase_flag, _, _, _, _, _, _, _),
+	[98] = PINGROUP(98, pcie1_clkreqn, phase_flag, pll_clk, _, _, _, _, _, _),
 	[99] = PINGROUP(99, phase_flag, cri_trng, _, _, _, _, _, _, _),
 	[100] = PINGROUP(100, cam_mclk, qdss_gpio, _, _, _, _, _, _, _),
 	[101] = PINGROUP(101, cam_mclk, qdss_gpio, _, _, _, _, _, _, _),

From f151446fb4e7d6aad13feda1c4ab17b5e3ab960e Mon Sep 17 00:00:00 2001
From: Danila Tikhonov <danila@jiaxyga.com>
Date: Sun, 28 Jan 2024 23:57:40 +0300
Subject: [PATCH 688/707] phy: qcom-qmp-ufs: Add SM8475 support

Add the tables and constants for init sequences for UFS QMP phy found in
SM8475 SoC.

Signed-off-by: Danila Tikhonov <danila@jiaxyga.com>
---
 drivers/phy/qualcomm/phy-qcom-qmp-ufs.c | 73 +++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-ufs.c b/drivers/phy/qualcomm/phy-qcom-qmp-ufs.c
index 38c4a4cc670a78..f22a830c7c903d 100644
--- a/drivers/phy/qualcomm/phy-qcom-qmp-ufs.c
+++ b/drivers/phy/qualcomm/phy-qcom-qmp-ufs.c
@@ -722,6 +722,38 @@ static const struct qmp_phy_init_tbl sm8350_ufsphy_g4_pcs[] = {
 	QMP_PHY_INIT_CFG(QPHY_V5_PCS_UFS_BIST_FIXED_PAT_CTRL, 0x0a),
 };
 
+static const struct qmp_phy_init_tbl sm8475_ufsphy_serdes[] = {
+	QMP_PHY_INIT_CFG(QSERDES_V6_COM_SYSCLK_EN_SEL, 0xd9),
+	QMP_PHY_INIT_CFG(QSERDES_V6_COM_CMN_CONFIG_1, 0x16),
+	QMP_PHY_INIT_CFG(QSERDES_V6_COM_HSCLK_SEL_1, 0x11),
+	QMP_PHY_INIT_CFG(QSERDES_V6_COM_HSCLK_HS_SWITCH_SEL_1, 0x00),
+	QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP_EN, 0x01),
+	QMP_PHY_INIT_CFG(QSERDES_V6_COM_VCO_TUNE_INITVAL2, 0x00),
+	QMP_PHY_INIT_CFG(QSERDES_V6_COM_DEC_START_MODE0, 0x82),
+	QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_RCTRL_MODE0, 0x18),
+	QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_CCTRL_MODE0, 0x18),
+	QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP1_MODE0, 0xff),
+	QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP2_MODE0, 0x0c),
+};
+
+static const struct qmp_phy_init_tbl sm8475_ufsphy_g4_serdes[] = {
+	QMP_PHY_INIT_CFG(QSERDES_V6_COM_VCO_TUNE_MAP, 0x04),
+	QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_IVCO, 0x0f),
+	QMP_PHY_INIT_CFG(QSERDES_V6_COM_CP_CTRL_MODE0, 0x14),
+	QMP_PHY_INIT_CFG(QSERDES_V6_COM_DEC_START_MODE1, 0x98),
+	QMP_PHY_INIT_CFG(QSERDES_V6_COM_CP_CTRL_MODE1, 0x14),
+	QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_RCTRL_MODE1, 0x18),
+	QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_CCTRL_MODE1, 0x18),
+	QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP1_MODE1, 0x32),
+	QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP2_MODE1, 0x0f),
+};
+
+static const struct qmp_phy_init_tbl sm8475_ufsphy_g4_pcs[] = {
+	QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_PLL_CNTL, 0x0b),
+	QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_TX_HSGEAR_CAPABILITY, 0x04),
+	QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_RX_HSGEAR_CAPABILITY, 0x04),
+};
+
 static const struct qmp_phy_init_tbl sm8550_ufsphy_serdes[] = {
 	QMP_PHY_INIT_CFG(QSERDES_V6_COM_SYSCLK_EN_SEL, 0xd9),
 	QMP_PHY_INIT_CFG(QSERDES_V6_COM_CMN_CONFIG_1, 0x16),
@@ -1385,6 +1417,44 @@ static const struct qmp_phy_cfg sm8450_ufsphy_cfg = {
 	.regs			= ufsphy_v5_regs_layout,
 };
 
+static const struct qmp_phy_cfg sm8475_ufsphy_cfg = {
+	.lanes			= 2,
+
+	.offsets		= &qmp_ufs_offsets_v6,
+	.max_supported_gear	= UFS_HS_G4,
+
+	.tbls = {
+		.serdes		= sm8475_ufsphy_serdes,
+		.serdes_num	= ARRAY_SIZE(sm8475_ufsphy_serdes),
+		.tx		= sm8550_ufsphy_tx,
+		.tx_num		= ARRAY_SIZE(sm8550_ufsphy_tx),
+		.rx		= sm8550_ufsphy_rx,
+		.rx_num		= ARRAY_SIZE(sm8550_ufsphy_rx),
+		.pcs		= sm8550_ufsphy_pcs,
+		.pcs_num	= ARRAY_SIZE(sm8550_ufsphy_pcs),
+	},
+	.tbls_hs_b = {
+		.serdes		= sm8550_ufsphy_hs_b_serdes,
+		.serdes_num	= ARRAY_SIZE(sm8550_ufsphy_hs_b_serdes),
+	},
+	.tbls_hs_overlay[0] = {
+		.serdes		= sm8475_ufsphy_g4_serdes,
+		.serdes_num	= ARRAY_SIZE(sm8475_ufsphy_g4_serdes),
+		.tx		= sm8550_ufsphy_g4_tx,
+		.tx_num		= ARRAY_SIZE(sm8550_ufsphy_g4_tx),
+		.rx		= sm8550_ufsphy_g4_rx,
+		.rx_num		= ARRAY_SIZE(sm8550_ufsphy_g4_rx),
+		.pcs		= sm8475_ufsphy_g4_pcs,
+		.pcs_num	= ARRAY_SIZE(sm8475_ufsphy_g4_pcs),
+		.max_gear	= UFS_HS_G4,
+	},
+	.clk_list		= sm8450_ufs_phy_clk_l,
+	.num_clks		= ARRAY_SIZE(sm8450_ufs_phy_clk_l),
+	.vreg_list		= qmp_phy_vreg_l,
+	.num_vregs		= ARRAY_SIZE(qmp_phy_vreg_l),
+	.regs			= ufsphy_v5_regs_layout,
+};
+
 static const struct qmp_phy_cfg sm8550_ufsphy_cfg = {
 	.lanes			= 2,
 
@@ -1990,6 +2060,9 @@ static const struct of_device_id qmp_ufs_of_match_table[] = {
 	}, {
 		.compatible = "qcom,sm8450-qmp-ufs-phy",
 		.data = &sm8450_ufsphy_cfg,
+	}, {
+		.compatible = "qcom,sm8475-qmp-ufs-phy",
+		.data = &sm8475_ufsphy_cfg,
 	}, {
 		.compatible = "qcom,sm8550-qmp-ufs-phy",
 		.data = &sm8550_ufsphy_cfg,

From 54ead298dbf4c5f5cd08407ebe044d745290b735 Mon Sep 17 00:00:00 2001
From: Luca Weiss <luca.weiss@fairphone.com>
Date: Fri, 28 Apr 2023 08:47:41 +0200
Subject: [PATCH 689/707] Input - goodix_berlin_a_driver: import from
 downstream

---
 drivers/input/touchscreen/Kconfig             |    2 +
 drivers/input/touchscreen/Makefile            |    1 +
 .../goodix_berlin_a_driver/Kconfig            |   20 +
 .../goodix_berlin_a_driver/Makefile           |   12 +
 .../goodix_brl_fwupdate.c                     | 1364 ++++++++
 .../goodix_berlin_a_driver/goodix_brl_hw.c    | 1716 ++++++++++
 .../goodix_berlin_a_driver/goodix_brl_i2c.c   |  264 ++
 .../goodix_berlin_a_driver/goodix_brl_spi.c   |  298 ++
 .../goodix_berlin_a_driver/goodix_cfg_bin.c   |  357 ++
 .../goodix_berlin_a_driver/goodix_ts_core.c   | 2545 ++++++++++++++
 .../goodix_berlin_a_driver/goodix_ts_core.h   |  850 +++++
 .../goodix_ts_gesture.c                       |  454 +++
 .../goodix_ts_inspect.c                       | 2954 +++++++++++++++++
 .../goodix_berlin_a_driver/goodix_ts_tools.c  |  503 +++
 .../goodix_berlin_a_driver/goodix_ts_utils.c  |  278 ++
 15 files changed, 11618 insertions(+)
 create mode 100644 drivers/input/touchscreen/goodix_berlin_a_driver/Kconfig
 create mode 100644 drivers/input/touchscreen/goodix_berlin_a_driver/Makefile
 create mode 100644 drivers/input/touchscreen/goodix_berlin_a_driver/goodix_brl_fwupdate.c
 create mode 100644 drivers/input/touchscreen/goodix_berlin_a_driver/goodix_brl_hw.c
 create mode 100644 drivers/input/touchscreen/goodix_berlin_a_driver/goodix_brl_i2c.c
 create mode 100644 drivers/input/touchscreen/goodix_berlin_a_driver/goodix_brl_spi.c
 create mode 100644 drivers/input/touchscreen/goodix_berlin_a_driver/goodix_cfg_bin.c
 create mode 100644 drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_core.c
 create mode 100644 drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_core.h
 create mode 100644 drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_gesture.c
 create mode 100644 drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_inspect.c
 create mode 100644 drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_tools.c
 create mode 100644 drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_utils.c

diff --git a/drivers/input/touchscreen/Kconfig b/drivers/input/touchscreen/Kconfig
index c821fe3ee794e3..e260d8f02ab4a8 100644
--- a/drivers/input/touchscreen/Kconfig
+++ b/drivers/input/touchscreen/Kconfig
@@ -1430,4 +1430,6 @@ config TOUCHSCREEN_HIMAX_HX83112B
 	  To compile this driver as a module, choose M here: the
 	  module will be called himax_hx83112b.
 
+source "drivers/input/touchscreen/goodix_berlin_a_driver/Kconfig"
+
 endif
diff --git a/drivers/input/touchscreen/Makefile b/drivers/input/touchscreen/Makefile
index a81cb5aa21a5b9..36af0d8a43b012 100644
--- a/drivers/input/touchscreen/Makefile
+++ b/drivers/input/touchscreen/Makefile
@@ -121,3 +121,4 @@ obj-$(CONFIG_TOUCHSCREEN_IQS5XX)	+= iqs5xx.o
 obj-$(CONFIG_TOUCHSCREEN_IQS7211)	+= iqs7211.o
 obj-$(CONFIG_TOUCHSCREEN_ZINITIX)	+= zinitix.o
 obj-$(CONFIG_TOUCHSCREEN_HIMAX_HX83112B)	+= himax_hx83112b.o
+obj-$(CONFIG_TOUCHSCREEN_GOODIX_BRL) += goodix_berlin_a_driver/
diff --git a/drivers/input/touchscreen/goodix_berlin_a_driver/Kconfig b/drivers/input/touchscreen/goodix_berlin_a_driver/Kconfig
new file mode 100644
index 00000000000000..396268b3af4231
--- /dev/null
+++ b/drivers/input/touchscreen/goodix_berlin_a_driver/Kconfig
@@ -0,0 +1,20 @@
+#
+# Goodix touchscreen driver configuration
+#
+menuconfig TOUCHSCREEN_GOODIX_BRL
+	tristate "Goodix berlin touchscreen"
+	help
+	  Say Y here if you have a Goodix berlin series touch controller
+	  to your system.
+
+	  If build module, say M.
+	  If unsure, say N.
+
+if TOUCHSCREEN_GOODIX_BRL
+
+config TOUCHSCREEN_GOODIX_BRL_SPI
+	bool "support SPI bus connection"
+	help
+	  Say Y here if the touchscreen is connected via SPI bus.
+
+endif
diff --git a/drivers/input/touchscreen/goodix_berlin_a_driver/Makefile b/drivers/input/touchscreen/goodix_berlin_a_driver/Makefile
new file mode 100644
index 00000000000000..903a1cccb5f7c2
--- /dev/null
+++ b/drivers/input/touchscreen/goodix_berlin_a_driver/Makefile
@@ -0,0 +1,12 @@
+obj-$(CONFIG_TOUCHSCREEN_GOODIX_BRL) += goodix_core.o
+goodix_core-y := \
+				goodix_brl_i2c.o \
+				goodix_brl_spi.o \
+				goodix_ts_core.o \
+				goodix_brl_hw.o \
+				goodix_cfg_bin.o \
+				goodix_ts_utils.o \
+				goodix_brl_fwupdate.o \
+				goodix_ts_gesture.o \
+				goodix_ts_inspect.o \
+				goodix_ts_tools.o
diff --git a/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_brl_fwupdate.c b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_brl_fwupdate.c
new file mode 100644
index 00000000000000..52abae0385b7c6
--- /dev/null
+++ b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_brl_fwupdate.c
@@ -0,0 +1,1364 @@
+/*
+ * Goodix Touchscreen Driver
+ * Copyright (C) 2020 - 2021 Goodix, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be a reference
+ * to you, when you are integrating the GOODiX's CTP IC into your system,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ */
+#include "goodix_ts_core.h"
+
+#define BUS_TYPE_SPI					1
+#define BUS_TYPE_I2C					0
+
+#define GOODIX_BUS_RETRY_TIMES			3
+
+#define FW_HEADER_SIZE_BRA				256
+#define FW_HEADER_SIZE					512
+#define FW_SUBSYS_INFO_SIZE				10
+#define FW_SUBSYS_INFO_OFFSET_BRA		36
+#define FW_SUBSYS_INFO_OFFSET			42
+#define FW_SUBSYS_MAX_NUM				47
+
+#define ISP_MAX_BUFFERSIZE				4096
+
+#define FW_PID_LEN						8
+#define FW_VID_LEN						4
+#define FLASH_CMD_LEN					11
+
+#define FW_FILE_CHECKSUM_OFFSET			8
+#define CONFIG_DATA_TYPE				4
+
+#define ISP_RAM_ADDR_BRA				0x18400
+#define ISP_RAM_ADDR_BRB				0x57000
+#define ISP_RAM_ADDR_BRD				0x23800
+#define HW_REG_CPU_RUN_FROM				0x10000
+#define FLASH_CMD_REG_BRA				0x10400
+#define FLASH_CMD_REG_BRB				0x13400
+#define FLASH_CMD_REG_BRD				0x12400
+#define HW_REG_ISP_BUFFER_BRA			0x10410
+#define HW_REG_ISP_BUFFER_BRB			0x13410
+#define HW_REG_ISP_BUFFER_BRD			0x12410
+#define CONFIG_DATA_ADDR_BRA			0x3E000
+#define CONFIG_DATA_ADDR_BRB			0x40000
+#define CONFIG_DATA_ADDR_BRD			0x3E000
+
+#define HOLD_CPU_REG_W					0x0002
+#define HOLD_CPU_REG_R					0x2000
+#define MISCTL_REG_BRA					0xD807
+#define MISCTL_REG_BRB					0xD80B
+#define MISCTL_REG_BRD					0xD804
+#define ENABLE_MISCTL_BRA				0x08
+#define ENABLE_MISCTL_BRB				0x40
+#define ENABLE_MISCTL_BRD				0x20700000
+#define ESD_KEY_REG						0xCC58
+#define WATCH_DOG_REG_BRA				0xCC54
+#define WATCH_DOG_REG_BRB				0xD054
+#define WATCH_DOG_REG_BRD				0xD040
+
+#define FLASH_CMD_TYPE_READ				0xAA
+#define FLASH_CMD_TYPE_WRITE			0xBB
+#define FLASH_CMD_ACK_CHK_PASS			0xEE
+#define FLASH_CMD_ACK_CHK_ERROR			0x33
+#define FLASH_CMD_ACK_IDLE				0x11
+#define FLASH_CMD_W_STATUS_CHK_PASS		0x22
+#define FLASH_CMD_W_STATUS_CHK_FAIL		0x33
+#define FLASH_CMD_W_STATUS_ADDR_ERR		0x44
+#define FLASH_CMD_W_STATUS_WRITE_ERR	0x55
+#define FLASH_CMD_W_STATUS_WRITE_OK		0xEE
+
+#define CHIP_TYPE_BRA					0x96
+#define CHIP_TYPE_BRB					0x97
+#define CHIP_TYPE_BRD					0x98
+
+
+struct update_info_t {
+	int header_size;
+	int subsys_info_offset;
+	u32 isp_ram_reg;
+	u32 flash_cmd_reg;
+	u32 isp_buffer_reg;
+	u32 config_data_reg;
+	u32 misctl_reg;
+	u32 watch_dog_reg;
+	u32 enable_misctl_val;
+};
+
+/* berlinA update into */
+struct update_info_t update_bra = {
+	FW_HEADER_SIZE_BRA,
+	FW_SUBSYS_INFO_OFFSET_BRA,
+	ISP_RAM_ADDR_BRA,
+	FLASH_CMD_REG_BRA,
+	HW_REG_ISP_BUFFER_BRA,
+	CONFIG_DATA_ADDR_BRA,
+	MISCTL_REG_BRA,
+	WATCH_DOG_REG_BRA,
+	ENABLE_MISCTL_BRA,
+};
+
+/* berlinB update info */
+struct update_info_t update_brb = {
+	FW_HEADER_SIZE,
+	FW_SUBSYS_INFO_OFFSET,
+	ISP_RAM_ADDR_BRB,
+	FLASH_CMD_REG_BRB,
+	HW_REG_ISP_BUFFER_BRB,
+	CONFIG_DATA_ADDR_BRB,
+	MISCTL_REG_BRB,
+	WATCH_DOG_REG_BRB,
+	ENABLE_MISCTL_BRB,
+};
+
+/* berlinD update info */
+struct update_info_t update_brd = {
+	FW_HEADER_SIZE,
+	FW_SUBSYS_INFO_OFFSET,
+	ISP_RAM_ADDR_BRD,
+	FLASH_CMD_REG_BRD,
+	HW_REG_ISP_BUFFER_BRD,
+	CONFIG_DATA_ADDR_BRD,
+	MISCTL_REG_BRD,
+	WATCH_DOG_REG_BRD,
+	ENABLE_MISCTL_BRD,
+};
+
+/**
+ * fw_subsys_info - subsytem firmware information
+ * @type: sybsystem type
+ * @size: firmware size
+ * @flash_addr: flash address
+ * @data: firmware data
+ */
+struct fw_subsys_info {
+	u8 type;
+	u32 size;
+	u32 flash_addr;
+	const u8 *data;
+};
+
+/**
+ *  firmware_summary
+ * @size: fw total length
+ * @checksum: checksum of fw
+ * @hw_pid: mask pid string
+ * @hw_pid: mask vid code
+ * @fw_pid: fw pid string
+ * @fw_vid: fw vid code
+ * @subsys_num: number of fw subsystem
+ * @chip_type: chip type
+ * @protocol_ver: firmware packing
+ *   protocol version
+ * @bus_type: 0 represent I2C, 1 for SPI
+ * @subsys: sybsystem info
+ */
+#pragma pack(1)
+struct  firmware_summary {
+	u32 size;
+	u32 checksum;
+	u8 hw_pid[6];
+	u8 hw_vid[3];
+	u8 fw_pid[FW_PID_LEN];
+	u8 fw_vid[FW_VID_LEN];
+	u8 subsys_num;
+	u8 chip_type;
+	u8 protocol_ver;
+	u8 bus_type;
+	u8 flash_protect;
+	// u8 reserved[8];
+	struct fw_subsys_info subsys[FW_SUBSYS_MAX_NUM];
+};
+#pragma pack()
+
+/**
+ * firmware_data - firmware data structure
+ * @fw_summary: firmware information
+ * @firmware: firmware data structure
+ */
+struct firmware_data {
+	struct firmware_summary fw_summary;
+	const struct firmware *firmware;
+	struct firmware *fw_sysfs;
+};
+
+struct config_data {
+	u8 *data;
+	int size;
+};
+
+#pragma pack(1)
+struct goodix_flash_cmd {
+	union {
+		struct {
+			u8 status;
+			u8 ack;
+			u8 len;
+			u8 cmd;
+			u8 fw_type;
+			u16 fw_len;
+			u32 fw_addr;
+			//u16 checksum;
+		};
+		u8 buf[16];
+	};
+};
+#pragma pack()
+
+enum update_status {
+	UPSTA_NOTWORK = 0,
+	UPSTA_PREPARING,
+	UPSTA_UPDATING,
+	UPSTA_SUCCESS,
+	UPSTA_FAILED
+};
+
+enum compare_status {
+	COMPARE_EQUAL = 0,
+	COMPARE_NOCODE,
+	COMPARE_PIDMISMATCH,
+	COMPARE_FW_NOTEQUAL,
+	COMPARE_CFG_NOTEQUAL,
+};
+
+/**
+ * fw_update_ctrl - structure used to control the
+ *  firmware update process
+ * @initialized: struct init state
+ * @mode: indicate weather reflash config or not, fw data source,
+ *        and run on block mode or not.
+ * @status: update status
+ * @progress: indicate the progress of update
+ * @fw_data: firmware data
+ * @fw_name: firmware name
+ * @attr_fwimage: sysfs bin attrs, for storing fw image
+ * @fw_data_src: firmware data source form sysfs, request or head file
+ * @kobj: pointer to the sysfs kobject
+ */
+struct fw_update_ctrl {
+	struct mutex mutex;
+	int initialized;
+	char fw_name[GOODIX_MAX_STR_LABLE_LEN];
+	int mode;
+	enum update_status status;
+	int spend_time;
+
+	struct firmware_data fw_data;
+	struct goodix_ic_config *ic_config;
+	struct goodix_ts_core *core_data;
+	struct update_info_t *update_info;
+
+	struct bin_attribute attr_fwimage;
+	struct kobject *kobj;
+};
+static struct fw_update_ctrl goodix_fw_update_ctrl;
+
+static int goodix_fw_update_reset(int delay)
+{
+	struct goodix_ts_hw_ops *hw_ops;
+
+	hw_ops = goodix_fw_update_ctrl.core_data->hw_ops;
+	return hw_ops->reset(goodix_fw_update_ctrl.core_data, delay);
+}
+
+static int get_fw_version_info(struct goodix_fw_version *fw_version)
+{
+	struct goodix_ts_hw_ops *hw_ops =
+		goodix_fw_update_ctrl.core_data->hw_ops;
+
+	return hw_ops->read_version(goodix_fw_update_ctrl.core_data,
+				fw_version);
+}
+
+static int goodix_reg_write(unsigned int addr,
+			unsigned char *data, unsigned int len)
+{
+	struct goodix_ts_hw_ops *hw_ops =
+			goodix_fw_update_ctrl.core_data->hw_ops;
+
+	return hw_ops->write(goodix_fw_update_ctrl.core_data,
+			addr, data, len);
+}
+
+static int goodix_reg_read(unsigned int addr,
+			unsigned char *data, unsigned int len)
+{
+	struct goodix_ts_hw_ops *hw_ops =
+			goodix_fw_update_ctrl.core_data->hw_ops;
+
+	return hw_ops->read(goodix_fw_update_ctrl.core_data,
+			addr, data, len);
+}
+
+/**
+ * goodix_parse_firmware - parse firmware header information
+ *	and subsystem information from firmware data buffer
+ *
+ * @fw_data: firmware struct, contains firmware header info
+ *	and firmware data.
+ * return: 0 - OK, < 0 - error
+ */
+/* sizeof(length) + sizeof(checksum) */
+
+static int goodix_parse_firmware(struct firmware_data *fw_data)
+{
+	const struct firmware *firmware;
+	struct  firmware_summary *fw_summary;
+	unsigned int i, fw_offset, info_offset;
+	u32 checksum;
+	int ic_type =
+		goodix_fw_update_ctrl.core_data->bus->ic_type;
+	int subsys_info_offset =
+		goodix_fw_update_ctrl.update_info->subsys_info_offset;
+	int header_size =
+		goodix_fw_update_ctrl.update_info->header_size;
+	int r = 0;
+
+	fw_summary = &fw_data->fw_summary;
+
+	/* copy firmware head info */
+	if (goodix_fw_update_ctrl.mode & UPDATE_MODE_SRC_SYSFS)
+		firmware = fw_data->fw_sysfs;
+	else
+		firmware = fw_data->firmware;
+
+	if (firmware->size < subsys_info_offset) {
+		ts_err("Invalid firmware size:%zu", firmware->size);
+		r = -EINVAL;
+		goto err_size;
+	}
+	memcpy(fw_summary, firmware->data, sizeof(*fw_summary));
+
+	/* check firmware size */
+	fw_summary->size = le32_to_cpu(fw_summary->size);
+	if (firmware->size != fw_summary->size + FW_FILE_CHECKSUM_OFFSET) {
+		ts_err("Bad firmware, size not match, %zu != %d",
+				firmware->size,
+				fw_summary->size + FW_FILE_CHECKSUM_OFFSET);
+		r = -EINVAL;
+		goto err_size;
+	}
+
+	for (i = FW_FILE_CHECKSUM_OFFSET, checksum = 0;
+	     i < firmware->size; i += 2)
+		checksum += firmware->data[i] + (firmware->data[i+1] << 8);
+
+	/* byte order change, and check */
+	fw_summary->checksum = le32_to_cpu(fw_summary->checksum);
+	if (checksum != fw_summary->checksum) {
+		ts_err("Bad firmware, cheksum error");
+		r = -EINVAL;
+		goto err_size;
+	}
+
+	if (fw_summary->subsys_num > FW_SUBSYS_MAX_NUM) {
+		ts_err("Bad firmware, invalid subsys num: %d",
+		       fw_summary->subsys_num);
+		r = -EINVAL;
+		goto err_size;
+	}
+
+	/* parse subsystem info */
+	fw_offset = header_size;
+	for (i = 0; i < fw_summary->subsys_num; i++) {
+		info_offset = subsys_info_offset +
+					i * FW_SUBSYS_INFO_SIZE;
+
+		fw_summary->subsys[i].type = firmware->data[info_offset];
+		fw_summary->subsys[i].size =
+		    le32_to_cpup((__le32 *)&firmware->data[info_offset + 1]);
+
+		fw_summary->subsys[i].flash_addr =
+		    le32_to_cpup((__le32 *)&firmware->data[info_offset + 5]);
+		if (fw_offset > firmware->size) {
+			ts_err("Sybsys offset exceed Firmware size");
+			goto err_size;
+		}
+
+		fw_summary->subsys[i].data = firmware->data + fw_offset;
+		fw_offset += fw_summary->subsys[i].size;
+	}
+
+	ts_info("Firmware package protocol: V%u", fw_summary->protocol_ver);
+	ts_info("Firmware PID:GT%s", fw_summary->fw_pid);
+	ts_info("Firmware VID:%*ph", 4, fw_summary->fw_vid);
+	ts_info("Firmware chip type:0x%02X", fw_summary->chip_type);
+	ts_info("Firmware bus type:%s",
+		(fw_summary->bus_type & BUS_TYPE_SPI) ? "SPI" : "I2C");
+	ts_info("Firmware size:%u", fw_summary->size);
+	ts_info("Firmware subsystem num:%u", fw_summary->subsys_num);
+
+	for (i = 0; i < fw_summary->subsys_num; i++) {
+		ts_debug("------------------------------------------");
+		ts_debug("Index:%d", i);
+		ts_debug("Subsystem type:%02X", fw_summary->subsys[i].type);
+		ts_debug("Subsystem size:%u", fw_summary->subsys[i].size);
+		ts_debug("Subsystem flash_addr:%08X",
+				fw_summary->subsys[i].flash_addr);
+		ts_debug("Subsystem Ptr:%p", fw_summary->subsys[i].data);
+	}
+
+	if (fw_summary->chip_type == CHIP_TYPE_BRA &&
+		ic_type != IC_TYPE_BERLIN_A) {
+		ts_err("ic type mismatch!");
+		r = -EINVAL;
+	} else if (fw_summary->chip_type == CHIP_TYPE_BRB &&
+		ic_type != IC_TYPE_BERLIN_B) {
+		ts_err("ic type mismatch!");
+		r = -EINVAL;
+	} else if (fw_summary->chip_type == CHIP_TYPE_BRD &&
+		ic_type != IC_TYPE_BERLIN_D) {
+		ts_err("ic type mismatch!");
+		r = -EINVAL;
+	}
+
+err_size:
+	return r;
+}
+
+/**
+ * goodix_fw_version_compare - compare the active version with
+ * firmware file version.
+ * @fwu_ctrl: firmware information to be compared
+ * return: 0 equal, < 0 unequal
+ */
+#define GOODIX_NOCODE "NOCODE"
+static int goodix_fw_version_compare(struct fw_update_ctrl *fwu_ctrl)
+{
+	int ret = 0;
+	struct goodix_ts_core *cd = fwu_ctrl->core_data;
+	struct goodix_fw_version *ic_ver = &cd->fw_version;
+	struct goodix_ic_info *ic_info = &cd->ic_info;
+	struct firmware_summary *fw_summary = &fwu_ctrl->fw_data.fw_summary;
+	u32 file_cfg_id;
+
+	/* compare fw_version */
+	if (!memcmp(ic_ver->rom_pid, GOODIX_NOCODE, 6) ||
+		!memcmp(ic_ver->patch_pid, GOODIX_NOCODE, 6)) {
+		ts_info("there is no code in the chip");
+		return COMPARE_NOCODE;
+	}
+
+	if (memcmp(ic_ver->patch_pid, fw_summary->fw_pid, FW_PID_LEN)) {
+		ts_err("Product ID mismatch:%s != %s",
+			ic_ver->patch_pid, fw_summary->fw_pid);
+		return COMPARE_PIDMISMATCH;
+	}
+
+	ret = memcmp(ic_ver->patch_vid, fw_summary->fw_vid, FW_VID_LEN);
+	if (ret) {
+		ts_info("active firmware version:%*ph", FW_VID_LEN,
+				ic_ver->patch_vid);
+		ts_info("firmware file version: %*ph", FW_VID_LEN,
+				fw_summary->fw_vid);
+		return COMPARE_FW_NOTEQUAL;
+	}
+	ts_info("fw_version equal");
+
+	/* compare config id */
+	if (fwu_ctrl->ic_config && fwu_ctrl->ic_config->len > 0) {
+		file_cfg_id =
+			goodix_get_file_config_id(fwu_ctrl->ic_config->data);
+		if (ic_info->version.config_id != file_cfg_id) {
+			ts_info("ic_cfg_id:0x%x != file_cfg_id:0x%x",
+				ic_info->version.config_id, file_cfg_id);
+			return COMPARE_CFG_NOTEQUAL;
+		}
+		ts_info("config_id equal");
+	}
+
+	return COMPARE_EQUAL;
+}
+
+/**
+ * goodix_reg_write_confirm - write register and confirm the value
+ *  in the register.
+ * @dev: pointer to touch device
+ * @addr: register address
+ * @data: pointer to data buffer
+ * @len: data length
+ * return: 0 write success and confirm ok
+ *		   < 0 failed
+ */
+static int goodix_reg_write_confirm(unsigned int addr,
+		unsigned char *data, unsigned int len)
+{
+	u8 *cfm = NULL;
+	u8 cfm_buf[32];
+	int r, i;
+
+	if (len > sizeof(cfm_buf)) {
+		cfm = kzalloc(len, GFP_KERNEL);
+		if (!cfm)
+			return -ENOMEM;
+	} else {
+		cfm = &cfm_buf[0];
+	}
+
+	for (i = 0; i < GOODIX_BUS_RETRY_TIMES; i++) {
+		r = goodix_reg_write(addr, data, len);
+		if (r < 0)
+			goto exit;
+
+		r = goodix_reg_read(addr, cfm, len);
+		if (r < 0)
+			goto exit;
+
+		if (memcmp(data, cfm, len)) {
+			r = -EINVAL;
+			continue;
+		} else {
+			r = 0;
+			break;
+		}
+	}
+
+exit:
+	if (cfm != &cfm_buf[0])
+		kfree(cfm);
+	return r;
+}
+
+
+/**
+ * goodix_load_isp - load ISP program to device ram
+ * @dev: pointer to touch device
+ * @fw_data: firmware data
+ * return 0 ok, <0 error
+ */
+static int goodix_load_isp(struct firmware_data *fw_data)
+{
+	struct goodix_fw_version isp_fw_version;
+	struct fw_subsys_info *fw_isp;
+	u32 isp_ram_reg = goodix_fw_update_ctrl.update_info->isp_ram_reg;
+	u8 reg_val[8] = {0x00};
+	int r;
+
+	memset(&isp_fw_version, 0, sizeof(isp_fw_version));
+	fw_isp = &fw_data->fw_summary.subsys[0];
+
+	ts_info("Loading ISP start");
+	r = goodix_reg_write_confirm(isp_ram_reg,
+					(u8 *)fw_isp->data, fw_isp->size);
+	if (r < 0) {
+		ts_err("Loading ISP error");
+		return r;
+	}
+
+	ts_info("Success send ISP data");
+
+	/* SET BOOT OPTION TO 0X55 */
+	memset(reg_val, 0x55, 8);
+	r = goodix_reg_write_confirm(HW_REG_CPU_RUN_FROM, reg_val, 8);
+	if (r < 0) {
+		ts_err("Failed set REG_CPU_RUN_FROM flag");
+		return r;
+	}
+	ts_info("Success write [8]0x55 to 0x%x", HW_REG_CPU_RUN_FROM);
+
+	if (goodix_fw_update_reset(100))
+		ts_err("reset abnormal");
+	/*check isp state */
+	if (get_fw_version_info(&isp_fw_version)) {
+		ts_err("failed read isp version");
+		return -2;
+	}
+	if (memcmp(&isp_fw_version.patch_pid[3], "ISP", 3)) {
+		ts_err("patch id error %c%c%c != %s",
+		isp_fw_version.patch_pid[3], isp_fw_version.patch_pid[4],
+		isp_fw_version.patch_pid[5], "ISP");
+		return -3;
+	}
+	ts_info("ISP running successfully");
+	return 0;
+}
+
+/**
+ * goodix_update_prepare - update prepare, loading ISP program
+ *  and make sure the ISP is running.
+ * @fwu_ctrl: pointer to fimrware control structure
+ * return: 0 ok, <0 error
+ */
+static int goodix_update_prepare(struct fw_update_ctrl *fwu_ctrl)
+{
+	u32 misctl_reg = fwu_ctrl->update_info->misctl_reg;
+	u32 watch_dog_reg = fwu_ctrl->update_info->watch_dog_reg;
+	u32 enable_misctl_val = fwu_ctrl->update_info->enable_misctl_val;
+	u8 reg_val[4] = {0};
+	u8 temp_buf[64] = {0};
+	int retry = 20;
+	int r;
+
+	/*reset IC*/
+	ts_info("firmware update, reset");
+	if (goodix_fw_update_reset(5))
+		ts_err("reset abnormal");
+
+	retry = 100;
+	/* Hold cpu*/
+	do {
+		reg_val[0] = 0x01;
+		reg_val[1] = 0x00;
+		r = goodix_reg_write(HOLD_CPU_REG_W, reg_val, 2);
+		r |= goodix_reg_read(HOLD_CPU_REG_R, &temp_buf[0], 4);
+		r |= goodix_reg_read(HOLD_CPU_REG_R, &temp_buf[4], 4);
+		r |= goodix_reg_read(HOLD_CPU_REG_R, &temp_buf[8], 4);
+		if (!r && !memcmp(&temp_buf[0], &temp_buf[4], 4) &&
+			!memcmp(&temp_buf[4], &temp_buf[8], 4) &&
+			!memcmp(&temp_buf[0], &temp_buf[8], 4)) {
+			break;
+		}
+		usleep_range(1000, 1100);
+		ts_info("retry hold cpu %d", retry);
+		ts_debug("data:%*ph", 12, temp_buf);
+	} while (--retry);
+	if (!retry) {
+		ts_err("Failed to hold CPU, return =%d", r);
+		return -1;
+	}
+	ts_info("Success hold CPU");
+
+	/* enable misctl clock */
+	if (fwu_ctrl->core_data->bus->ic_type == IC_TYPE_BERLIN_D ||
+			fwu_ctrl->core_data->bus->ic_type == IC_TYPE_NOTTINGHAM)
+		goodix_reg_write(misctl_reg, (u8 *)&enable_misctl_val, 4);
+	else
+		goodix_reg_write(misctl_reg, (u8 *)&enable_misctl_val, 1);
+	ts_info("enbale misctl clock");
+
+	if (fwu_ctrl->core_data->bus->ic_type == IC_TYPE_BERLIN_A) {
+		/* open ESD_KEY */
+		retry = 20;
+		do {
+			reg_val[0] = 0x95;
+			r = goodix_reg_write(ESD_KEY_REG, reg_val, 1);
+			r |= goodix_reg_read(ESD_KEY_REG, temp_buf, 1);
+			if (!r && temp_buf[0] == 0x01)
+				break;
+			usleep_range(1000, 1100);
+			ts_info("retry %d enable esd key, 0x%x",
+				retry, temp_buf[0]);
+		} while (--retry);
+		if (!retry) {
+			ts_err("Failed to enable esd key, return =%d", r);
+			return -2;
+		}
+		ts_info("success enable esd key");
+	}
+
+	/* disable watch dog */
+	reg_val[0] = 0x00;
+	r = goodix_reg_write(watch_dog_reg, reg_val, 1);
+	ts_info("disable watch dog");
+
+	/* load ISP code and run form isp */
+	r = goodix_load_isp(&fwu_ctrl->fw_data);
+	if (r < 0)
+		ts_err("Failed load and run isp");
+
+	return r;
+}
+
+/*	goodix_send_flash_cmd: send command to read or write flash data
+ *	@flash_cmd: command need to send.
+ */
+static int goodix_send_flash_cmd(struct goodix_flash_cmd *flash_cmd)
+{
+	int i, ret, retry;
+	struct goodix_flash_cmd tmp_cmd;
+	u32 flash_cmd_reg = goodix_fw_update_ctrl.update_info->flash_cmd_reg;
+
+	ts_info("try send flash cmd:%*ph", (int)sizeof(flash_cmd->buf),
+		flash_cmd->buf);
+	memset(tmp_cmd.buf, 0, sizeof(tmp_cmd));
+	ret = goodix_reg_write(flash_cmd_reg,
+		flash_cmd->buf, sizeof(flash_cmd->buf));
+	if (ret) {
+		ts_err("failed send flash cmd %d", ret);
+		return ret;
+	}
+
+	retry = 5;
+	for (i = 0; i < retry; i++) {
+		ret = goodix_reg_read(flash_cmd_reg,
+			tmp_cmd.buf, sizeof(tmp_cmd.buf));
+		if (!ret && tmp_cmd.ack == FLASH_CMD_ACK_CHK_PASS)
+			break;
+		usleep_range(5000, 5100);
+		ts_info("flash cmd ack error retry %d, ack 0x%x, ret %d",
+			i, tmp_cmd.ack, ret);
+	}
+	if (tmp_cmd.ack != FLASH_CMD_ACK_CHK_PASS) {
+		ts_err("flash cmd ack error, ack 0x%x, ret %d",
+			tmp_cmd.ack, ret);
+		ts_err("data:%*ph", (int)sizeof(tmp_cmd.buf), tmp_cmd.buf);
+		return -EINVAL;
+	}
+	ts_info("flash cmd ack check pass");
+
+	msleep(50);
+	retry = 20;
+	for (i = 0; i < retry; i++) {
+		ret = goodix_reg_read(flash_cmd_reg,
+			tmp_cmd.buf, sizeof(tmp_cmd.buf));
+		if (!ret && tmp_cmd.ack == FLASH_CMD_ACK_CHK_PASS &&
+			tmp_cmd.status == FLASH_CMD_W_STATUS_WRITE_OK) {
+			ts_info("flash status check pass");
+			return 0;
+		}
+
+		ts_info("flash cmd status not ready, retry %d, ack 0x%x, status 0x%x, ret %d",
+				i, tmp_cmd.ack, tmp_cmd.status, ret);
+		usleep_range(10000, 11000);
+	}
+
+	ts_err("flash cmd status error %d, ack 0x%x, status 0x%x, ret %d",
+		i, tmp_cmd.ack, tmp_cmd.status, ret);
+	if (ret) {
+		ts_info("reason: bus or paltform error");
+		return -EINVAL;
+	}
+
+	switch (tmp_cmd.status) {
+	case FLASH_CMD_W_STATUS_CHK_PASS:
+		ts_err("data check pass, but failed get follow-up results");
+		return -EFAULT;
+	case FLASH_CMD_W_STATUS_CHK_FAIL:
+		ts_err("data check failed, please retry");
+		return -EAGAIN;
+	case FLASH_CMD_W_STATUS_ADDR_ERR:
+		ts_err("flash target addr error, please check");
+		return -EFAULT;
+	case FLASH_CMD_W_STATUS_WRITE_ERR:
+		ts_err("flash data write err, please retry");
+		return -EAGAIN;
+	default:
+		ts_err("unknown status");
+		return -EFAULT;
+	}
+}
+
+static int goodix_flash_package(u8 subsys_type, u8 *pkg,
+	u32 flash_addr, u16 pkg_len)
+{
+	int ret, retry;
+	struct goodix_flash_cmd flash_cmd;
+	u32 isp_buffer_reg = goodix_fw_update_ctrl.update_info->isp_buffer_reg;
+
+	retry = 2;
+	do {
+		ret = goodix_reg_write(isp_buffer_reg, pkg, pkg_len);
+		if (ret < 0) {
+			ts_err("Failed to write firmware packet");
+			return ret;
+		}
+
+		flash_cmd.status = 0;
+		flash_cmd.ack = 0;
+		flash_cmd.len = FLASH_CMD_LEN;
+		flash_cmd.cmd = FLASH_CMD_TYPE_WRITE;
+		flash_cmd.fw_type = subsys_type;
+		flash_cmd.fw_len = cpu_to_le16(pkg_len);
+		flash_cmd.fw_addr = cpu_to_le32(flash_addr);
+
+		goodix_append_checksum(&(flash_cmd.buf[2]),
+				9, CHECKSUM_MODE_U8_LE);
+
+		ret = goodix_send_flash_cmd(&flash_cmd);
+		if (!ret) {
+			ts_info("success write package to 0x%05X, len %d",
+				flash_addr, pkg_len - 4);
+			return 0;
+		}
+	} while (ret == -EAGAIN && --retry);
+
+	return ret;
+}
+
+/**
+ * goodix_flash_subsystem - flash subsystem firmware,
+ *  Main flow of flashing firmware.
+ *	Each firmware subsystem is divided into several
+ *	packets, the max size of packet is limited to
+ *	@{ISP_MAX_BUFFERSIZE}
+ * @dev: pointer to touch device
+ * @subsys: subsystem information
+ * return: 0 ok, < 0 error
+ */
+static int goodix_flash_subsystem(struct fw_subsys_info *subsys)
+{
+	u32 data_size, offset;
+	u32 total_size;
+	//TODO: confirm flash addr ,<< 8??
+	u32 subsys_base_addr = subsys->flash_addr;
+	u8 *fw_packet = NULL;
+	int r = 0;
+
+	/*
+	 * if bus(i2c/spi) error occued, then exit, we will do
+	 * hardware reset and re-prepare ISP and then retry
+	 * flashing
+	 */
+	total_size = subsys->size;
+	fw_packet = kzalloc(ISP_MAX_BUFFERSIZE + 4, GFP_KERNEL);
+	if (!fw_packet) {
+		ts_err("Failed alloc memory");
+		return -EINVAL;
+	}
+
+	offset = 0;
+	while (total_size > 0) {
+		data_size = total_size > ISP_MAX_BUFFERSIZE ?
+				ISP_MAX_BUFFERSIZE : total_size;
+		ts_info("Flash firmware to 0x%05X,size:%u bytes",
+			subsys_base_addr + offset, data_size);
+
+		memcpy(fw_packet, &subsys->data[offset], data_size);
+		/* set checksum for package data */
+		goodix_append_checksum(fw_packet,
+				data_size, CHECKSUM_MODE_U16_LE);
+
+		r = goodix_flash_package(subsys->type, fw_packet,
+				subsys_base_addr + offset, data_size + 4);
+		if (r) {
+			ts_err("failed flash to 0x%05X,size:%u bytes",
+			subsys_base_addr + offset, data_size);
+			break;
+		}
+		offset += data_size;
+		total_size -= data_size;
+	} /* end while */
+
+	kfree(fw_packet);
+	return r;
+}
+
+/**
+ * goodix_flash_firmware - flash firmware
+ * @dev: pointer to touch device
+ * @fw_data: firmware data
+ * return: 0 ok, < 0 error
+ */
+static int goodix_flash_firmware(struct fw_update_ctrl *fw_ctrl)
+{
+	struct firmware_data *fw_data = &fw_ctrl->fw_data;
+	struct  firmware_summary  *fw_summary;
+	struct fw_subsys_info *fw_x;
+	struct fw_subsys_info subsys_cfg = {0};
+	u32 config_data_reg = fw_ctrl->update_info->config_data_reg;
+	int retry = GOODIX_BUS_RETRY_TIMES;
+	int i, r = 0, fw_num;
+
+	/*	start from subsystem 1,
+	 *	subsystem 0 is the ISP program
+	 */
+
+	fw_summary = &fw_data->fw_summary;
+	fw_num = fw_summary->subsys_num;
+
+	/* flash config data first if we have */
+	if (fw_ctrl->ic_config && fw_ctrl->ic_config->len) {
+		subsys_cfg.data = fw_ctrl->ic_config->data;
+		subsys_cfg.size = GOODIX_CFG_MAX_SIZE;
+		subsys_cfg.flash_addr = config_data_reg;
+		subsys_cfg.type = CONFIG_DATA_TYPE;
+		r = goodix_flash_subsystem(&subsys_cfg);
+		if (r) {
+			ts_err("failed flash config with ISP, %d", r);
+			return r;
+		}
+		ts_info("success flash config with ISP");
+	}
+
+	for (i = 1; i < fw_num && retry;) {
+		ts_info("--- Start to flash subsystem[%d] ---", i);
+		fw_x = &fw_summary->subsys[i];
+		r = goodix_flash_subsystem(fw_x);
+		if (r == 0) {
+			ts_info("--- End flash subsystem[%d]: OK ---", i);
+			i++;
+		} else if (r == -EAGAIN) {
+			retry--;
+			ts_err("--- End flash subsystem%d: Fail, errno:%d, retry:%d ---",
+				i, r, GOODIX_BUS_RETRY_TIMES - retry);
+		} else if (r < 0) { /* bus error */
+			ts_err("--- End flash subsystem%d: Fatal error:%d exit ---",
+				i, r);
+			goto exit_flash;
+		}
+	}
+
+exit_flash:
+	return r;
+}
+
+/**
+ * goodix_update_finish - update finished, FREE resource
+ *  and reset flags---
+ * @fwu_ctrl: pointer to fw_update_ctrl structrue
+ * return: 0 ok, < 0 error
+ */
+static int goodix_update_finish(struct fw_update_ctrl *fwu_ctrl)
+{
+	struct goodix_ts_core *cd = fwu_ctrl->core_data;
+	int ret;
+
+	/* step 1: reset IC */
+	goodix_fw_update_reset(100);
+	/* step 2: read version */
+	ret = get_fw_version_info(&cd->fw_version);
+	if (ret < 0) {
+		ts_err("still failed to read version after upgraded");
+		return -EFAULT;
+	}
+	/* step 3: read ic info */
+	ret = cd->hw_ops->get_ic_info(cd, &cd->ic_info);
+	if (ret < 0) {
+		ts_err("still failed to read ic info after upgraded");
+		return -EFAULT;
+	}
+
+	ret = goodix_fw_version_compare(fwu_ctrl);
+	if (ret == COMPARE_EQUAL || ret == COMPARE_CFG_NOTEQUAL)
+		return 0;
+
+	return -EFAULT;
+}
+
+/**
+ * goodix_fw_update_proc - firmware update process, the entry of
+ *  firmware update flow
+ * @fwu_ctrl: firmware control
+ * return: = 0 update ok, < 0 error or NO_NEED_UPDATE
+ */
+int goodix_fw_update_proc(struct fw_update_ctrl *fwu_ctrl)
+{
+#define FW_UPDATE_RETRY		2
+	int retry0 = FW_UPDATE_RETRY;
+	int retry1 = FW_UPDATE_RETRY;
+	int ret = 0;
+
+	ret = goodix_parse_firmware(&fwu_ctrl->fw_data);
+	if (ret < 0)
+		return ret;
+
+	if (!(fwu_ctrl->mode & UPDATE_MODE_FORCE)) {
+		ret = goodix_fw_version_compare(fwu_ctrl);
+		if (!ret) {
+			ts_info("no need to upgrade");
+			return 0;
+		}
+		ts_info("need to upgrade");
+	}
+
+start_update:
+	fwu_ctrl->status = UPSTA_PREPARING;
+	do {
+		ret = goodix_update_prepare(fwu_ctrl);
+		if (ret) {
+			ts_err("failed prepare ISP, retry %d",
+				FW_UPDATE_RETRY - retry0);
+		}
+	} while (ret && --retry0 > 0);
+	if (ret) {
+		ts_err("Failed to prepare ISP, exit update:%d", ret);
+		goto err_fw_prepare;
+	}
+
+	/* progress: 20%~100% */
+	fwu_ctrl->status = UPSTA_UPDATING;
+	ret = goodix_flash_firmware(fwu_ctrl);
+	if (ret < 0 && --retry1 > 0) {
+		ts_err("Bus error, retry firmware update:%d",
+				FW_UPDATE_RETRY - retry1);
+		goto start_update;
+	}
+	if (ret)
+		ts_err("flash fw data enter error, ret:%d", ret);
+	else
+		ts_info("flash fw data success, need check version");
+
+err_fw_prepare:
+	ret = goodix_update_finish(fwu_ctrl);
+	if (!ret)
+		ts_info("Firmware update successfully");
+	else
+		ts_err("Firmware update failed, ret:%d", ret);
+
+	return ret;
+}
+
+/*
+ * goodix_sysfs_update_en_store: start fw update manually
+ * @buf: '1'[001] update in blocking mode with fwdata from sysfs
+ *       '2'[010] update in blocking mode with fwdata from request
+ *       '5'[101] update in unblocking mode with fwdata from sysfs
+ *       '6'[110] update in unblocking mode with fwdata from request
+ */
+static ssize_t goodix_sysfs_update_en_store(
+		struct device *dev, struct device_attribute *attr,
+		const char *buf, size_t count)
+{
+	int ret = 0;
+	int mode = 0;
+	struct fw_update_ctrl *fw_ctrl = &goodix_fw_update_ctrl;
+
+	if (!buf || count <= 0) {
+		ts_err("invalid params");
+		return -EINVAL;
+	}
+	if (!fw_ctrl || !fw_ctrl->initialized) {
+		ts_err("fw module uninit");
+		return -EINVAL;
+	}
+
+	ts_info("set update mode:0x%x", buf[0]);
+	if (buf[0] == '1') {
+		mode = UPDATE_MODE_FORCE | UPDATE_MODE_BLOCK |
+			UPDATE_MODE_SRC_SYSFS;
+	} else if (buf[0] == '2') {
+		mode = UPDATE_MODE_FORCE | UPDATE_MODE_BLOCK |
+			UPDATE_MODE_SRC_REQUEST;
+	} else if (buf[0] == '5') {
+		mode = UPDATE_MODE_FORCE | UPDATE_MODE_SRC_SYSFS;
+	} else if (buf[0] == '6') {
+		mode = UPDATE_MODE_FORCE | UPDATE_MODE_SRC_REQUEST;
+	} else {
+		ts_err("invalid update mode:0x%x", buf[0]);
+		return -EINVAL;
+	}
+
+	ret = goodix_do_fw_update(NULL, mode);
+	if (!ret) {
+		ts_info("success do update work");
+		return count;
+	}
+	ts_err("failed do fw update work");
+	return -EINVAL;
+}
+
+static ssize_t goodix_sysfs_fwimage_store(struct file *file,
+		struct kobject *kobj, struct bin_attribute *attr,
+		char *buf, loff_t pos, size_t count)
+{
+	struct firmware **fw = &goodix_fw_update_ctrl.fw_data.fw_sysfs;
+
+	if (*fw == NULL) {
+		*fw = kzalloc(sizeof(**fw), GFP_KERNEL);
+		if (*fw == NULL)
+			return -ENOMEM;
+		(*fw)->data = vmalloc(GOODIX_FW_MAX_SIEZE);
+		if ((*fw)->data == NULL) {
+			kfree(*fw);
+			*fw = NULL;
+			return -ENOMEM;
+		}
+	}
+
+	if (pos + count > GOODIX_FW_MAX_SIEZE)
+		return -EFAULT;
+	memcpy((u8 *)&(*fw)->data[pos], buf, count);
+	(*fw)->size = pos + count;
+
+	return count;
+}
+
+/* return fw_update result */
+static ssize_t goodix_sysfs_result_show(
+		struct device *dev, struct device_attribute *attr,
+		char *buf)
+{
+	struct fw_update_ctrl *fw_ctrl = &goodix_fw_update_ctrl;
+	char str[GOODIX_MAX_STR_LABLE_LEN] = {0};
+	int r = -EINVAL;
+
+	if (!fw_ctrl)
+		return r;
+
+	switch (fw_ctrl->status) {
+	case UPSTA_PREPARING:
+		sprintf(str, "preparing");
+		break;
+	case UPSTA_UPDATING:
+		sprintf(str, "updating");
+		break;
+	case UPSTA_SUCCESS:
+		sprintf(str, "success");
+		break;
+	case UPSTA_FAILED:
+		sprintf(str, "failed");
+		break;
+	case UPSTA_NOTWORK:
+	default:
+		sprintf(str, "notwork");
+		break;
+	}
+
+	r = snprintf(buf, PAGE_SIZE, "result:%s  spend_time:%dms\n",
+			str, fw_ctrl->spend_time);
+
+	return r;
+}
+
+static DEVICE_ATTR(update_en, 0220, NULL, goodix_sysfs_update_en_store);
+static DEVICE_ATTR(result, 0664, goodix_sysfs_result_show, NULL);
+
+static struct attribute *goodix_fwu_attrs[] = {
+	&dev_attr_update_en.attr,
+	&dev_attr_result.attr
+};
+
+static int goodix_fw_sysfs_init(struct goodix_ts_core *core_data,
+		struct fw_update_ctrl *fw_ctrl)
+{
+	int ret = 0, i;
+
+	fw_ctrl->kobj = kobject_create_and_add("fwupdate",
+					&core_data->pdev->dev.kobj);
+	if (!fw_ctrl->kobj) {
+		ts_err("failed create sub dir for fwupdate");
+		return -EINVAL;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(goodix_fwu_attrs) && !ret; i++)
+		ret = sysfs_create_file(fw_ctrl->kobj, goodix_fwu_attrs[i]);
+
+	if (ret) {
+		ts_err("failed create fwu sysfs files");
+		while (--i >= 0)
+			sysfs_remove_file(fw_ctrl->kobj, goodix_fwu_attrs[i]);
+
+		kobject_put(fw_ctrl->kobj);
+		return -EINVAL;
+	}
+
+	fw_ctrl->attr_fwimage.attr.name = "fwimage";
+	fw_ctrl->attr_fwimage.attr.mode = 0664;
+	fw_ctrl->attr_fwimage.size = 0;
+	fw_ctrl->attr_fwimage.write = goodix_sysfs_fwimage_store;
+	ret = sysfs_create_bin_file(fw_ctrl->kobj, &fw_ctrl->attr_fwimage);
+	if (ret) {
+		ts_err("failed create fwimage bin node, %d", ret);
+		for (i = 0; i < ARRAY_SIZE(goodix_fwu_attrs); i++)
+			sysfs_remove_file(fw_ctrl->kobj, goodix_fwu_attrs[i]);
+		kobject_put(fw_ctrl->kobj);
+	}
+
+	return ret;
+}
+
+static void goodix_fw_sysfs_remove(void)
+{
+	struct fw_update_ctrl *fw_ctrl = &goodix_fw_update_ctrl;
+	int i;
+
+	sysfs_remove_bin_file(fw_ctrl->kobj, &fw_ctrl->attr_fwimage);
+
+	for (i = 0; i < ARRAY_SIZE(goodix_fwu_attrs); i++)
+		sysfs_remove_file(fw_ctrl->kobj,
+				goodix_fwu_attrs[i]);
+
+	kobject_put(fw_ctrl->kobj);
+}
+
+
+/**
+ * goodix_request_firmware - request firmware data from user space
+ *
+ * @fw_data: firmware struct, contains firmware header info
+ *	and firmware data pointer.
+ * return: 0 - OK, < 0 - error
+ */
+static int goodix_request_firmware(struct firmware_data *fw_data,
+				const char *name)
+{
+	struct fw_update_ctrl *fw_ctrl =
+		container_of(fw_data, struct fw_update_ctrl, fw_data);
+	struct device *dev = &(fw_ctrl->core_data->pdev->dev);
+	int r;
+	int retry = GOODIX_RETRY_3;
+
+	ts_info("Request firmware image [%s]", name);
+
+	while (retry--) {
+		r = request_firmware(&fw_data->firmware, name, dev);
+		if (!r)
+			break;
+		ts_info("get fw bin retry:[%d]", GOODIX_RETRY_3 - retry);
+		msleep(200);
+	}
+	if (retry < 0) {
+		ts_err("Firmware image [%s] not available,errno:%d", name, r);
+		return r;
+	}
+
+	ts_info("Firmware image [%s] is ready", name);
+	return 0;
+}
+
+/**
+ * relase firmware resources
+ *
+ */
+static inline void goodix_release_firmware(struct firmware_data *fw_data)
+{
+	if (fw_data->firmware) {
+		release_firmware(fw_data->firmware);
+		fw_data->firmware = NULL;
+	}
+}
+
+static int goodix_fw_update_thread(void *data)
+{
+	struct fw_update_ctrl *fwu_ctrl = data;
+	ktime_t start, end;
+	int r = -EINVAL;
+
+	start = ktime_get();
+	fwu_ctrl->spend_time = 0;
+	fwu_ctrl->status = UPSTA_NOTWORK;
+	mutex_lock(&fwu_ctrl->mutex);
+
+	ts_debug("notify update start");
+	goodix_ts_blocking_notify(NOTIFY_FWUPDATE_START, NULL);
+
+	if (fwu_ctrl->mode & UPDATE_MODE_SRC_REQUEST) {
+		ts_info("Firmware request update starts");
+		r = goodix_request_firmware(&fwu_ctrl->fw_data,
+						fwu_ctrl->fw_name);
+		if (r < 0)
+			goto out;
+	} else if (fwu_ctrl->mode & UPDATE_MODE_SRC_SYSFS) {
+		if (!fwu_ctrl->fw_data.fw_sysfs) {
+			ts_err("Invalid firmware from sysfs");
+			r = -EINVAL;
+			goto out;
+		}
+		if (fwu_ctrl->fw_data.fw_sysfs->size < 4096) {
+			ts_err("Invalid firmware size[%ld] from sysfs",
+					fwu_ctrl->fw_data.fw_sysfs->size);
+			vfree(fwu_ctrl->fw_data.fw_sysfs->data);
+			kfree(fwu_ctrl->fw_data.fw_sysfs);
+			fwu_ctrl->fw_data.fw_sysfs = NULL;
+			r = -EINVAL;
+			goto out;
+		}
+	} else {
+		ts_err("unknown update mode 0x%x", fwu_ctrl->mode);
+		r = -EINVAL;
+		goto out;
+	}
+
+	/* ready to update */
+	ts_debug("start update proc");
+	r = goodix_fw_update_proc(fwu_ctrl);
+
+	/* clean */
+	if (fwu_ctrl->mode & UPDATE_MODE_SRC_SYSFS) {
+		vfree(fwu_ctrl->fw_data.fw_sysfs->data);
+		kfree(fwu_ctrl->fw_data.fw_sysfs);
+		fwu_ctrl->fw_data.fw_sysfs = NULL;
+	} else if (fwu_ctrl->mode & UPDATE_MODE_SRC_REQUEST) {
+		goodix_release_firmware(&fwu_ctrl->fw_data);
+	}
+out:
+	fwu_ctrl->mode = UPDATE_MODE_DEFAULT;
+	mutex_unlock(&fwu_ctrl->mutex);
+
+	if (r) {
+		ts_err("fw update failed, %d", r);
+		fwu_ctrl->status = UPSTA_FAILED;
+		goodix_ts_blocking_notify(NOTIFY_FWUPDATE_FAILED, NULL);
+	} else {
+		ts_info("fw update success");
+		fwu_ctrl->status = UPSTA_SUCCESS;
+		goodix_ts_blocking_notify(NOTIFY_FWUPDATE_SUCCESS, NULL);
+	}
+
+	end = ktime_get();
+	fwu_ctrl->spend_time = ktime_to_ms(ktime_sub(end, start));
+
+	return r;
+}
+
+int goodix_do_fw_update(struct goodix_ic_config *ic_config, int mode)
+{
+	struct task_struct *fwu_thrd;
+	struct fw_update_ctrl *fwu_ctrl = &goodix_fw_update_ctrl;
+	int ret;
+
+	if (!fwu_ctrl->initialized) {
+		ts_err("fw mode uninit");
+		return -EINVAL;
+	}
+
+	fwu_ctrl->mode = mode;
+	fwu_ctrl->ic_config = ic_config;
+	ts_debug("fw update mode 0x%x", mode);
+	if (fwu_ctrl->mode & UPDATE_MODE_BLOCK) {
+		ret = goodix_fw_update_thread(fwu_ctrl);
+		ts_info("fw update return %d", ret);
+		return ret;
+	}
+	/* create and run update thread */
+	fwu_thrd = kthread_run(goodix_fw_update_thread,
+			fwu_ctrl, "goodix-fwu");
+	if (IS_ERR_OR_NULL(fwu_thrd)) {
+		ts_err("Failed to create update thread:%ld",
+				PTR_ERR(fwu_thrd));
+		return -EFAULT;
+	}
+	ts_info("success create fw update thread");
+	return 0;
+}
+
+int goodix_fw_update_init(struct goodix_ts_core *core_data)
+{
+	int ret;
+
+	if (!core_data || !core_data->hw_ops) {
+		ts_err("core_data && hw_ops cann't be null");
+		return -ENODEV;
+	}
+
+	mutex_init(&goodix_fw_update_ctrl.mutex);
+	goodix_fw_update_ctrl.core_data = core_data;
+	goodix_fw_update_ctrl.mode = 0;
+
+	strlcpy(goodix_fw_update_ctrl.fw_name, core_data->board_data.fw_name,
+		sizeof(goodix_fw_update_ctrl.fw_name));
+
+	ret = goodix_fw_sysfs_init(core_data, &goodix_fw_update_ctrl);
+	if (ret) {
+		ts_err("failed create fwupate sysfs node");
+		return ret;
+	}
+	if (core_data->bus->ic_type == IC_TYPE_BERLIN_A)
+		goodix_fw_update_ctrl.update_info = &update_bra;
+	else if (core_data->bus->ic_type == IC_TYPE_BERLIN_B)
+		goodix_fw_update_ctrl.update_info = &update_brb;
+	else
+		goodix_fw_update_ctrl.update_info = &update_brd;
+
+	goodix_fw_update_ctrl.initialized = 1;
+	return 0;
+}
+
+void goodix_fw_update_uninit(void)
+{
+	if (!goodix_fw_update_ctrl.initialized)
+		return;
+
+	mutex_lock(&goodix_fw_update_ctrl.mutex);
+	goodix_fw_sysfs_remove();
+	goodix_fw_update_ctrl.initialized = 0;
+	mutex_unlock(&goodix_fw_update_ctrl.mutex);
+	mutex_destroy(&goodix_fw_update_ctrl.mutex);
+}
diff --git a/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_brl_hw.c b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_brl_hw.c
new file mode 100644
index 00000000000000..cbc8f82cc92319
--- /dev/null
+++ b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_brl_hw.c
@@ -0,0 +1,1716 @@
+ /*
+  * Goodix Touchscreen Driver
+  * Copyright (C) 2020 - 2021 Goodix, Inc.
+  *
+  * This program is free software; you can redistribute it and/or modify
+  * it under the terms of the GNU General Public License as published by
+  * the Free Software Foundation; either version 2 of the License, or
+  * (at your option) any later version.
+  *
+  * This program is distributed in the hope that it will be a reference
+  * to you, when you are integrating the GOODiX's CTP IC into your system,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * General Public License for more details.
+  *
+  */
+#include "goodix_ts_core.h"
+
+/* berlin_A SPI mode setting */
+#define GOODIX_SPI_MODE_REG			0xC900
+#define GOODIX_SPI_NORMAL_MODE_0	0x01
+
+/* berlin_A D12 setting */
+#define GOODIX_REG_CLK_STA0			0xD807
+#define GOODIX_CLK_STA0_ENABLE		0xFF
+#define GOODIX_REG_CLK_STA1			0xD806
+#define GOODIX_CLK_STA1_ENABLE		0x77
+#define GOODIX_REG_TRIM_D12			0xD006
+#define GOODIX_TRIM_D12_LEVEL		0x3C
+#define GOODIX_REG_RESET			0xD808
+#define GOODIX_RESET_EN				0xFA
+#define HOLD_CPU_REG_W				0x0002
+#define HOLD_CPU_REG_R				0x2000
+
+#define DEV_CONFIRM_VAL				0xAA
+#define BOOTOPTION_ADDR				0x10000
+#define FW_VERSION_INFO_ADDR_BRA	0x1000C
+#define FW_VERSION_INFO_ADDR		0x10014
+
+#define GOODIX_IC_INFO_MAX_LEN		1024
+#define GOODIX_IC_INFO_ADDR_BRA		0x10068
+#define GOODIX_IC_INFO_ADDR			0x10070
+
+
+enum brl_request_code {
+	BRL_REQUEST_CODE_CONFIG = 0x01,
+	BRL_REQUEST_CODE_REF_ERR = 0x02,
+	BRL_REQUEST_CODE_RESET = 0x03,
+	BRL_REQUEST_CODE_CLOCK = 0x04,
+};
+
+/*Add by T2M-mingwu.zhang for FP5-538 remarks: TP/LCD Device Information Development.[Begin]*/
+#ifdef CONFIG_EMKIT_INFO
+char emkit_buf[256] = {0,};
+#endif
+/*Add by T2M-mingwu.zhang [End]*/
+
+static int brl_select_spi_mode(struct goodix_ts_core *cd)
+{
+	int ret;
+	int i;
+	u8 w_value = GOODIX_SPI_NORMAL_MODE_0;
+	u8 r_value;
+
+	if (cd->bus->bus_type == GOODIX_BUS_TYPE_I2C ||
+			cd->bus->ic_type != IC_TYPE_BERLIN_A)
+		return 0;
+
+ts_err("zmw---brl_select_spi_mode");
+
+	for (i = 0; i < GOODIX_RETRY_5; i++) {
+		cd->hw_ops->write(cd, GOODIX_SPI_MODE_REG,
+				&w_value, 1);
+		ret = cd->hw_ops->read(cd, GOODIX_SPI_MODE_REG,
+				&r_value, 1);
+		if (!ret && r_value == w_value)
+			return 0;
+	}
+	ts_err("failed switch SPI mode, ret:%d r_value:%02x", ret, r_value);
+	return -EINVAL;
+}
+
+static int brl_dev_confirm(struct goodix_ts_core *cd)
+{
+	struct goodix_ts_hw_ops *hw_ops = cd->hw_ops;
+	int ret = 0;
+	int retry = GOODIX_RETRY_3;
+	u8 tx_buf[8] = {0};
+	u8 rx_buf[8] = {0};
+	u8 i =0;
+
+/*Add by T2M-mingwu.zhang for FP5-195 remarks: Double click on driver update.[Begin]*/	
+return ret;
+/*Add by T2M-mingwu.zhang [End]*/
+
+	if (cd->bus->ic_type == IC_TYPE_BERLIN_A &&
+			cd->bus->bus_type == GOODIX_BUS_TYPE_SPI)
+		return brl_select_spi_mode(cd);
+
+	memset(tx_buf, DEV_CONFIRM_VAL, sizeof(tx_buf));
+
+	for(i=0;i<8;i++)
+		ts_err("zmw---tx_buf[%d]",tx_buf[i]);	
+
+	while (retry--) {
+		ret = hw_ops->write(cd, BOOTOPTION_ADDR,
+			tx_buf, sizeof(tx_buf));
+		if (ret < 0)
+			return ret;
+		ret = hw_ops->read(cd, BOOTOPTION_ADDR,
+			rx_buf, sizeof(rx_buf));
+		if (ret < 0)
+			return ret;
+		if (!memcmp(tx_buf, rx_buf, sizeof(tx_buf)))
+			break;
+		usleep_range(5000, 5100);
+	}
+
+	if (retry < 0) {
+		ts_err("device confirm failed, rx_buf:%*ph", 8, rx_buf);
+		return -EINVAL;
+	}
+
+	ts_info("device connected");
+	return ret;
+}
+
+static int brl_reset_after(struct goodix_ts_core *cd)
+{
+	u8 reg_val[2] = {0};
+	u8 temp_buf[12] = {0};
+	int ret;
+	int retry;
+
+	if (cd->bus->ic_type != IC_TYPE_BERLIN_A)
+		return 0;
+
+	ts_info("IN");
+
+	/* select spi mode */
+/*Add by T2M-mingwu.zhang for FP5-195 remarks: Double click on driver update.[Begin]*/	
+	ret = brl_select_spi_mode(cd);
+	if (ret < 0)
+		return ret;
+/*Add by T2M-mingwu.zhang [End]*/
+
+	/* hold cpu */
+	retry = GOODIX_RETRY_10;
+	while (retry--) {
+		reg_val[0] = 0x01;
+		reg_val[1] = 0x00;
+		ret = cd->hw_ops->write(cd, HOLD_CPU_REG_W, reg_val, 2);
+		ret |= cd->hw_ops->read(cd, HOLD_CPU_REG_R, &temp_buf[0], 4);
+		ret |= cd->hw_ops->read(cd, HOLD_CPU_REG_R, &temp_buf[4], 4);
+		ret |= cd->hw_ops->read(cd, HOLD_CPU_REG_R, &temp_buf[8], 4);
+		if (!ret && !memcmp(&temp_buf[0], &temp_buf[4], 4) &&
+			!memcmp(&temp_buf[4], &temp_buf[8], 4) &&
+			!memcmp(&temp_buf[0], &temp_buf[8], 4)) {
+			break;
+		}
+	}
+	if (retry < 0) {
+		ts_err("failed to hold cpu, status:%*ph", 12, temp_buf);
+		return -EINVAL;
+	}
+
+	/* enable sta0 clk */
+	retry = GOODIX_RETRY_5;
+	while (retry--) {
+		reg_val[0] = GOODIX_CLK_STA0_ENABLE;
+		ret = cd->hw_ops->write(cd, GOODIX_REG_CLK_STA0, reg_val, 1);
+		ret |= cd->hw_ops->read(cd, GOODIX_REG_CLK_STA0, temp_buf, 1);
+		if (!ret && temp_buf[0] == GOODIX_CLK_STA0_ENABLE)
+			break;
+	}
+	if (retry < 0) {
+		ts_err("failed to enable group0 clock, ret:%d status:%02x",
+				ret, temp_buf[0]);
+		return -EINVAL;
+	}
+
+	/* enable sta1 clk */
+	retry = GOODIX_RETRY_5;
+	while (retry--) {
+		reg_val[0] = GOODIX_CLK_STA1_ENABLE;
+		ret = cd->hw_ops->write(cd, GOODIX_REG_CLK_STA1, reg_val, 1);
+		ret |= cd->hw_ops->read(cd, GOODIX_REG_CLK_STA1, temp_buf, 1);
+		if (!ret && temp_buf[0] == GOODIX_CLK_STA1_ENABLE)
+			break;
+	}
+	if (retry < 0) {
+		ts_err("failed to enable group1 clock, ret:%d status:%02x",
+				ret, temp_buf[0]);
+		return -EINVAL;
+	}
+
+	/* set D12 level */
+	retry = GOODIX_RETRY_5;
+	while (retry--) {
+		reg_val[0] = GOODIX_TRIM_D12_LEVEL;
+		ret = cd->hw_ops->write(cd, GOODIX_REG_TRIM_D12, reg_val, 1);
+		ret |= cd->hw_ops->read(cd, GOODIX_REG_TRIM_D12, temp_buf, 1);
+		if (!ret && temp_buf[0] == GOODIX_TRIM_D12_LEVEL)
+			break;
+	}
+	if (retry < 0) {
+		ts_err("failed to set D12, ret:%d status:%02x",
+				ret, temp_buf[0]);
+		return -EINVAL;
+	}
+
+	usleep_range(5000, 5100);
+	/* soft reset */
+	reg_val[0] = GOODIX_RESET_EN;
+	ret = cd->hw_ops->write(cd, GOODIX_REG_RESET, reg_val, 1);
+	if (ret < 0)
+		return ret;
+
+	/* select spi mode */
+	ret = brl_select_spi_mode(cd);
+	if (ret < 0)
+		return ret;
+
+	ts_info("OUT");
+
+	return 0;
+}
+
+static int brl_power_on(struct goodix_ts_core *cd, bool on)
+{
+	int ret = 0;
+	int iovdd_gpio = cd->board_data.iovdd_gpio;
+	int avdd_gpio = cd->board_data.avdd_gpio;
+	int reset_gpio = cd->board_data.reset_gpio;
+
+ts_err("zmw---brl_power_on---op");
+	if (on) {
+		if (avdd_gpio > 0) {
+			gpio_direction_output(avdd_gpio, 1);
+		} else if (cd->avdd) {
+			ret = regulator_enable(cd->avdd);
+ts_err("zmw---brl_power_on---222");			
+			if (ret < 0) {
+				ts_err("Failed to enable avdd:%d", ret);
+				goto power_off;
+			}
+		}
+
+		usleep_range(3000, 3100);
+		
+		if (iovdd_gpio > 0) {
+			gpio_direction_output(iovdd_gpio, 1);
+		} else if (cd->iovdd) {
+ts_err("zmw---brl_power_on---111");			
+			ret = regulator_enable(cd->iovdd);
+			if (ret < 0) {
+				ts_err("Failed to enable iovdd:%d", ret);
+				goto power_off;
+			}
+		}
+		usleep_range(15000, 15100);
+		gpio_direction_output(reset_gpio, 1);
+		usleep_range(4000, 4100);
+		msleep(GOODIX_NORMAL_RESET_DELAY_MS);	
+			
+		ret = brl_dev_confirm(cd);
+ts_err("zmw---brl_power_on---333");		
+		if (ret < 0)
+			goto power_off;
+		ret = brl_reset_after(cd);
+ts_err("zmw---brl_power_on---444");		
+		if (ret < 0)
+			goto power_off;
+
+		msleep(GOODIX_NORMAL_RESET_DELAY_MS);
+		return 0;
+	}
+
+power_off:
+	gpio_direction_output(reset_gpio, 0);
+	if (iovdd_gpio > 0)
+		gpio_direction_output(iovdd_gpio, 0);
+	else if (cd->iovdd)
+		regulator_disable(cd->iovdd);
+	if (avdd_gpio > 0)
+		gpio_direction_output(avdd_gpio, 0);
+	else if (cd->avdd)
+		regulator_disable(cd->avdd);
+	return ret;
+}
+
+int brl_suspend(struct goodix_ts_core *cd)
+{
+	u32 cmd_reg = cd->ic_info.misc.cmd_addr;
+	u8 sleep_cmd[] = {0x00, 0x00, 0x04, 0x84, 0x88, 0x00};
+
+	return cd->hw_ops->write(cd, cmd_reg, sleep_cmd, sizeof(sleep_cmd));
+}
+
+int brl_resume(struct goodix_ts_core *cd)
+{
+	return cd->hw_ops->reset(cd, GOODIX_NORMAL_RESET_DELAY_MS);
+}
+
+#define GOODIX_GESTURE_CMD_BA	0x12
+#define GOODIX_GESTURE_CMD		0xA6
+int brl_gesture(struct goodix_ts_core *cd, int gesture_type)
+{
+	struct goodix_ts_cmd cmd;
+	u32 type = ~(cd->gesture_type);
+
+	if (cd->bus->ic_type == IC_TYPE_BERLIN_A)
+		cmd.cmd = GOODIX_GESTURE_CMD_BA;
+	else
+		cmd.cmd = GOODIX_GESTURE_CMD;
+	cmd.len = 6;
+	cmd.data[0] = type & 0xFF;
+	cmd.data[1] = (type >> 8) & 0xFF;
+	if (cd->hw_ops->send_cmd(cd, &cmd))
+		ts_err("failed send gesture cmd");
+
+	return 0;
+}
+
+static int brl_reset(struct goodix_ts_core *cd, int delay)
+{
+	ts_info("chip_reset");
+
+	gpio_direction_output(cd->board_data.reset_gpio, 0);
+	usleep_range(2000, 2100);
+	gpio_direction_output(cd->board_data.reset_gpio, 1);
+	if (delay < 20)
+		usleep_range(delay * 1000, delay * 1000 + 100);
+	else
+		msleep(delay);
+
+	return brl_select_spi_mode(cd);
+}
+
+static int brl_irq_enbale(struct goodix_ts_core *cd, bool enable)
+{
+	if (enable && !atomic_cmpxchg(&cd->irq_enabled, 0, 1)) {
+		enable_irq(cd->irq);
+		ts_info("Irq enabled");
+		return 0;
+	}
+
+	if (!enable && atomic_cmpxchg(&cd->irq_enabled, 1, 0)) {
+		disable_irq(cd->irq);
+		ts_info("Irq disabled");
+		return 0;
+	}
+	ts_debug("warnning: irq deepth inbalance!");
+	return 0;
+}
+
+static int brl_read(struct goodix_ts_core *cd, unsigned int addr,
+		unsigned char *data, unsigned int len)
+{
+	struct goodix_bus_interface *bus = cd->bus;
+	int ret;
+
+	ret = bus->read(bus->dev, addr, data, len);
+	if (ret == 0)
+		ts_debug("[0x%04X]:%*ph", addr, len > 24 ? 24 : len, data);
+
+	return ret;
+}
+
+static int brl_write(struct goodix_ts_core *cd, unsigned int addr,
+		 unsigned char *data, unsigned int len)
+{
+	struct goodix_bus_interface *bus = cd->bus;
+	int ret;
+
+	ret = bus->write(bus->dev, addr, data, len);
+	if (ret == 0)
+		ts_debug("[0x%04X]:%*ph", addr, len > 24 ? 24 : len, data);	
+
+	return ret;
+}
+
+/* command ack info */
+#define CMD_ACK_IDLE             0x01
+#define CMD_ACK_BUSY             0x02
+#define CMD_ACK_BUFFER_OVERFLOW  0x03
+#define CMD_ACK_CHECKSUM_ERROR   0x04
+#define CMD_ACK_OK               0x80
+
+#define GOODIX_CMD_RETRY 6
+static DEFINE_MUTEX(cmd_mutex);
+static int brl_send_cmd(struct goodix_ts_core *cd,
+	struct goodix_ts_cmd *cmd)
+{
+	int ret, retry, i;
+	struct goodix_ts_cmd cmd_ack;
+	struct goodix_ic_info_misc *misc = &cd->ic_info.misc;
+	struct goodix_ts_hw_ops *hw_ops = cd->hw_ops;
+
+ts_err("zmw---brl_send_cmd---cmd_addr=[%d] fw_buffer_addr=[%d] touch_data_addr=[%d]op",
+misc->cmd_addr,misc->fw_buffer_addr,misc->touch_data_addr);
+
+	mutex_lock(&cmd_mutex);
+
+	cmd->state = 0;
+	cmd->ack = 0;
+	goodix_append_checksum(&(cmd->buf[2]), cmd->len - 2,
+		CHECKSUM_MODE_U8_LE);
+	ts_debug("cmd data %*ph", cmd->len, &(cmd->buf[2]));
+
+	retry = 0;
+	while (retry++ < GOODIX_CMD_RETRY) {
+ts_err("zmw---brl_send_cmd---111");		
+		ret = hw_ops->write(cd, misc->cmd_addr,
+				    cmd->buf, sizeof(*cmd));
+		if (ret < 0) {
+			ts_err("failed write command");
+			goto exit;
+		}
+		for (i = 0; i < GOODIX_CMD_RETRY; i++) {
+		
+			/* check command result */
+			ret = hw_ops->read(cd, misc->cmd_addr,
+				cmd_ack.buf, sizeof(cmd_ack));
+			if (ret < 0) {
+				ts_err("failed read command ack, %d", ret);
+				goto exit;
+			}
+			ts_err("cmd ack data %*ph",
+				 (int)sizeof(cmd_ack), cmd_ack.buf);
+			if (cmd_ack.ack == CMD_ACK_OK) {
+				msleep(40);		// wait for cmd response
+				ret = 0;
+				goto exit;
+			}
+			if (cmd_ack.ack == CMD_ACK_BUSY ||
+			    cmd_ack.ack == 0x00) {
+				usleep_range(1000, 1100);
+				continue;
+			}
+ts_err("zmw---brl_send_cmd---222");	
+			if (cmd_ack.ack == CMD_ACK_BUFFER_OVERFLOW)
+				usleep_range(10000, 11000);
+			usleep_range(1000, 1100);
+			break;
+		}
+	}
+	ts_err("failed get valid cmd ack");
+	ret = -EINVAL;
+exit:
+	mutex_unlock(&cmd_mutex);
+	return ret;
+}
+
+/* read from flash */
+#define FLASH_CMD_R_START           0x09 
+#define FLASH_CMD_W_START           0x0A
+#define FLASH_CMD_RW_FINISH         0x0B
+#define FLASH_CMD_STATE_READY       0x04
+#define FLASH_CMD_STATE_CHECKERR    0x05
+#define FLASH_CMD_STATE_DENY        0x06
+#define FLASH_CMD_STATE_OKAY        0x07
+static int goodix_flash_cmd(struct goodix_ts_core *cd,
+						uint8_t cmd, uint8_t status,
+						int retry_count)
+{
+	u32 cmd_addr = cd->ic_info.misc.cmd_addr;
+	struct goodix_ts_cmd temp_cmd;
+    int ret;
+    int i;
+    u8 rcv_buf[2];
+
+	temp_cmd.state = 0;
+	temp_cmd.ack = 0;
+    temp_cmd.len = 4;
+    temp_cmd.cmd = cmd;
+	goodix_append_checksum(&temp_cmd.buf[2], temp_cmd.len - 2,
+		CHECKSUM_MODE_U8_LE);
+	ret = brl_write(cd, cmd_addr, temp_cmd.buf, temp_cmd.len + 2);
+	if (ret < 0) {
+		ts_err("send flash cmd[%x] failed", cmd);
+		return ret;
+	}
+
+    for (i = 0; i < retry_count; i++) {
+		msleep(20);
+        ret = brl_read(cd, cmd_addr, rcv_buf, 2);
+        if (rcv_buf[0] == status && rcv_buf[1] == 0x80)
+            return 0;
+    }
+
+    ts_err("r_sta[0x%x] != status[0x%x]", rcv_buf[0], status);
+    return -EINVAL;
+}
+
+static int brl_flash_read(struct goodix_ts_core *cd,
+						unsigned int addr, unsigned char *buf,
+						unsigned int len)
+{
+    int i;
+    int ret;
+    u8 *tmp_buf;
+    u32 buffer_addr = cd->ic_info.misc.fw_buffer_addr;
+    struct goodix_ts_cmd temp_cmd;
+    uint32_t checksum = 0;
+    struct flash_head head_info;
+    u8 *p = (u8 *)&head_info.address;
+
+    tmp_buf = kzalloc(len + sizeof(head_info), GFP_KERNEL);
+    if (!tmp_buf)
+        return -ENOMEM;
+
+    head_info.address = cpu_to_le32(addr);
+    head_info.length = cpu_to_le32(len);
+    for (i = 0; i < 8; i += 2)
+        checksum += p[i] | (p[i + 1] << 8);
+    head_info.checksum = checksum;
+
+    ret = goodix_flash_cmd(cd, FLASH_CMD_R_START, FLASH_CMD_STATE_READY, 15);
+    if (ret < 0) {
+        ts_err("failed enter flash read state");
+        goto read_end;
+    }
+
+    ret = brl_write(cd, buffer_addr, (u8 *)&head_info, sizeof(head_info));
+    if (ret < 0) {
+        ts_err("failed write flash head info");
+        goto read_end;   
+    }
+
+    ret = goodix_flash_cmd(cd, FLASH_CMD_RW_FINISH, FLASH_CMD_STATE_OKAY, 50);
+    if (ret) {
+        ts_err("faild read flash ready state");
+        goto read_end;
+    }
+
+    ret = brl_read(cd, buffer_addr, tmp_buf, len + sizeof(head_info));
+    if (ret < 0) {
+        ts_err("failed read data len %lu", len + sizeof(head_info));
+        goto read_end;
+    }
+
+    checksum = 0;
+    for (i = 0; i < len + sizeof(head_info) - 4; i += 2)
+        checksum += tmp_buf[4 + i] | (tmp_buf[5 + i] << 8);
+
+    if (checksum != le32_to_cpup((__le32 *)tmp_buf)) {
+        ts_err("read back data checksum error");
+        ret = -EINVAL;
+        goto read_end;
+    }
+
+    memcpy(buf, tmp_buf + sizeof(head_info), len);
+    ret = 0;    
+read_end:
+    temp_cmd.len = 4;
+    temp_cmd.cmd = 0x0C;
+    brl_send_cmd(cd, &temp_cmd);
+    return ret;
+}
+
+#pragma  pack(1)
+struct goodix_config_head {
+	union {
+		struct {
+			u8 panel_name[8];
+			u8 fw_pid[8];
+			u8 fw_vid[4];
+			u8 project_name[8];
+			u8 file_ver[2];
+			u32 cfg_id;
+			u8 cfg_ver;
+			u8 cfg_time[8];
+			u8 reserved[15];
+			u8 flag;
+			u16 cfg_len;
+			u8 cfg_num;
+			u16 checksum;
+		};
+		u8 buf[64];
+	};
+};
+#pragma pack()
+
+#define CONFIG_CND_LEN			4
+#define CONFIG_CMD_START		0x04
+#define CONFIG_CMD_WRITE		0x05
+#define CONFIG_CMD_EXIT			0x06
+#define CONFIG_CMD_READ_START	0x07
+#define CONFIG_CMD_READ_EXIT	0x08
+
+#define CONFIG_CMD_STATUS_PASS	0x80
+#define CONFIG_CMD_WAIT_RETRY	20
+
+static int wait_cmd_status(struct goodix_ts_core *cd,
+	u8 target_status, int retry)
+{
+	struct goodix_ts_cmd cmd_ack;
+	struct goodix_ic_info_misc *misc = &cd->ic_info.misc;
+	struct goodix_ts_hw_ops *hw_ops = cd->hw_ops;
+	int i, ret;
+
+	for (i = 0; i < retry; i++) {
+		ret = hw_ops->read(cd, misc->cmd_addr, cmd_ack.buf,
+			sizeof(cmd_ack));
+		if (!ret && cmd_ack.state == target_status) {
+			ts_debug("status check pass");
+			return 0;
+		}
+		ts_debug("cmd buf %*ph", (int)sizeof(cmd_ack), cmd_ack.buf);
+		msleep(20);
+	}
+
+	ts_err("cmd status not ready, retry %d, ack 0x%x, status 0x%x, ret %d",
+			i, cmd_ack.ack, cmd_ack.state, ret);
+	return -EINVAL;
+}
+
+static int send_cfg_cmd(struct goodix_ts_core *cd,
+	struct goodix_ts_cmd *cfg_cmd)
+{
+	int ret;
+
+	ret = cd->hw_ops->send_cmd(cd, cfg_cmd);
+	if (ret) {
+		ts_err("failed write cfg prepare cmd %d", ret);
+		return ret;
+	}
+	ret = wait_cmd_status(cd, CONFIG_CMD_STATUS_PASS,
+		CONFIG_CMD_WAIT_RETRY);
+	if (ret) {
+		ts_err("failed wait for fw ready for config, %d", ret);
+		return ret;
+	}
+	return 0;
+}
+
+static int brl_send_config(struct goodix_ts_core *cd, u8 *cfg, int len)
+{
+	int ret;
+	u8 *tmp_buf;
+	struct goodix_ts_cmd cfg_cmd;
+	struct goodix_ic_info_misc *misc = &cd->ic_info.misc;
+	struct goodix_ts_hw_ops *hw_ops = cd->hw_ops;
+
+	if (len > misc->fw_buffer_max_len) {
+		ts_err("config len exceed limit %d > %d",
+			len, misc->fw_buffer_max_len);
+		return -EINVAL;
+	}
+
+	tmp_buf = kzalloc(len, GFP_KERNEL);
+	if (!tmp_buf)
+		return -ENOMEM;
+
+	cfg_cmd.len = CONFIG_CND_LEN;
+	cfg_cmd.cmd = CONFIG_CMD_START;
+	ret = send_cfg_cmd(cd, &cfg_cmd);
+	if (ret) {
+		ts_err("failed write cfg prepare cmd %d", ret);
+		goto exit;
+	}
+
+	ts_debug("try send config to 0x%x, len %d", misc->fw_buffer_addr, len);
+	ret = hw_ops->write(cd, misc->fw_buffer_addr, cfg, len);
+	if (ret) {
+		ts_err("failed write config data, %d", ret);
+		goto exit;
+	}
+	ret = hw_ops->read(cd, misc->fw_buffer_addr, tmp_buf, len);
+	if (ret) {
+		ts_err("failed read back config data");
+		goto exit;
+	}
+
+	if (memcmp(cfg, tmp_buf, len)) {
+		ts_err("config data read back compare file");
+		ret = -EINVAL;
+		goto exit;
+	}
+	/* notify fw for recive config */
+	memset(cfg_cmd.buf, 0, sizeof(cfg_cmd));
+	cfg_cmd.len = CONFIG_CND_LEN;
+	cfg_cmd.cmd = CONFIG_CMD_WRITE;
+	ret = send_cfg_cmd(cd, &cfg_cmd);
+	if (ret)
+		ts_err("failed send config data ready cmd %d", ret);
+
+exit:
+	memset(cfg_cmd.buf, 0, sizeof(cfg_cmd));
+	cfg_cmd.len = CONFIG_CND_LEN;
+	cfg_cmd.cmd = CONFIG_CMD_EXIT;
+	if (send_cfg_cmd(cd, &cfg_cmd)) {
+		ts_err("failed send config write end command");
+		ret = -EINVAL;
+	}
+
+	if (!ret) {
+		ts_info("success send config");
+		msleep(100);
+	}
+
+	kfree(tmp_buf);
+	return ret;
+}
+
+/*
+ * return: return config length on succes, other wise return < 0
+ **/
+static int brl_read_config(struct goodix_ts_core *cd, u8 *cfg, int size)
+{
+	int ret;
+	struct goodix_ts_cmd cfg_cmd;
+	struct goodix_ic_info_misc *misc = &cd->ic_info.misc;
+	struct goodix_ts_hw_ops *hw_ops = cd->hw_ops;
+	struct goodix_config_head cfg_head;
+
+	if (!cfg)
+		return -EINVAL;
+
+ts_err("zmw---brl_read_config---op");
+
+	cfg_cmd.len = CONFIG_CND_LEN;
+	cfg_cmd.cmd = CONFIG_CMD_READ_START;
+	ret = send_cfg_cmd(cd, &cfg_cmd);
+	if (ret) {
+		ts_err("failed send config read prepare command");
+		return ret;
+	}
+
+	ret = hw_ops->read(cd, misc->fw_buffer_addr,
+			   cfg_head.buf, sizeof(cfg_head));
+	if (ret) {
+		ts_err("failed read config head %d", ret);
+		goto exit;
+	}
+
+	if (checksum_cmp(cfg_head.buf, sizeof(cfg_head), CHECKSUM_MODE_U8_LE)) {
+		ts_err("config head checksum error");
+		ret = -EINVAL;
+		goto exit;
+	}
+
+	cfg_head.cfg_len = le16_to_cpu(cfg_head.cfg_len);
+	if (cfg_head.cfg_len > misc->fw_buffer_max_len ||
+	    cfg_head.cfg_len > size) {
+		ts_err("cfg len exceed buffer size %d > %d", cfg_head.cfg_len,
+			 misc->fw_buffer_max_len);
+		ret = -EINVAL;
+		goto exit;
+	}
+
+	memcpy(cfg, cfg_head.buf, sizeof(cfg_head));
+	ret = hw_ops->read(cd, misc->fw_buffer_addr + sizeof(cfg_head),
+			   cfg + sizeof(cfg_head), cfg_head.cfg_len);
+	if (ret) {
+		ts_err("failed read cfg pack, %d", ret);
+		goto exit;
+	}
+
+	ts_info("config len %d", cfg_head.cfg_len);
+	if (checksum_cmp(cfg + sizeof(cfg_head),
+			 cfg_head.cfg_len, CHECKSUM_MODE_U16_LE)) {
+		ts_err("config body checksum error");
+		ret = -EINVAL;
+		goto exit;
+	}
+	ts_info("success read config data: len %zu",
+		cfg_head.cfg_len + sizeof(cfg_head));
+exit:
+	memset(cfg_cmd.buf, 0, sizeof(cfg_cmd));
+	cfg_cmd.len = CONFIG_CND_LEN;
+	cfg_cmd.cmd = CONFIG_CMD_READ_EXIT;
+	if (send_cfg_cmd(cd, &cfg_cmd)) {
+		ts_err("failed send config read finish command");
+		ret = -EINVAL;
+	}
+	if (ret)
+		return -EINVAL;
+	return cfg_head.cfg_len + sizeof(cfg_head);
+}
+
+/*
+ *	return: 0 for no error.
+ *	GOODIX_EBUS when encounter a bus error
+ *	GOODIX_ECHECKSUM version checksum error
+ *	GOODIX_EVERSION  patch ID compare failed,
+ *	in this case the sensorID is valid.
+ */
+static int brl_read_version(struct goodix_ts_core *cd,
+			struct goodix_fw_version *version)
+{
+	int ret, i;
+	u32 fw_addr;
+	struct goodix_ts_hw_ops *hw_ops = cd->hw_ops;
+	u8 buf[sizeof(struct goodix_fw_version)] = {0};
+	u8 temp_pid[8] = {0};
+/*Add by T2M-mingwu.zhang for FP5-538 remarks: TP/LCD Device Information Development.[Begin]*/	
+#ifdef CONFIG_EMKIT_INFO	
+	int cnt = -EINVAL;
+#endif
+/*Add by T2M-mingwu.zhang [End]*/
+
+	if (cd->bus->ic_type == IC_TYPE_BERLIN_A)
+		fw_addr = FW_VERSION_INFO_ADDR_BRA;
+	else
+		fw_addr = FW_VERSION_INFO_ADDR;
+
+	for (i = 0; i < 2; i++) {
+		ret = hw_ops->read(cd, fw_addr, buf, sizeof(buf));
+		if (ret) {
+			ts_info("read fw version: %d, retry %d", ret, i);
+			ret = -GOODIX_EBUS;
+			usleep_range(5000, 5100);
+			continue;
+		}
+
+		if (!checksum_cmp(buf, sizeof(buf), CHECKSUM_MODE_U8_LE))
+			break;
+
+		ts_info("invalid fw version: checksum error!");
+		ts_info("fw version:%*ph", (int)sizeof(buf), buf);
+		ret = -GOODIX_ECHECKSUM;
+		usleep_range(10000, 11000);
+	}
+	if (ret) {
+		ts_err("failed get valied fw version");
+		return ret;
+	}
+	memcpy(version, buf, sizeof(*version));
+	memcpy(temp_pid, version->rom_pid, sizeof(version->rom_pid));
+
+/*Add by T2M-mingwu.zhang for FP5-538 remarks: TP/LCD Device Information Development.[Begin]*/	
+#ifdef CONFIG_EMKIT_INFO
+		cnt = snprintf(&emkit_buf[0], 256,"touch_ic:%s\n", version->patch_pid);
+		cnt += snprintf(&emkit_buf[cnt], 256,"fw_ver:%02x%02x%02x%02x\n", 
+			version->patch_vid[0],version->patch_vid[1],version->patch_vid[2],version->patch_vid[3]);
+		cnt += snprintf(&emkit_buf[cnt], 256,"vendor:%s\n", GOODIX_NAME);
+        SetModuleName(MODULE_TOUCH, emkit_buf, __FUNCTION__);
+#endif
+/*Add by T2M-mingwu.zhang [End]*/
+	
+	ts_info("rom_pid:%s", temp_pid);
+	ts_info("rom_vid:%*ph", (int)sizeof(version->rom_vid),
+		version->rom_vid);
+	ts_info("pid:%s", version->patch_pid);
+	ts_info("vid:%*ph", (int)sizeof(version->patch_vid),
+		version->patch_vid);
+	ts_info("sensor_id:%d", version->sensor_id);
+
+	return 0;
+}
+
+#define LE16_TO_CPU(x)  (x = le16_to_cpu(x))
+#define LE32_TO_CPU(x)  (x = le32_to_cpu(x))
+static int convert_ic_info(struct goodix_ts_core *cd, const u8 *data)
+{
+	int i;
+	struct goodix_ic_info *info = &cd->ic_info;
+	struct goodix_ic_info_version *version = &info->version;
+	struct goodix_ic_info_feature *feature = &info->feature;
+	struct goodix_ic_info_param *parm = &info->parm;
+	struct goodix_ic_info_misc *misc = &info->misc;
+	struct goodix_ic_info_other *other = &info->other;
+
+	info->length = le16_to_cpup((__le16 *)data);
+
+	data += 2;
+	memcpy(version, data, sizeof(*version));
+	version->config_id = le32_to_cpu(version->config_id);
+
+	data += sizeof(struct goodix_ic_info_version);
+	memcpy(feature, data, sizeof(*feature));
+	feature->freqhop_feature =
+		le16_to_cpu(feature->freqhop_feature);
+	feature->calibration_feature =
+		le16_to_cpu(feature->calibration_feature);
+	feature->gesture_feature =
+		le16_to_cpu(feature->gesture_feature);
+	feature->side_touch_feature =
+		le16_to_cpu(feature->side_touch_feature);
+	feature->stylus_feature =
+		le16_to_cpu(feature->stylus_feature);
+
+	data += sizeof(struct goodix_ic_info_feature);
+	parm->drv_num = *(data++);
+	parm->sen_num = *(data++);
+	parm->button_num = *(data++);
+	parm->force_num = *(data++);
+	parm->active_scan_rate_num = *(data++);
+	if (parm->active_scan_rate_num > MAX_SCAN_RATE_NUM) {
+		ts_err("invalid scan rate num %d > %d",
+			parm->active_scan_rate_num, MAX_SCAN_RATE_NUM);
+		return -EINVAL;
+	}
+	for (i = 0; i < parm->active_scan_rate_num; i++)
+		parm->active_scan_rate[i] =
+			le16_to_cpup((__le16 *)(data + i * 2));
+
+	data += parm->active_scan_rate_num * 2;
+	parm->mutual_freq_num = *(data++);
+	if (parm->mutual_freq_num > MAX_SCAN_FREQ_NUM) {
+		ts_err("invalid mntual freq num %d > %d",
+			parm->mutual_freq_num, MAX_SCAN_FREQ_NUM);
+		return -EINVAL;
+	}
+	for (i = 0; i < parm->mutual_freq_num; i++)
+		parm->mutual_freq[i] =
+			le16_to_cpup((__le16 *)(data + i * 2));
+
+	data += parm->mutual_freq_num * 2;
+	parm->self_tx_freq_num = *(data++);
+	if (parm->self_tx_freq_num > MAX_SCAN_FREQ_NUM) {
+		ts_err("invalid tx freq num %d > %d",
+			parm->self_tx_freq_num, MAX_SCAN_FREQ_NUM);
+		return -EINVAL;
+	}
+	for (i = 0; i < parm->self_tx_freq_num; i++)
+		parm->self_tx_freq[i] =
+			le16_to_cpup((__le16 *)(data + i * 2));
+
+	data += parm->self_tx_freq_num * 2;
+	parm->self_rx_freq_num = *(data++);
+	if (parm->self_rx_freq_num > MAX_SCAN_FREQ_NUM) {
+		ts_err("invalid rx freq num %d > %d",
+			parm->self_rx_freq_num, MAX_SCAN_FREQ_NUM);
+		return -EINVAL;
+	}
+	for (i = 0; i < parm->self_rx_freq_num; i++)
+		parm->self_rx_freq[i] =
+			le16_to_cpup((__le16 *)(data + i * 2));
+
+	data += parm->self_rx_freq_num * 2;
+	parm->stylus_freq_num = *(data++);
+	if (parm->stylus_freq_num > MAX_FREQ_NUM_STYLUS) {
+		ts_err("invalid stylus freq num %d > %d",
+			parm->stylus_freq_num, MAX_FREQ_NUM_STYLUS);
+		return -EINVAL;
+	}
+	for (i = 0; i < parm->stylus_freq_num; i++)
+		parm->stylus_freq[i] =
+			le16_to_cpup((__le16 *)(data + i * 2));
+
+	data += parm->stylus_freq_num * 2;
+	memcpy(misc, data, sizeof(*misc));
+	misc->cmd_addr = le32_to_cpu(misc->cmd_addr);
+	misc->cmd_max_len = le16_to_cpu(misc->cmd_max_len);
+	misc->cmd_reply_addr = le32_to_cpu(misc->cmd_reply_addr);
+	misc->cmd_reply_len = le16_to_cpu(misc->cmd_reply_len);
+	misc->fw_state_addr = le32_to_cpu(misc->fw_state_addr);
+	misc->fw_state_len = le16_to_cpu(misc->fw_state_len);
+	misc->fw_buffer_addr = le32_to_cpu(misc->fw_buffer_addr);
+	misc->fw_buffer_max_len = le16_to_cpu(misc->fw_buffer_max_len);
+	misc->frame_data_addr = le32_to_cpu(misc->frame_data_addr);
+	misc->frame_data_head_len = le16_to_cpu(misc->frame_data_head_len);
+
+	misc->fw_attr_len = le16_to_cpu(misc->fw_attr_len);
+	misc->fw_log_len = le16_to_cpu(misc->fw_log_len);
+	misc->stylus_struct_len = le16_to_cpu(misc->stylus_struct_len);
+	misc->mutual_struct_len = le16_to_cpu(misc->mutual_struct_len);
+	misc->self_struct_len = le16_to_cpu(misc->self_struct_len);
+	misc->noise_struct_len = le16_to_cpu(misc->noise_struct_len);
+	misc->touch_data_addr = le32_to_cpu(misc->touch_data_addr);
+	misc->touch_data_head_len = le16_to_cpu(misc->touch_data_head_len);
+	misc->point_struct_len = le16_to_cpu(misc->point_struct_len);
+	LE32_TO_CPU(misc->mutual_rawdata_addr);
+	LE32_TO_CPU(misc->mutual_diffdata_addr);
+	LE32_TO_CPU(misc->mutual_refdata_addr);
+	LE32_TO_CPU(misc->self_rawdata_addr);
+	LE32_TO_CPU(misc->self_diffdata_addr);
+	LE32_TO_CPU(misc->self_refdata_addr);
+	LE32_TO_CPU(misc->iq_rawdata_addr);
+	LE32_TO_CPU(misc->iq_refdata_addr);
+	LE32_TO_CPU(misc->im_rawdata_addr);
+	LE16_TO_CPU(misc->im_rawdata_len);
+	LE32_TO_CPU(misc->noise_rawdata_addr);
+	LE16_TO_CPU(misc->noise_rawdata_len);
+	LE32_TO_CPU(misc->stylus_rawdata_addr);
+	LE16_TO_CPU(misc->stylus_rawdata_len);
+	LE32_TO_CPU(misc->noise_data_addr);
+	LE32_TO_CPU(misc->esd_addr);
+
+	data += sizeof(*misc);
+	memcpy((u8 *)other, data, sizeof(*other));
+
+	return 0;
+}
+
+static void goodix_compatible_ic_info(struct goodix_ts_core *cd)
+{
+	struct goodix_ic_info_v2 *info_v2 = &cd->ic_info_v2;
+	struct goodix_ic_info *info = &cd->ic_info;
+	int i;
+
+	info->length = info_v2->length;
+	info->version.info_customer_id = info_v2->info_customer_id;
+	info->version.info_version_id = info_v2->info_version_id;
+	info->version.ic_die_id = info_v2->version.ic_die_id;
+	info->version.ic_version_id = info_v2->version.ic_version_id;
+	info->version.config_id = info_v2->version.config_id;
+	info->version.config_version = info_v2->version.config_version;
+	info->version.frame_data_customer_id = info_v2->version.frame_data_customer_id;
+	info->version.frame_data_version_id = info_v2->version.frame_data_version_id;
+	info->version.touch_data_customer_id = info_v2->version.touch_data_customer_id;
+	info->version.touch_data_version_id = info_v2->version.touch_data_version_id;
+
+	info->feature.freqhop_feature = info_v2->sample.freqhop_feature;
+	info->feature.calibration_feature = info_v2->sample.calibration_feature;
+	info->feature.gesture_feature = info_v2->sample.gesture_feature;
+	// info->feature.side_touch_feature = 0;
+	info->feature.stylus_feature = info_v2->sample.stylus_feature;
+
+	info->parm.drv_num = info_v2->sample.drv_num;
+	info->parm.sen_num = info_v2->sample.sen_num;
+	info->parm.button_num = info_v2->sample.button_num;
+	info->parm.force_num = info_v2->sample.force_num;
+	info->parm.active_scan_rate_num = info_v2->sample.active_scan_rate_num;
+	for (i = 0; i < info->parm.active_scan_rate_num; i++)
+		info->parm.active_scan_rate[i] = info_v2->sample.active_scan_rate[i];
+	info->parm.mutual_freq_num = info_v2->sample.mutual_freq_num;
+	for (i = 0; i < info->parm.mutual_freq_num; i++)
+		info->parm.mutual_freq[i] = info_v2->sample.mutual_freq[i];
+	info->parm.self_tx_freq_num = info_v2->sample.self_tx_freq_num;
+	for (i = 0; i < info->parm.self_tx_freq_num; i++)
+		info->parm.self_tx_freq[i] = info_v2->sample.self_tx_freq[i];
+	info->parm.self_rx_freq_num = info_v2->sample.self_rx_freq_num;
+	for (i = 0; i < info->parm.self_rx_freq_num; i++)
+		info->parm.self_rx_freq[i] = info_v2->sample.self_rx_freq[i];
+	info->parm.stylus_freq_num = info_v2->sample.stylus_freq_num;
+	for (i = 0; i < info->parm.stylus_freq_num; i++)
+		info->parm.stylus_freq[i] = info_v2->sample.stylus_freq[i];
+	
+	info->misc.cmd_addr = info_v2->address.cmd_addr;
+	info->misc.cmd_max_len = info_v2->address.cmd_max_len;
+	info->misc.cmd_reply_addr = info_v2->address.cmd_reply_addr;
+	info->misc.cmd_reply_len = info_v2->address.cmd_reply_len;
+	info->misc.fw_state_addr = info_v2->address.fw_state_addr;
+	info->misc.fw_state_len = info_v2->address.fw_state_len;
+	info->misc.fw_buffer_addr = info_v2->address.fw_buffer_addr;
+	info->misc.fw_buffer_max_len = info_v2->address.fw_buffer_max_len;
+	info->misc.frame_data_addr = info_v2->address.frame_data_addr;
+	info->misc.frame_data_head_len = info_v2->address.frame_data_head_len;
+	info->misc.fw_attr_len = info_v2->address.fw_attr_len;
+	info->misc.fw_log_len = info_v2->address.fw_log_len;
+	info->misc.pack_max_num = info_v2->address.pack_max_num;
+	info->misc.pack_compress_version = info_v2->address.pack_compress_version;
+	info->misc.stylus_struct_len = info_v2->address.stylus_struct_len;
+	info->misc.mutual_struct_len = info_v2->address.mutual_struct_len;
+	info->misc.self_struct_len = info_v2->address.self_struct_len;
+	info->misc.noise_struct_len = info_v2->address.noise_struct_len;
+	info->misc.touch_data_addr = info_v2->address.touch_data_addr;
+	info->misc.touch_data_head_len = info_v2->address.touch_data_head_len;
+	info->misc.point_struct_len = info_v2->address.point_struct_len;
+	info->misc.screen_real_max_x = info_v2->sample.screen_real_max_x;
+	info->misc.screen_real_max_y = info_v2->sample.screen_real_max_y;
+	info->misc.mutual_rawdata_addr = info_v2->address.mutual_rawdata_addr;
+	info->misc.mutual_diffdata_addr = info_v2->address.mutual_diffdata_addr;
+	info->misc.mutual_refdata_addr = info_v2->address.mutual_refdata_addr;
+	info->misc.self_rawdata_addr = info_v2->address.self_rawdata_addr;
+	info->misc.self_diffdata_addr = info_v2->address.self_diffdata_addr;
+	info->misc.self_refdata_addr = info_v2->address.self_refdata_addr;
+	info->misc.iq_rawdata_addr = info_v2->address.iq_rawdata_addr;
+	info->misc.iq_refdata_addr = info_v2->address.iq_refdata_addr;
+	info->misc.im_rawdata_addr = info_v2->address.im_rawdata_addr;
+	info->misc.im_rawdata_len = info_v2->address.im_rawdata_len;
+	info->misc.noise_rawdata_addr = info_v2->address.noise_rawdata_addr;
+	info->misc.noise_rawdata_len = info_v2->address.noise_rawdata_len;
+	info->misc.stylus_rawdata_addr = info_v2->address.stylus_rawdata_addr;
+	info->misc.stylus_rawdata_len = info_v2->address.stylus_rawdata_len;
+	info->misc.noise_data_addr = info_v2->address.noise_data_addr;
+	info->misc.esd_addr = info_v2->address.esd_addr;
+	info->misc.auto_scan_cmd_addr = info_v2->address.auto_scan_cmd_addr;
+	info->misc.auto_scan_info_addr = info_v2->address.auto_scan_info_addr;
+
+	info->other.normalize_k_version = info_v2->version.normalize_k_version;
+	info->other.irrigation_data_addr = info_v2->address.irrigation_data_addr;
+	info->other.algo_debug_data_addr = info_v2->address.algo_debug_data_addr;
+	info->other.algo_debug_data_len = info_v2->address.algo_debug_data_len;
+	info->other.update_sync_data_addr = info_v2->address.update_sync_data_addr;
+	info->other.screen_max_x = info_v2->sample.screen_max_x;
+	info->other.screen_max_y = info_v2->sample.screen_max_y;
+}
+
+static int convert_ic_info_v2(struct goodix_ts_core *cd, const u8 *data)
+{
+	struct goodix_ic_info_v2 *info_v2 = &cd->ic_info_v2;
+	int i;
+
+	info_v2->length = le16_to_cpup((__le16 *)data);
+	info_v2->info_customer_id = data[2];
+	info_v2->info_version_id = data[3];
+
+	// sub version
+	data += 4;
+	info_v2->version.length = le16_to_cpup((__le16 *)data);
+	memcpy((u8 *)&info_v2->version, data, info_v2->version.length);
+
+	// sub sample
+	data += info_v2->version.length;
+	info_v2->sample.length = le16_to_cpup((__le16 *)data);
+	memcpy((u8 *)&info_v2->sample, data, 16);
+
+	data += 16;
+	info_v2->sample.active_scan_rate_num = *data++;
+	if (info_v2->sample.active_scan_rate_num > MAX_SCAN_RATE_NUM) {
+		ts_err("invalid scan rate num %d > %d",
+			info_v2->sample.active_scan_rate_num, MAX_SCAN_RATE_NUM);
+		return -EINVAL;
+	}
+	for (i = 0; i < info_v2->sample.active_scan_rate_num; i++)
+		info_v2->sample.active_scan_rate[i] = le16_to_cpup((__le16 *)(data + i * 2));
+
+	data += info_v2->sample.active_scan_rate_num * 2;
+	info_v2->sample.mutual_freq_num = *data++;
+	if (info_v2->sample.mutual_freq_num > MAX_SCAN_FREQ_NUM) {
+		ts_err("invalid mntual freq num %d > %d",
+			info_v2->sample.mutual_freq_num, MAX_SCAN_FREQ_NUM);
+		return -EINVAL;
+	}
+	for (i = 0; i < info_v2->sample.mutual_freq_num; i++)
+		info_v2->sample.mutual_freq[i] = le16_to_cpup((__le16 *)(data + i * 2));
+
+	data += info_v2->sample.mutual_freq_num * 2;
+	info_v2->sample.self_tx_freq_num = *data++;
+	if (info_v2->sample.self_tx_freq_num > MAX_SCAN_FREQ_NUM) {
+		ts_err("invalid tx freq num %d > %d",
+			info_v2->sample.self_tx_freq_num, MAX_SCAN_FREQ_NUM);
+		return -EINVAL;
+	}
+	for (i = 0; i < info_v2->sample.self_tx_freq_num; i++)
+		info_v2->sample.self_tx_freq[i] = le16_to_cpup((__le16 *)(data + i * 2));
+
+	data += info_v2->sample.self_tx_freq_num * 2;
+	info_v2->sample.self_rx_freq_num = *data++;
+	if (info_v2->sample.self_rx_freq_num > MAX_SCAN_FREQ_NUM) {
+		ts_err("invalid rx freq num %d > %d",
+			info_v2->sample.self_rx_freq_num, MAX_SCAN_FREQ_NUM);
+		return -EINVAL;
+	}
+	for (i = 0; i < info_v2->sample.self_rx_freq_num; i++)
+		info_v2->sample.self_rx_freq[i] = le16_to_cpup((__le16 *)(data + i * 2));
+
+	data += info_v2->sample.self_rx_freq_num * 2;
+	info_v2->sample.stylus_freq_num = *data++;
+	if (info_v2->sample.stylus_freq_num > MAX_FREQ_NUM_STYLUS) {
+		ts_err("invalid stylus freq num %d > %d",
+			info_v2->sample.stylus_freq_num, MAX_FREQ_NUM_STYLUS);
+		return -EINVAL;
+	}
+	for (i = 0; i < info_v2->sample.stylus_freq_num; i++)
+		info_v2->sample.stylus_freq[i] = le16_to_cpup((__le16 *)(data + i * 2));
+
+	data += info_v2->sample.stylus_freq_num * 2;
+	info_v2->sample.stylus_tx2_freq_num = *data++;
+	if (info_v2->sample.stylus_tx2_freq_num > MAX_FREQ_NUM_STYLUS) {
+		ts_err("invalid stylus tx2 freq num %d > %d",
+			info_v2->sample.stylus_tx2_freq_num, MAX_FREQ_NUM_STYLUS);
+		return -EINVAL;
+	}
+	for (i = 0; i < info_v2->sample.stylus_tx2_freq_num; i++)
+		info_v2->sample.stylus_tx2_freq[i] = le16_to_cpup((__le16 *)(data + i * 2));
+
+	// sub address
+	data += info_v2->sample.stylus_tx2_freq_num * 2;
+	info_v2->address.length = le16_to_cpup((__le16 *)data);
+	memcpy((u8 *)&info_v2->address, data, info_v2->address.length);
+
+	// sub customer
+	data += info_v2->address.length;
+	info_v2->customer.length = le16_to_cpup((__le16 *)data);
+	memcpy((u8 *)&info_v2->customer, data, info_v2->customer.length);
+
+	goodix_compatible_ic_info(cd);
+	return 0;
+}
+
+static int brl_get_ic_info(struct goodix_ts_core *cd,
+	struct goodix_ic_info *ic_info)
+{
+	int ret, i;
+	u16 length = 0;
+	u32 ic_addr;
+	u8 info_ver;
+	u8 afe_data[GOODIX_IC_INFO_MAX_LEN] = {0};
+	struct goodix_ts_hw_ops *hw_ops = cd->hw_ops;
+
+	if (cd->bus->ic_type == IC_TYPE_BERLIN_A)
+		ic_addr = GOODIX_IC_INFO_ADDR_BRA;
+	else
+		ic_addr = GOODIX_IC_INFO_ADDR;
+
+	for (i = 0; i < GOODIX_RETRY_3; i++) {
+		ret = hw_ops->read(cd, ic_addr,
+				   (u8 *)&length, sizeof(length));
+		if (ret) {
+			ts_info("failed get ic info length, %d", ret);
+			usleep_range(5000, 5100);
+			continue;
+		}
+		length = le16_to_cpu(length);
+		if (length >= GOODIX_IC_INFO_MAX_LEN) {
+			ts_info("invalid ic info length %d, retry %d",
+				length, i);
+			continue;
+		}
+
+		ret = hw_ops->read(cd, ic_addr, afe_data, length);
+		if (ret) {
+			ts_info("failed get ic info data, %d", ret);
+			usleep_range(5000, 5100);
+			continue;
+		}
+		/* judge whether the data is valid */
+		if (is_risk_data((const uint8_t *)afe_data, length)) {
+			ts_info("fw info data invalid");
+			usleep_range(5000, 5100);
+			continue;
+		}
+		if (checksum_cmp((const uint8_t *)afe_data,
+					length, CHECKSUM_MODE_U8_LE)) {
+			ts_info("fw info checksum error!");
+			usleep_range(5000, 5100);
+			continue;
+		}
+		break;
+	}
+	if (i == GOODIX_RETRY_3) {
+		ts_err("failed get ic info");
+		return -EINVAL;
+	}
+
+	info_ver = afe_data[3];
+	if (info_ver < 0x80)
+		ret = convert_ic_info(cd, afe_data);
+	else
+		ret = convert_ic_info_v2(cd, afe_data);
+	if (ret) {
+		ts_err("convert ic info encounter error");
+		return ret;
+	}
+
+	/* check some key info */
+	if (!ic_info->misc.cmd_addr || !ic_info->misc.fw_buffer_addr ||
+	    !ic_info->misc.touch_data_addr) {
+		ts_err("cmd_addr fw_buf_addr and touch_data_addr is null");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+#define GOODIX_ESD_TICK_WRITE_DATA	0xAA
+static int brl_esd_check(struct goodix_ts_core *cd)
+{
+	int ret;
+	u32 esd_addr;
+	u8 esd_value;
+
+	if (!cd->ic_info.misc.esd_addr)
+		return 0;
+
+	esd_addr = cd->ic_info.misc.esd_addr;
+	ret = cd->hw_ops->read(cd, esd_addr, &esd_value, 1);
+	if (ret) {
+		ts_err("failed get esd value, %d", ret);
+		return ret;
+	}
+
+	if (esd_value == GOODIX_ESD_TICK_WRITE_DATA) {
+		ts_err("esd check failed, 0x%x", esd_value);
+		return -EINVAL;
+	}
+	esd_value = GOODIX_ESD_TICK_WRITE_DATA;
+	ret = cd->hw_ops->write(cd, esd_addr, &esd_value, 1);
+	if (ret) {
+		ts_err("failed refrash esd value");
+		return ret;
+	}
+	return 0;
+}
+
+#define IRQ_EVENT_HEAD_LEN			8
+#define BYTES_PER_POINT				8
+#define BYTES_STYLUS_LEN			16
+#define IRQ_EVENT_MAX_SIZE 			106
+
+#define GOODIX_TOUCH_EVENT			0x80
+#define GOODIX_REQUEST_EVENT		0x40
+#define GOODIX_GESTURE_EVENT		0x20
+#define GOODIX_FP_EVENT				0x08
+
+static void goodix_parse_finger(struct goodix_touch_data *touch_data,
+				u8 *buf, int id)
+{
+	touch_data->coords[id].status = TS_TOUCH;
+	touch_data->coords[id].x = le16_to_cpup((__le16 *)(buf + 2));
+	touch_data->coords[id].y = le16_to_cpup((__le16 *)(buf + 4));
+	touch_data->coords[id].w = le16_to_cpup((__le16 *)(buf + 6));
+	touch_data->touch_num += 1;
+}
+
+static unsigned int goodix_pen_btn_code[] = {BTN_STYLUS, BTN_STYLUS2};
+static void goodix_parse_pen(struct goodix_pen_data *pen_data,
+				u8 *event_head, u8 *buf)
+{
+	u8 cur_key_map;
+	int16_t x_angle, y_angle;
+	int i;
+
+	pen_data->coords.status = TS_TOUCH;
+	pen_data->coords.x = le16_to_cpup((__le16 *)(buf + 2));
+	pen_data->coords.y = le16_to_cpup((__le16 *)(buf + 4));
+	pen_data->coords.p = le16_to_cpup((__le16 *)(buf + 6));
+	x_angle = le16_to_cpup((__le16 *)(buf + 8));
+	y_angle = le16_to_cpup((__le16 *)(buf + 10));
+	pen_data->coords.tilt_x = x_angle / 100;
+	pen_data->coords.tilt_y = y_angle / 100;
+
+	cur_key_map = (event_head[3] & 0x0F) >> 1;
+	for (i = 0; i < GOODIX_MAX_PEN_KEY; i++) {
+		pen_data->keys[i].code = goodix_pen_btn_code[i];
+		if (!(cur_key_map & (1 << i)))
+			continue;
+		pen_data->keys[i].status = TS_TOUCH;
+	}
+}
+
+// TODO: confirm these touch key code.
+static unsigned int goodix_touch_btn_code[] = {
+	KEY_0, KEY_1, KEY_2, KEY_3, KEY_4,
+	KEY_5, KEY_6, KEY_7, KEY_8, KEY_9
+};
+static void goodix_parse_key(struct goodix_touch_data *touch_data,
+		u8 *buf)
+{
+	int i;
+	u32 key_map = ((buf[3] & 0x03) << 8) | buf[2];
+
+	touch_data->have_key = true;
+	for (i = 0; i < GOODIX_MAX_KEY; i++) {
+		touch_data->keys[i].code = goodix_touch_btn_code[i];
+		if (!(key_map & (1 << i)))
+			continue;
+		touch_data->keys[i].status = TS_TOUCH;
+	}
+	ts_debug("touch key:0x%06x", key_map);
+}
+
+static int goodix_touch_handler(struct goodix_ts_core *cd,
+				struct goodix_ts_event *ts_event,
+				u8 *buffer)
+{
+	struct goodix_touch_data *touch_data = &ts_event->touch_data;
+	struct goodix_pen_data *pen_data = &ts_event->pen_data;
+	int point_struct_len = cd->ic_info.misc.point_struct_len;
+	u8 touch_num = 0;
+	u8 *data;
+	u8 point_type;
+	int checksum_len = 0;
+	int ret = 0;
+	int i;
+	int tid;
+
+	touch_num = buffer[2] & 0x0F;
+
+	if (touch_num > 0) {
+		data = buffer + IRQ_EVENT_HEAD_LEN;
+		for (i = 0; i < touch_num; i++) {
+			point_type = data[0] & 0x0F;
+			tid = (data[0] >> 4) & 0x0F;
+			switch (point_type) {
+			case POINT_TYPE_STYLUS_HOVER:
+				pen_data->is_hover = true;
+			case POINT_TYPE_STYLUS:
+				goodix_parse_pen(pen_data, buffer, data);
+				checksum_len += BYTES_STYLUS_LEN;
+				data += BYTES_STYLUS_LEN;
+				break;
+			case POINT_TYPE_FINGER:
+				goodix_parse_finger(touch_data, data, tid);
+				checksum_len += point_struct_len;
+				data += point_struct_len;
+				break;
+			case POINT_TYPE_KEY:
+				goodix_parse_key(touch_data, data);
+				checksum_len += BYTES_PER_POINT;
+				data += BYTES_PER_POINT;
+				break;
+			case POINT_TYPE_GLOVE:
+				break;
+			default:
+				ts_debug("not support point type:%d", point_type);
+				break;
+			}
+		}
+		ret = checksum_cmp(&buffer[IRQ_EVENT_HEAD_LEN],
+					checksum_len + 2,
+					CHECKSUM_MODE_U8_LE);
+		if (ret) {
+			ts_debug("touch data checksum error");
+			return -EINVAL;
+		}
+	}
+
+	ts_event->fp_flag = buffer[0] & GOODIX_FP_EVENT;
+	return 0;
+}
+
+static int brl_event_handler(struct goodix_ts_core *cd,
+			 struct goodix_ts_event *ts_event)
+{
+	struct goodix_ts_hw_ops *hw_ops = cd->hw_ops;
+	struct goodix_ic_info_misc *misc = &cd->ic_info.misc;
+	u8 pre_buf[IRQ_EVENT_MAX_SIZE];
+	u8 event_status;
+	int ret;
+
+	memset(ts_event, 0, sizeof(*ts_event));
+
+	ret = hw_ops->read(cd, misc->touch_data_addr,
+			   pre_buf, sizeof(pre_buf));
+	if (ret) {
+		ts_debug("failed get event head data");
+		return ret;
+	}
+
+	if (pre_buf[0] == 0x00) {
+		ts_debug("invalid touch head");
+		return -EINVAL;
+	}
+
+	/* read done */
+	hw_ops->after_event_handler(cd);
+
+	if (checksum_cmp(pre_buf, IRQ_EVENT_HEAD_LEN, CHECKSUM_MODE_U8_LE)) {
+		ts_debug("touch head checksum err[%*ph]",
+				IRQ_EVENT_HEAD_LEN, pre_buf);
+		return -EINVAL;
+	}
+
+	event_status = pre_buf[0];
+	if (event_status & GOODIX_TOUCH_EVENT) {
+		ret = goodix_touch_handler(cd, ts_event, pre_buf);
+		if (ret < 0)
+			return ret;
+		ts_event->event_type |= EVENT_TOUCH;
+		if (cd->board_data.pen_enable)
+			ts_event->event_type |= EVENT_PEN;
+	}
+
+	if (event_status & GOODIX_REQUEST_EVENT) {
+		ts_event->event_type |= EVENT_REQUEST;
+		if (pre_buf[2] == BRL_REQUEST_CODE_CONFIG)
+			ts_event->request_code = REQUEST_TYPE_CONFIG;
+		else if (pre_buf[2] == BRL_REQUEST_CODE_RESET)
+			ts_event->request_code = REQUEST_TYPE_RESET;
+		else
+			ts_debug("unsupported request code 0x%x", pre_buf[2]);
+	}
+
+	if (event_status & GOODIX_GESTURE_EVENT) {
+		ts_event->event_type |= EVENT_GESTURE;
+		ts_event->gesture_type = pre_buf[4];
+		memcpy(ts_event->gesture_data, &pre_buf[8],
+				GOODIX_GESTURE_DATA_LEN);
+	}
+
+	return 0;
+}
+
+static int brl_after_event_handler(struct goodix_ts_core *cd)
+{
+	struct goodix_ts_hw_ops *hw_ops = cd->hw_ops;
+	struct goodix_ic_info_misc *misc = &cd->ic_info.misc;
+	u8 sync_clean = 0;
+
+	if (cd->tools_ctrl_sync)
+		return 0;
+	return hw_ops->write(cd, misc->touch_data_addr,
+		&sync_clean, 1);
+}
+
+static int brld_get_framedata(struct goodix_ts_core *cd,
+		struct ts_rawdata_info *info)
+{
+	int ret;
+	unsigned char val;
+	int retry = 20;
+	unsigned char frame_buf[GOODIX_MAX_FRAMEDATA_LEN];
+	unsigned char *cur_ptr;
+	unsigned int flag_addr = cd->ic_info.misc.frame_data_addr;
+	int tx = cd->ic_info.parm.drv_num;
+	int rx = cd->ic_info.parm.sen_num;
+
+	/* clean touch event flag */
+	val = 0;
+	ret = brl_write(cd, flag_addr, &val, 1);
+	if (ret < 0) {
+		ts_err("clean touch event failed, exit!");
+		return ret;
+	}
+
+	while (retry--) {
+		usleep_range(2000, 2100);
+		ret = brl_read(cd, flag_addr, &val, 1);
+		if (!ret && (val & GOODIX_TOUCH_EVENT))
+			break;
+	}
+	if (retry < 0) {
+		ts_err("framedata is not ready val:0x%02x, exit!", val);
+		return -EINVAL;
+	}
+
+	ret = brl_read(cd, flag_addr, frame_buf, GOODIX_MAX_FRAMEDATA_LEN);
+	if (ret < 0) {
+		ts_err("read frame data failed");
+		return ret;
+	}
+
+	if (checksum_cmp(frame_buf, cd->ic_info.misc.frame_data_head_len,
+			CHECKSUM_MODE_U8_LE)) {
+		ts_err("frame head checksum error");
+		return -EINVAL;
+	}
+
+	cur_ptr = frame_buf;
+	cur_ptr += cd->ic_info.misc.frame_data_head_len;
+	cur_ptr += cd->ic_info.misc.fw_attr_len;
+	cur_ptr += cd->ic_info.misc.fw_log_len;
+	memcpy((u8 *)(info->buff + info->used_size), cur_ptr + 8,
+			tx * rx * 2);
+
+	return 0;
+}
+
+static int brld_get_cap_data(struct goodix_ts_core *cd,
+		struct ts_rawdata_info *info)
+{
+	struct goodix_ts_cmd temp_cmd;
+	int tx = cd->ic_info.parm.drv_num;
+	int rx = cd->ic_info.parm.sen_num;
+	int size = tx * rx;
+	int ret;
+
+	/* disable irq & close esd */
+	brl_irq_enbale(cd, false);
+	goodix_ts_blocking_notify(NOTIFY_ESD_OFF, NULL);
+
+	info->buff[0] = rx;
+	info->buff[1] = tx;
+	info->used_size = 2;
+
+	temp_cmd.cmd = 0x90;
+	temp_cmd.data[0] = 0x81;
+	temp_cmd.len = 5;
+	ret = brl_send_cmd(cd, &temp_cmd);
+	if (ret < 0) {
+		ts_err("report rawdata failed, exit!");
+		goto exit;
+	}
+
+	ret = brld_get_framedata(cd, info);
+	if (ret < 0) {
+		ts_err("brld get rawdata failed");
+		goto exit;
+	}
+	goodix_rotate_abcd2cbad(tx, rx, &info->buff[info->used_size]);
+	info->used_size += size;
+
+	temp_cmd.cmd = 0x90;
+	temp_cmd.data[0] = 0x82;
+	temp_cmd.len = 5;
+	ret = brl_send_cmd(cd, &temp_cmd);
+	if (ret < 0) {
+		ts_err("report diffdata failed, exit!");
+		goto exit;
+	}
+
+	ret = brld_get_framedata(cd, info);
+	if (ret < 0) {
+		ts_err("brld get diffdata failed");
+		goto exit;
+	}
+	goodix_rotate_abcd2cbad(tx, rx, &info->buff[info->used_size]);
+	info->used_size += size;
+
+exit:
+	temp_cmd.cmd = 0x90;
+	temp_cmd.data[0] = 0;
+	temp_cmd.len = 5;
+	brl_send_cmd(cd, &temp_cmd);
+	/* enable irq & esd */
+	brl_irq_enbale(cd, true);
+	goodix_ts_blocking_notify(NOTIFY_ESD_ON, NULL);
+	return ret;
+}
+
+#define GOODIX_CMD_RAWDATA	2
+#define GOODIX_CMD_COORD	0
+static int brl_get_capacitance_data(struct goodix_ts_core *cd,
+		struct ts_rawdata_info *info)
+{
+	int ret;
+	int retry = 20;
+	struct goodix_ts_cmd temp_cmd;
+	u32 flag_addr = cd->ic_info.misc.touch_data_addr;
+	u32 raw_addr = cd->ic_info.misc.mutual_rawdata_addr;
+	u32 diff_addr = cd->ic_info.misc.mutual_diffdata_addr;
+	int tx = cd->ic_info.parm.drv_num;
+	int rx = cd->ic_info.parm.sen_num;
+	int size = tx * rx;
+	u8 val;
+
+	if (!info) {
+		ts_err("input null ptr");
+		return -EIO;
+	}
+
+	if (cd->bus->ic_type == IC_TYPE_BERLIN_D ||
+			cd->bus->ic_type == IC_TYPE_NOTTINGHAM)
+		return brld_get_cap_data(cd, info);
+
+	/* disable irq & close esd */
+	brl_irq_enbale(cd, false);
+	goodix_ts_blocking_notify(NOTIFY_ESD_OFF, NULL);
+
+    /* switch rawdata mode */
+	temp_cmd.cmd = GOODIX_CMD_RAWDATA;
+	temp_cmd.len = 4;
+	ret = brl_send_cmd(cd, &temp_cmd);
+	if (ret < 0) {
+		ts_err("switch rawdata mode failed, exit!");
+		goto exit;
+	}
+
+	/* clean touch event flag */
+	val = 0;
+	ret = brl_write(cd, flag_addr, &val, 1);
+	if (ret < 0) {
+		ts_err("clean touch event failed, exit!");
+		goto exit;
+	}
+
+	while (retry--) {
+		usleep_range(5000, 5100);
+		ret = brl_read(cd, flag_addr, &val, 1);
+		if (!ret && (val & GOODIX_TOUCH_EVENT))
+			break;
+	}
+	if (retry < 0) {
+		ts_err("rawdata is not ready val:0x%02x, exit!", val);
+		goto exit;
+	}
+
+	/* obtain rawdata & diff_rawdata */
+	info->buff[0] = rx;
+	info->buff[1] = tx;
+	info->used_size = 2;
+
+	ret = brl_read(cd, raw_addr, (u8 *)&info->buff[info->used_size],
+			size * sizeof(s16));
+	if (ret < 0) {
+		ts_err("obtian raw_data failed, exit!");
+		goto exit;
+	}
+	goodix_rotate_abcd2cbad(tx, rx, &info->buff[info->used_size]);
+	info->used_size += size;
+
+	ret = brl_read(cd, diff_addr, (u8 *)&info->buff[info->used_size],
+			size * sizeof(s16));
+	if (ret < 0) {
+		ts_err("obtian diff_data failed, exit!");
+		goto exit;
+	}
+	goodix_rotate_abcd2cbad(tx, rx, &info->buff[info->used_size]);
+	info->used_size += size;
+
+exit:
+	/* switch coor mode */
+	temp_cmd.cmd = GOODIX_CMD_COORD;
+	temp_cmd.len = 4;
+	brl_send_cmd(cd, &temp_cmd);
+	/* clean touch event flag */
+	val = 0;
+	brl_write(cd, flag_addr, &val, 1);
+	/* enable irq & esd */
+	brl_irq_enbale(cd, true);
+	goodix_ts_blocking_notify(NOTIFY_ESD_ON, NULL);
+	return ret;
+}
+
+static struct goodix_ts_hw_ops brl_hw_ops = {
+	.power_on = brl_power_on,
+	.resume = brl_resume,
+	.suspend = brl_suspend,
+	.gesture = brl_gesture,
+	.reset = brl_reset,
+	.irq_enable = brl_irq_enbale,
+	.read = brl_read,
+	.write = brl_write,
+	.read_flash = brl_flash_read,
+	.send_cmd = brl_send_cmd,
+	.send_config = brl_send_config,
+	.read_config = brl_read_config,
+	.read_version = brl_read_version,
+	.get_ic_info = brl_get_ic_info,
+	.esd_check = brl_esd_check,
+	.event_handler = brl_event_handler,
+	.after_event_handler = brl_after_event_handler,
+	.get_capacitance_data = brl_get_capacitance_data,
+};
+
+struct goodix_ts_hw_ops *goodix_get_hw_ops(void)
+{
+	return &brl_hw_ops;
+}
diff --git a/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_brl_i2c.c b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_brl_i2c.c
new file mode 100644
index 00000000000000..c88afc212efc84
--- /dev/null
+++ b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_brl_i2c.c
@@ -0,0 +1,264 @@
+ /*
+  * Goodix Touchscreen Driver
+  * Copyright (C) 2020 - 2021 Goodix, Inc.
+  *
+  * This program is free software; you can redistribute it and/or modify
+  * it under the terms of the GNU General Public License as published by
+  * the Free Software Foundation; either version 2 of the License, or
+  * (at your option) any later version.
+  *
+  * This program is distributed in the hope that it will be a reference
+  * to you, when you are integrating the GOODiX's CTP IC into your system,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * General Public License for more details.
+  *
+  */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/i2c.h>
+
+#include "goodix_ts_core.h"
+
+#define TS_DRIVER_NAME				"gtx8_i2c"
+#define I2C_MAX_TRANSFER_SIZE		256
+#define GOODIX_BUS_RETRY_TIMES		2
+#define GOODIX_REG_ADDR_SIZE		4
+
+static struct platform_device *goodix_pdev;
+struct goodix_bus_interface goodix_i2c_bus;
+
+static int goodix_i2c_read(struct device *dev, unsigned int reg,
+			 unsigned char *data, unsigned int len)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+	unsigned int transfer_length = 0;
+	unsigned int pos = 0, address = reg;
+	unsigned char get_buf[128], addr_buf[GOODIX_REG_ADDR_SIZE];
+	int retry, r = 0;
+	struct i2c_msg msgs[] = {
+		{
+			.addr = client->addr,
+			.flags = !I2C_M_RD,
+			.buf = &addr_buf[0],
+			.len = GOODIX_REG_ADDR_SIZE,
+		}, {
+			.addr = client->addr,
+			.flags = I2C_M_RD,
+		}
+	};
+
+	if (likely(len < sizeof(get_buf))) {
+		/* code optimize, use stack memory */
+		msgs[1].buf = &get_buf[0];
+	} else {
+		msgs[1].buf = kzalloc(len, GFP_KERNEL);
+		if (msgs[1].buf == NULL)
+			return -ENOMEM;
+	}
+
+	while (pos != len) {
+		if (unlikely(len - pos > I2C_MAX_TRANSFER_SIZE))
+			transfer_length = I2C_MAX_TRANSFER_SIZE;
+		else
+			transfer_length = len - pos;
+
+		msgs[0].buf[0] = (address >> 24) & 0xFF;
+		msgs[0].buf[1] = (address >> 16) & 0xFF;
+		msgs[0].buf[2] = (address >> 8) & 0xFF;
+		msgs[0].buf[3] = address & 0xFF;
+		msgs[1].len = transfer_length;
+
+		for (retry = 0; retry < GOODIX_BUS_RETRY_TIMES; retry++) {
+			if (likely(i2c_transfer(client->adapter,
+						msgs, 2) == 2)) {
+				memcpy(&data[pos], msgs[1].buf,
+				       transfer_length);
+				pos += transfer_length;
+				address += transfer_length;
+				break;
+			}
+			ts_info("I2c read retry[%d]:0x%x", retry + 1, reg);
+			usleep_range(2000, 2100);
+		}
+		if (unlikely(retry == GOODIX_BUS_RETRY_TIMES)) {
+			ts_err("I2c read failed,dev:%02x,reg:%04x,size:%u",
+			       client->addr, reg, len);
+			r = -EAGAIN;
+			goto read_exit;
+		}
+	}
+
+read_exit:
+	if (unlikely(len >= sizeof(get_buf)))
+		kfree(msgs[1].buf);
+	return r;
+}
+
+static int goodix_i2c_write(struct device *dev, unsigned int reg,
+			unsigned char *data, unsigned int len)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+	unsigned int pos = 0, transfer_length = 0;
+	unsigned int address = reg;
+	unsigned char put_buf[128];
+	int retry, r = 0;
+	struct i2c_msg msg = {
+			.addr = client->addr,
+			.flags = !I2C_M_RD,
+	};
+
+	if (likely(len + GOODIX_REG_ADDR_SIZE < sizeof(put_buf))) {
+		/* code optimize,use stack memory*/
+		msg.buf = &put_buf[0];
+	} else {
+		msg.buf = kmalloc(len + GOODIX_REG_ADDR_SIZE, GFP_KERNEL);
+		if (msg.buf == NULL)
+			return -ENOMEM;
+	}
+
+	while (pos != len) {
+		if (unlikely(len - pos > I2C_MAX_TRANSFER_SIZE -
+			     GOODIX_REG_ADDR_SIZE))
+			transfer_length = I2C_MAX_TRANSFER_SIZE -
+			     GOODIX_REG_ADDR_SIZE;
+		else
+			transfer_length = len - pos;
+		msg.buf[0] = (address >> 24) & 0xFF;
+		msg.buf[1] = (address >> 16) & 0xFF;
+		msg.buf[2] = (address >> 8) & 0xFF;
+		msg.buf[3] = address & 0xFF;
+
+		msg.len = transfer_length + GOODIX_REG_ADDR_SIZE;
+		memcpy(&msg.buf[GOODIX_REG_ADDR_SIZE],
+			&data[pos], transfer_length);
+
+		for (retry = 0; retry < GOODIX_BUS_RETRY_TIMES; retry++) {
+			if (likely(i2c_transfer(client->adapter,
+						&msg, 1) == 1)) {
+				pos += transfer_length;
+				address += transfer_length;
+				break;
+			}
+			ts_debug("I2c write retry[%d]", retry + 1);
+			msleep(20);
+		}
+		if (unlikely(retry == GOODIX_BUS_RETRY_TIMES)) {
+			ts_err("I2c write failed,dev:%02x,reg:%04x,size:%u",
+				client->addr, reg, len);
+			r = -EAGAIN;
+			goto write_exit;
+		}
+	}
+
+write_exit:
+	if (likely(len + GOODIX_REG_ADDR_SIZE >= sizeof(put_buf)))
+		kfree(msg.buf);
+	return r;
+}
+
+static void goodix_pdev_release(struct device *dev)
+{
+	ts_info("goodix pdev released");
+	kfree(goodix_pdev);
+}
+
+static int goodix_i2c_probe(struct i2c_client *client,
+	const struct i2c_device_id *dev_id)
+{
+	int ret = 0;
+
+	ts_info("goodix i2c probe in");
+	ret = i2c_check_functionality(client->adapter,
+		I2C_FUNC_I2C);
+	if (!ret)
+		return -EIO;
+
+	/* get ic type */
+	ret = goodix_get_ic_type(client->dev.of_node, &goodix_i2c_bus);
+	if (ret < 0)
+		return ret;
+
+	goodix_i2c_bus.bus_type = GOODIX_BUS_TYPE_I2C;
+	goodix_i2c_bus.dev = &client->dev;
+	goodix_i2c_bus.read = goodix_i2c_read;
+	goodix_i2c_bus.write = goodix_i2c_write;
+	/* ts core device */
+	goodix_pdev = kzalloc(sizeof(struct platform_device), GFP_KERNEL);
+	if (!goodix_pdev)
+		return -ENOMEM;
+
+	goodix_pdev->name = GOODIX_CORE_DRIVER_NAME;
+	goodix_pdev->id = 0;
+	goodix_pdev->num_resources = 0;
+	/*
+	 * you can find this platform dev in
+	 * /sys/devices/platform/goodix_ts.0
+	 * goodix_pdev->dev.parent = &client->dev;
+	 */
+	goodix_pdev->dev.platform_data = &goodix_i2c_bus;
+	goodix_pdev->dev.release = goodix_pdev_release;
+
+	/* register platform device, then the goodix_ts_core
+	 * module will probe the touch device.
+	 */
+	ret = platform_device_register(goodix_pdev);
+	if (ret) {
+		ts_err("failed register goodix platform device, %d", ret);
+		goto err_pdev;
+	}
+	ts_info("i2c probe out");
+	return ret;
+
+err_pdev:
+	kfree(goodix_pdev);
+	goodix_pdev = NULL;
+	ts_info("i2c probe out, %d", ret);
+	return ret;
+}
+
+static int goodix_i2c_remove(struct i2c_client *client)
+{
+	platform_device_unregister(goodix_pdev);
+	return 0;
+}
+
+#ifdef CONFIG_OF
+static const struct of_device_id i2c_matchs[] = {
+	{.compatible = "goodix,brl-a",},
+	{.compatible = "goodix,brl-b",},
+	{.compatible = "goodix,brl-d",},
+	{.compatible = "goodix,nottingham",},
+	{},
+};
+MODULE_DEVICE_TABLE(of, i2c_matchs);
+#endif
+
+static const struct i2c_device_id i2c_id_table[] = {
+	{TS_DRIVER_NAME, 0},
+	{},
+};
+MODULE_DEVICE_TABLE(i2c, i2c_id_table);
+
+static struct i2c_driver goodix_i2c_driver = {
+	.driver = {
+		.name = TS_DRIVER_NAME,
+		//.owner = THIS_MODULE,
+		.of_match_table = of_match_ptr(i2c_matchs),
+	},
+	.probe = goodix_i2c_probe,
+	.remove = goodix_i2c_remove,
+	.id_table = i2c_id_table,
+};
+
+int goodix_i2c_bus_init(void)
+{
+	ts_info("Goodix i2c driver init");
+	return i2c_add_driver(&goodix_i2c_driver);
+}
+
+void goodix_i2c_bus_exit(void)
+{
+	ts_info("Goodix i2c driver exit");
+	i2c_del_driver(&goodix_i2c_driver);
+}
diff --git a/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_brl_spi.c b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_brl_spi.c
new file mode 100644
index 00000000000000..f1a365c22d0642
--- /dev/null
+++ b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_brl_spi.c
@@ -0,0 +1,298 @@
+ /*
+  * Goodix Touchscreen Driver
+  * Copyright (C) 2020 - 2021 Goodix, Inc.
+  *
+  * This program is free software; you can redistribute it and/or modify
+  * it under the terms of the GNU General Public License as published by
+  * the Free Software Foundation; either version 2 of the License, or
+  * (at your option) any later version.
+  *
+  * This program is distributed in the hope that it will be a reference
+  * to you, when you are integrating the GOODiX's CTP IC into your system,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * General Public License for more details.
+  *
+  */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/spi/spi.h>
+
+#include "goodix_ts_core.h"
+#define TS_DRIVER_NAME		"gtx8_spi"
+
+#define SPI_TRANS_PREFIX_LEN    1
+#define REGISTER_WIDTH          4
+#define SPI_READ_DUMMY_LEN      4
+#define SPI_READ_PREFIX_LEN  \
+		(SPI_TRANS_PREFIX_LEN + REGISTER_WIDTH + SPI_READ_DUMMY_LEN)
+#define SPI_WRITE_PREFIX_LEN (SPI_TRANS_PREFIX_LEN + REGISTER_WIDTH)
+
+#define SPI_WRITE_FLAG  0xF0
+#define SPI_READ_FLAG   0xF1
+
+static struct platform_device *goodix_pdev;
+struct goodix_bus_interface goodix_spi_bus;
+
+/**
+ * goodix_spi_read_bra- read device register through spi bus
+ * @dev: pointer to device data
+ * @addr: register address
+ * @data: read buffer
+ * @len: bytes to read
+ * return: 0 - read ok, < 0 - spi transter error
+ */
+static int goodix_spi_read_bra(struct device *dev, unsigned int addr,
+	unsigned char *data, unsigned int len)
+{
+	struct spi_device *spi = to_spi_device(dev);
+	u8 *rx_buf = NULL;
+	u8 *tx_buf = NULL;
+	struct spi_transfer xfers;
+	struct spi_message spi_msg;
+	int ret = 0;
+
+	rx_buf = kzalloc(SPI_READ_PREFIX_LEN + len, GFP_KERNEL);
+	tx_buf = kzalloc(SPI_READ_PREFIX_LEN + len, GFP_KERNEL);
+	if (!rx_buf || !tx_buf) {
+		ts_err("alloc tx/rx_buf failed, size:%d",
+			SPI_READ_PREFIX_LEN + len);
+		return -ENOMEM;
+	}
+
+	spi_message_init(&spi_msg);
+	memset(&xfers, 0, sizeof(xfers));
+
+	/*spi_read tx_buf format: 0xF1 + addr(4bytes) + data*/
+	tx_buf[0] = SPI_READ_FLAG;
+	tx_buf[1] = (addr >> 24) & 0xFF;
+	tx_buf[2] = (addr >> 16) & 0xFF;
+	tx_buf[3] = (addr >> 8) & 0xFF;
+	tx_buf[4] = addr & 0xFF;
+	tx_buf[5] = 0xFF;
+	tx_buf[6] = 0xFF;
+	tx_buf[7] = 0xFF;
+	tx_buf[8] = 0xFF;
+
+	xfers.tx_buf = tx_buf;
+	xfers.rx_buf = rx_buf;
+	xfers.len = SPI_READ_PREFIX_LEN + len;
+	xfers.cs_change = 0;
+	spi_message_add_tail(&xfers, &spi_msg);
+	ret = spi_sync(spi, &spi_msg);
+	if (ret < 0) {
+		ts_err("spi transfer error:%d", ret);
+		goto exit;
+	}
+	memcpy(data, &rx_buf[SPI_READ_PREFIX_LEN], len);
+
+exit:
+	kfree(rx_buf);
+	kfree(tx_buf);
+	return ret;
+}
+
+static int goodix_spi_read(struct device *dev, unsigned int addr,
+	unsigned char *data, unsigned int len)
+{
+	struct spi_device *spi = to_spi_device(dev);
+	u8 *rx_buf = NULL;
+	u8 *tx_buf = NULL;
+	struct spi_transfer xfers;
+	struct spi_message spi_msg;
+	int ret = 0;
+
+	rx_buf = kzalloc(SPI_READ_PREFIX_LEN - 1 + len, GFP_KERNEL);
+	tx_buf = kzalloc(SPI_READ_PREFIX_LEN - 1 + len, GFP_KERNEL);
+	if (!rx_buf || !tx_buf) {
+		ts_err("alloc tx/rx_buf failed, size:%d",
+			SPI_READ_PREFIX_LEN - 1 + len);
+		return -ENOMEM;
+	}
+
+	spi_message_init(&spi_msg);
+	memset(&xfers, 0, sizeof(xfers));
+
+	/*spi_read tx_buf format: 0xF1 + addr(4bytes) + data*/
+	tx_buf[0] = SPI_READ_FLAG;
+	tx_buf[1] = (addr >> 24) & 0xFF;
+	tx_buf[2] = (addr >> 16) & 0xFF;
+	tx_buf[3] = (addr >> 8) & 0xFF;
+	tx_buf[4] = addr & 0xFF;
+	tx_buf[5] = 0xFF;
+	tx_buf[6] = 0xFF;
+	tx_buf[7] = 0xFF;
+
+	xfers.tx_buf = tx_buf;
+	xfers.rx_buf = rx_buf;
+	xfers.len = SPI_READ_PREFIX_LEN - 1 + len;
+	xfers.cs_change = 0;
+	spi_message_add_tail(&xfers, &spi_msg);
+	ret = spi_sync(spi, &spi_msg);
+	if (ret < 0) {
+		ts_err("spi transfer error:%d", ret);
+		goto exit;
+	}
+	memcpy(data, &rx_buf[SPI_READ_PREFIX_LEN - 1], len);
+
+exit:
+	kfree(rx_buf);
+	kfree(tx_buf);
+	return ret;
+}
+
+/**
+ * goodix_spi_write- write device register through spi bus
+ * @dev: pointer to device data
+ * @addr: register address
+ * @data: write buffer
+ * @len: bytes to write
+ * return: 0 - write ok; < 0 - spi transter error.
+ */
+static int goodix_spi_write(struct device *dev, unsigned int addr,
+		unsigned char *data, unsigned int len)
+{
+	struct spi_device *spi = to_spi_device(dev);
+	u8 *tx_buf = NULL;
+	struct spi_transfer xfers;
+	struct spi_message spi_msg;
+	int ret = 0;
+
+	tx_buf = kzalloc(SPI_WRITE_PREFIX_LEN + len, GFP_KERNEL);
+	if (!tx_buf)
+		return -ENOMEM;
+
+	spi_message_init(&spi_msg);
+	memset(&xfers, 0, sizeof(xfers));
+
+	tx_buf[0] = SPI_WRITE_FLAG;
+	tx_buf[1] = (addr >> 24) & 0xFF;
+	tx_buf[2] = (addr >> 16) & 0xFF;
+	tx_buf[3] = (addr >> 8) & 0xFF;
+	tx_buf[4] = addr & 0xFF;
+	memcpy(&tx_buf[SPI_WRITE_PREFIX_LEN], data, len);
+	xfers.tx_buf = tx_buf;
+	xfers.len = SPI_WRITE_PREFIX_LEN + len;
+	xfers.cs_change = 0;
+	spi_message_add_tail(&xfers, &spi_msg);
+	ret = spi_sync(spi, &spi_msg);
+	if (ret < 0)
+		ts_err("spi transfer error:%d", ret);
+
+	kfree(tx_buf);
+	return ret;
+}
+
+static void goodix_pdev_release(struct device *dev)
+{
+	ts_info("goodix pdev released");
+	kfree(goodix_pdev);
+}
+
+static int goodix_spi_probe(struct spi_device *spi)
+{
+	int ret = 0;
+
+	ts_info("goodix spi probe in");
+
+	/* init spi_device */
+	spi->mode            = SPI_MODE_0;
+	spi->bits_per_word   = 8;
+
+	ts_info("spi_info: speed[%d] mode[%d] bits_per_word[%d]",
+			spi->max_speed_hz, spi->mode, spi->bits_per_word);
+	ret = spi_setup(spi);
+	if (ret) {
+		ts_err("failed set spi mode, %d", ret);
+		return ret;
+	}
+
+	/* get ic type */
+	ret = goodix_get_ic_type(spi->dev.of_node, &goodix_spi_bus);
+	if (ret < 0)
+		return ret;
+
+	goodix_spi_bus.bus_type = GOODIX_BUS_TYPE_SPI;
+	goodix_spi_bus.dev = &spi->dev;
+	if (goodix_spi_bus.ic_type == IC_TYPE_BERLIN_A)
+		goodix_spi_bus.read = goodix_spi_read_bra;
+	else
+		goodix_spi_bus.read = goodix_spi_read;
+	goodix_spi_bus.write = goodix_spi_write;
+	/* ts core device */
+	goodix_pdev = kzalloc(sizeof(struct platform_device), GFP_KERNEL);
+	if (!goodix_pdev)
+		return -ENOMEM;
+
+	goodix_pdev->name = GOODIX_CORE_DRIVER_NAME;
+	goodix_pdev->id = 0;
+	goodix_pdev->num_resources = 0;
+	/*
+	 * you can find this platform dev in
+	 * /sys/devices/platfrom/goodix_ts.0
+	 * goodix_pdev->dev.parent = &client->dev;
+	 */
+	goodix_pdev->dev.platform_data = &goodix_spi_bus;
+	goodix_pdev->dev.release = goodix_pdev_release;
+
+	/* register platform device, then the goodix_ts_core
+	 * module will probe the touch deivce.
+	 */
+	ret = platform_device_register(goodix_pdev);
+	if (ret) {
+		ts_err("failed register goodix platform device, %d", ret);
+		goto err_pdev;
+	}
+	ts_info("spi probe out");
+	return 0;
+
+err_pdev:
+	kfree(goodix_pdev);
+	goodix_pdev = NULL;
+	ts_info("spi probe out, %d", ret);
+	return ret;
+}
+
+static int goodix_spi_remove(struct spi_device *spi)
+{
+	platform_device_unregister(goodix_pdev);
+	return 0;
+}
+
+#ifdef CONFIG_OF
+static const struct of_device_id spi_matchs[] = {
+	{.compatible = "goodix,brl-a",},
+	{.compatible = "goodix,brl-b",},
+	{.compatible = "goodix,brl-d",},
+	{.compatible = "goodix,nottingham",},
+	{},
+};
+#endif
+
+static const struct spi_device_id spi_id_table[] = {
+	{TS_DRIVER_NAME, 0},
+	{},
+};
+
+static struct spi_driver goodix_spi_driver = {
+	.driver = {
+		.name = TS_DRIVER_NAME,
+		//.owner = THIS_MODULE,
+		.of_match_table = spi_matchs,
+	},
+	.id_table = spi_id_table,
+	.probe = goodix_spi_probe,
+	.remove = goodix_spi_remove,
+};
+
+int goodix_spi_bus_init(void)
+{
+	ts_info("Goodix spi driver init");
+	return spi_register_driver(&goodix_spi_driver);
+}
+
+void goodix_spi_bus_exit(void)
+{
+	ts_info("Goodix spi driver exit");
+	spi_unregister_driver(&goodix_spi_driver);
+}
diff --git a/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_cfg_bin.c b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_cfg_bin.c
new file mode 100644
index 00000000000000..0e599f1d638144
--- /dev/null
+++ b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_cfg_bin.c
@@ -0,0 +1,357 @@
+ /*
+  * Goodix Touchscreen Driver
+  * Copyright (C) 2020 - 2021 Goodix, Inc.
+  *
+  * This program is free software; you can redistribute it and/or modify
+  * it under the terms of the GNU General Public License as published by
+  * the Free Software Foundation; either version 2 of the License, or
+  * (at your option) any later version.
+  *
+  * This program is distributed in the hope that it will be a reference
+  * to you, when you are integrating the GOODiX's CTP IC into your system,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * General Public License for more details.
+  *
+  */
+#include "goodix_ts_core.h"
+
+#define TS_BIN_VERSION_START_INDEX		5
+#define TS_BIN_VERSION_LEN				4
+#define TS_CFG_BIN_HEAD_RESERVED_LEN	6
+#define TS_CFG_OFFSET_LEN				2
+#define TS_IC_TYPE_NAME_MAX_LEN			15
+#define TS_CFG_BIN_HEAD_LEN \
+		(sizeof(struct goodix_cfg_bin_head) + \
+		TS_CFG_BIN_HEAD_RESERVED_LEN)
+#define TS_PKG_CONST_INFO_LEN \
+		(sizeof(struct goodix_cfg_pkg_const_info))
+#define TS_PKG_REG_INFO_LEN	\
+		(sizeof(struct goodix_cfg_pkg_reg_info))
+#define TS_PKG_HEAD_LEN \
+		(TS_PKG_CONST_INFO_LEN + TS_PKG_REG_INFO_LEN)
+
+/*cfg block definitin*/
+#define TS_CFG_BLOCK_PID_LEN		8
+#define TS_CFG_BLOCK_VID_LEN		8
+#define TS_CFG_BLOCK_FW_MASK_LEN	9
+#define TS_CFG_BLOCK_FW_PATCH_LEN	4
+#define TS_CFG_BLOCK_RESERVED_LEN	9
+
+#define TS_NORMAL_CFG				0x01
+#define TS_HIGH_SENSE_CFG			0x03
+#define TS_RQST_FW_RETRY_TIMES		2
+
+#pragma pack(1)
+struct goodix_cfg_pkg_reg {
+	u16 addr;
+	u8 reserved1;
+	u8 reserved2;
+};
+
+struct goodix_cfg_pkg_const_info {
+	u32 pkg_len;
+	u8 ic_type[TS_IC_TYPE_NAME_MAX_LEN];
+	u8 cfg_type;
+	u8 sensor_id;
+	u8 hw_pid[TS_CFG_BLOCK_PID_LEN];
+	u8 hw_vid[TS_CFG_BLOCK_VID_LEN];
+	u8 fw_mask[TS_CFG_BLOCK_FW_MASK_LEN];
+	u8 fw_patch[TS_CFG_BLOCK_FW_PATCH_LEN];
+	u16 x_res_offset;
+	u16 y_res_offset;
+	u16 trigger_offset;
+};
+
+struct goodix_cfg_pkg_reg_info {
+	struct goodix_cfg_pkg_reg cfg_send_flag;
+	struct goodix_cfg_pkg_reg version_base;
+	struct goodix_cfg_pkg_reg pid;
+	struct goodix_cfg_pkg_reg vid;
+	struct goodix_cfg_pkg_reg sensor_id;
+	struct goodix_cfg_pkg_reg fw_mask;
+	struct goodix_cfg_pkg_reg fw_status;
+	struct goodix_cfg_pkg_reg cfg_addr;
+	struct goodix_cfg_pkg_reg esd;
+	struct goodix_cfg_pkg_reg command;
+	struct goodix_cfg_pkg_reg coor;
+	struct goodix_cfg_pkg_reg gesture;
+	struct goodix_cfg_pkg_reg fw_request;
+	struct goodix_cfg_pkg_reg proximity;
+	u8 reserved[TS_CFG_BLOCK_RESERVED_LEN];
+};
+
+struct goodix_cfg_bin_head {
+	u32 bin_len;
+	u8 checksum;
+	u8 bin_version[TS_BIN_VERSION_LEN];
+	u8 pkg_num;
+};
+
+#pragma pack()
+
+struct goodix_cfg_package {
+	struct goodix_cfg_pkg_const_info cnst_info;
+	struct goodix_cfg_pkg_reg_info reg_info;
+	const u8 *cfg;
+	u32 pkg_len;
+};
+
+struct goodix_cfg_bin {
+	unsigned char *bin_data;
+	unsigned int bin_data_len;
+	struct goodix_cfg_bin_head head;
+	struct goodix_cfg_package *cfg_pkgs;
+};
+
+static int goodix_read_cfg_bin(struct device *dev, const char *cfg_name,
+			struct goodix_cfg_bin *cfg_bin)
+{
+	const struct firmware *firmware = NULL;
+	int ret;
+	int retry = GOODIX_RETRY_3;
+
+	ts_info("cfg_bin_name:%s", cfg_name);
+
+	while (retry--) {
+		ret = request_firmware(&firmware, cfg_name, dev);
+		if (!ret)
+			break;
+		ts_info("get cfg bin retry:[%d]", GOODIX_RETRY_3 - retry);
+		msleep(300);
+	}
+	if (retry < 0) {
+		ts_err("failed get cfg bin[%s] error:%d", cfg_name, ret);
+		return ret;
+	}
+
+	if (firmware->size <= 0) {
+		ts_err("request_firmware, cfg_bin length ERROR,len:%zu",
+		       firmware->size);
+		ret = -EINVAL;
+		goto exit;
+	}
+
+	cfg_bin->bin_data_len = firmware->size;
+	/* allocate memory for cfg_bin->bin_data */
+	cfg_bin->bin_data = kzalloc(cfg_bin->bin_data_len, GFP_KERNEL);
+	if (!cfg_bin->bin_data) {
+		ret = -ENOMEM;
+		goto exit;
+	}
+	memcpy(cfg_bin->bin_data, firmware->data, cfg_bin->bin_data_len);
+
+exit:
+	release_firmware(firmware);
+	return ret;
+}
+
+static int goodix_parse_cfg_bin(struct goodix_cfg_bin *cfg_bin)
+{
+	u16 offset1, offset2;
+	u8 checksum;
+	int i;
+
+	/* copy cfg_bin head info */
+	if (cfg_bin->bin_data_len < sizeof(struct goodix_cfg_bin_head)) {
+		ts_err("Invalid cfg_bin size:%d", cfg_bin->bin_data_len);
+		return -EINVAL;
+	}
+
+	memcpy(&cfg_bin->head, cfg_bin->bin_data,
+	       sizeof(struct goodix_cfg_bin_head));
+	cfg_bin->head.bin_len = le32_to_cpu(cfg_bin->head.bin_len);
+
+	/*check length*/
+	if (cfg_bin->bin_data_len != cfg_bin->head.bin_len) {
+		ts_err("cfg_bin len check failed,%d != %d",
+		       cfg_bin->head.bin_len, cfg_bin->bin_data_len);
+		return -EINVAL;
+	}
+
+	/*check cfg_bin valid*/
+	checksum = 0;
+	for (i = TS_BIN_VERSION_START_INDEX; i < cfg_bin->bin_data_len; i++)
+		checksum += cfg_bin->bin_data[i];
+
+	if (checksum != cfg_bin->head.checksum) {
+		ts_err("cfg_bin checksum check filed 0x%02x != 0x%02x",
+		       cfg_bin->head.checksum, checksum);
+		return -EINVAL;
+	}
+
+	/*allocate memory for cfg packages*/
+	cfg_bin->cfg_pkgs = kzalloc(sizeof(struct goodix_cfg_package) *
+				    cfg_bin->head.pkg_num, GFP_KERNEL);
+	if (!cfg_bin->cfg_pkgs)
+		return -ENOMEM;
+
+	/*get cfg_pkg's info*/
+	for (i = 0; i < cfg_bin->head.pkg_num; i++) {
+		/*get cfg pkg length*/
+		if (i == cfg_bin->head.pkg_num - 1) {
+			offset1 = cfg_bin->bin_data[TS_CFG_BIN_HEAD_LEN +
+					i * TS_CFG_OFFSET_LEN] +
+					(cfg_bin->bin_data[TS_CFG_BIN_HEAD_LEN +
+					i * TS_CFG_OFFSET_LEN + 1] << 8);
+
+			cfg_bin->cfg_pkgs[i].pkg_len =
+					cfg_bin->bin_data_len - offset1;
+		} else {
+			offset1 = cfg_bin->bin_data[TS_CFG_BIN_HEAD_LEN +
+					i * TS_CFG_OFFSET_LEN] +
+					(cfg_bin->bin_data[TS_CFG_BIN_HEAD_LEN +
+					i * TS_CFG_OFFSET_LEN + 1] << 8);
+
+			offset2 = cfg_bin->bin_data[TS_CFG_BIN_HEAD_LEN +
+					i * TS_CFG_OFFSET_LEN + 2] +
+					(cfg_bin->bin_data[TS_CFG_BIN_HEAD_LEN +
+					i * TS_CFG_OFFSET_LEN + 3] << 8);
+
+			if (offset2 <= offset1) {
+				ts_err("offset error,pkg:%d, offset1:%d, offset2:%d",
+						i, offset1, offset2);
+				goto exit;
+			}
+
+			cfg_bin->cfg_pkgs[i].pkg_len = offset2 - offset1;
+		}
+		/*get cfg pkg head*/
+		memcpy(&cfg_bin->cfg_pkgs[i].cnst_info,
+		       &cfg_bin->bin_data[offset1], TS_PKG_CONST_INFO_LEN);
+		memcpy(&cfg_bin->cfg_pkgs[i].reg_info,
+		       &cfg_bin->bin_data[offset1 + TS_PKG_CONST_INFO_LEN],
+		       TS_PKG_REG_INFO_LEN);
+
+		/*get configuration data*/
+		cfg_bin->cfg_pkgs[i].cfg =
+				&cfg_bin->bin_data[offset1 + TS_PKG_HEAD_LEN];
+	}
+
+	/*debug, print pkg information*/
+	ts_info("Driver bin info: ver %s, len %d, pkgs %d",
+			cfg_bin->head.bin_version,
+			cfg_bin->head.bin_len,
+			cfg_bin->head.pkg_num);
+
+	return 0;
+exit:
+	kfree(cfg_bin->cfg_pkgs);
+	return -EINVAL;
+}
+
+static int goodix_get_reg_and_cfg(struct goodix_ts_core *cd, u8 sensor_id,
+			   struct goodix_cfg_bin *cfg_bin)
+{
+	int i;
+	u8 cfg_type;
+	u32 cfg_len;
+	bool match_sensor_id = true;
+	struct goodix_cfg_package *cfg_pkg;
+
+	if (!cfg_bin->head.pkg_num || !cfg_bin->cfg_pkgs) {
+		ts_err("there is none cfg package, pkg_num:%d",
+			cfg_bin->head.pkg_num);
+		return -EINVAL;
+	}
+
+	/* find cfg packages with same sensor_id */
+refind_cfg:
+	for (i = 0; i < cfg_bin->head.pkg_num; i++) {
+		cfg_pkg = &cfg_bin->cfg_pkgs[i];
+		if (match_sensor_id && (sensor_id != cfg_pkg->cnst_info.sensor_id)) {
+			ts_info("pkg:%d, sensor id contrast FAILED, bin %d != %d",
+			       i, cfg_pkg->cnst_info.sensor_id, sensor_id);
+			continue;
+		}
+		cfg_type = cfg_pkg->cnst_info.cfg_type;
+		if (cfg_type >= GOODIX_MAX_CONFIG_GROUP) {
+			ts_err("usupported config type %d",
+				cfg_pkg->cnst_info.cfg_type);
+			goto err_out;
+		}
+
+		cfg_len = cfg_pkg->pkg_len - TS_PKG_CONST_INFO_LEN -
+			    TS_PKG_REG_INFO_LEN;
+		if (cfg_len > GOODIX_CFG_MAX_SIZE) {
+			ts_err("config len exceed limit %d > %d",
+				cfg_len, GOODIX_CFG_MAX_SIZE);
+			goto err_out;
+		}
+		if (cd->ic_configs[cfg_type]) {
+			ts_err("found same type config twice for sensor id %d, skiped",
+				sensor_id);
+			continue;
+		}
+		cd->ic_configs[cfg_type] =
+				kzalloc(sizeof(struct goodix_ic_config),
+				GFP_KERNEL);
+		if (!cd->ic_configs[cfg_type])
+			goto err_out;
+		cd->ic_configs[cfg_type]->len = cfg_len;
+		memcpy(cd->ic_configs[cfg_type]->data, cfg_pkg->cfg, cfg_len);
+		ts_info("get config type %d, len %d, for sensor id %d",
+			cfg_type, cfg_len, sensor_id);
+	}
+
+	if (cd->ic_configs[CONFIG_TYPE_NORMAL] == NULL) {
+		if (match_sensor_id) {
+			ts_info("no cfg match sensor_id[%d], don't match sensor_id, try again.", sensor_id);
+			match_sensor_id = false;
+			goto refind_cfg;
+		} else {
+			ts_err("can't find normal config");
+			goto err_out;
+		}
+	}
+
+	return 0;
+
+err_out:
+	/* parse config enter error, release memory alloced */
+	for (i = 0; i < GOODIX_MAX_CONFIG_GROUP; i++) {
+		kfree(cd->ic_configs[i]);
+		cd->ic_configs[i] = NULL;
+	}
+	return -EINVAL;
+}
+
+static int goodix_get_config_data(struct goodix_ts_core *cd, u8 sensor_id)
+{
+	struct goodix_cfg_bin cfg_bin = {0};
+	char *cfg_name = cd->board_data.cfg_bin_name;
+	int ret;
+
+	/*get cfg_bin from file system*/
+	ret = goodix_read_cfg_bin(&cd->pdev->dev, cfg_name, &cfg_bin);
+	if (ret) {
+		ts_err("failed get valid config bin data");
+		return ret;
+	}
+
+	/*parse cfg bin*/
+	ret = goodix_parse_cfg_bin(&cfg_bin);
+	if (ret) {
+		ts_err("failed parse cfg bin");
+		goto err_out;
+	}
+
+	/*get register address and configuration from cfg bin*/
+	ret = goodix_get_reg_and_cfg(cd, sensor_id, &cfg_bin);
+	if (!ret)
+		ts_info("success get reg and cfg info from cfg bin");
+	else
+		ts_err("failed get cfg and reg info, update fw then retry");
+
+	kfree(cfg_bin.cfg_pkgs);
+err_out:
+	kfree(cfg_bin.bin_data);
+	return ret;
+}
+
+int goodix_get_config_proc(struct goodix_ts_core *cd)
+{
+	return goodix_get_config_data(cd, cd->fw_version.sensor_id);
+}
+
+
diff --git a/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_core.c b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_core.c
new file mode 100644
index 00000000000000..0a1fafcd77f58a
--- /dev/null
+++ b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_core.c
@@ -0,0 +1,2545 @@
+ /*
+  * Goodix Touchscreen Driver
+  * Copyright (C) 2020 - 2021 Goodix, Inc.
+  *
+  * This program is free software; you can redistribute it and/or modify
+  * it under the terms of the GNU General Public License as published by
+  * the Free Software Foundation; either version 2 of the License, or
+  * (at your option) any later version.
+  *
+  * This program is distributed in the hope that it will be a reference
+  * to you, when you are integrating the GOODiX's CTP IC into your system,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * General Public License for more details.
+  *
+  */
+#include <linux/version.h>
+#include <linux/fs.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
+
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 38)
+#include <linux/input/mt.h>
+#define INPUT_TYPE_B_PROTOCOL
+#endif
+
+#include "goodix_ts_core.h"
+/* goodix fb test */
+// #include "../../../video/fbdev/core/fb_firefly.h"
+
+#define GOODIX_DEFAULT_CFG_NAME		"goodix_cfg_group.cfg"
+
+struct goodix_module goodix_modules;
+int core_module_prob_sate = CORE_MODULE_UNPROBED;
+
+#if IS_ENABLED(CONFIG_DRM)
+#include <drm/drm_panel.h>
+struct drm_panel *gdix_active_panel;
+
+int check_dt(struct device_node *np)
+{
+	int i;
+	int count;
+	struct device_node *node;
+	struct drm_panel *panel;
+	
+	count = of_count_phandle_with_args(np, "panel", NULL);
+	if (count <= 0)
+		return 0;
+	
+	for (i = 0; i < count; i++) {
+		node = of_parse_phandle(np, "panel", i);
+		panel = of_drm_find_panel(node);
+		of_node_put(node);
+		if (!IS_ERR(panel)) {
+			gdix_active_panel = panel;
+			ts_err("%s:check_dt success\n",__func__);			
+			return 0;
+		}
+	}
+
+	return -ENODEV;
+}
+
+int check_default_tp(struct device_node *dt, const char *prop)
+{
+	const char *active_tp;
+	const char *compatible;
+	char *start;
+	int ret;
+
+	ret = of_property_read_string(dt->parent, prop, &active_tp);
+	if (ret) {
+		pr_err(" %s:fail to read %s %d\n", __func__, prop, ret);
+		return -ENODEV;
+	}
+
+	ret = of_property_read_string(dt, "compatible", &compatible);
+	if (ret < 0) {
+		pr_err(" %s:fail to read %s %d\n", __func__, "compatible", ret);
+		return -ENODEV;
+	}
+
+	start = strnstr(active_tp, compatible, strlen(active_tp));
+	if (start == NULL) {
+		pr_err(" %s:no match compatible, %s, %s\n",
+			__func__, compatible, active_tp);
+		ret = -ENODEV;
+	}
+
+	ts_err("%s:check_default_tp success!\n",__func__);
+
+	return ret;
+}
+#endif
+
+static int goodix_send_ic_config(struct goodix_ts_core *cd, int type);
+/**
+ * __do_register_ext_module - register external module
+ * to register into touch core modules structure
+ * return 0 on success, otherwise return < 0
+ */
+static int __do_register_ext_module(struct goodix_ext_module *module)
+{
+	struct goodix_ext_module *ext_module, *next;
+	struct list_head *insert_point = &goodix_modules.head;
+
+	/* prority level *must* be set */
+	if (module->priority == EXTMOD_PRIO_RESERVED) {
+		ts_err("Priority of module [%s] needs to be set",
+		       module->name);
+		return -EINVAL;
+	}
+	mutex_lock(&goodix_modules.mutex);
+	/* find insert point for the specified priority */
+	if (!list_empty(&goodix_modules.head)) {
+		list_for_each_entry_safe(ext_module, next,
+					 &goodix_modules.head, list) {
+			if (ext_module == module) {
+				ts_info("Module [%s] already exists",
+					module->name);
+				mutex_unlock(&goodix_modules.mutex);
+				return 0;
+			}
+		}
+
+		/* smaller priority value with higher priority level */
+		list_for_each_entry_safe(ext_module, next,
+					 &goodix_modules.head, list) {
+			if (ext_module->priority >= module->priority) {
+				insert_point = &ext_module->list;
+				break;
+			}
+		}
+	}
+
+	if (module->funcs && module->funcs->init) {
+		if (module->funcs->init(goodix_modules.core_data,
+					module) < 0) {
+			ts_err("Module [%s] init error",
+			       module->name ? module->name : " ");
+			mutex_unlock(&goodix_modules.mutex);
+			return -EFAULT;
+		}
+	}
+
+	list_add(&module->list, insert_point->prev);
+	mutex_unlock(&goodix_modules.mutex);
+
+	return 0;
+}
+
+static void goodix_register_ext_module_work(struct work_struct *work)
+{
+	struct goodix_ext_module *module =
+			container_of(work, struct goodix_ext_module, work);
+
+	ts_info("module register work IN");
+
+	/* driver probe failed */
+	if (core_module_prob_sate != CORE_MODULE_PROB_SUCCESS) {
+		ts_err("Can't register ext_module core error");
+		return;
+	}
+
+	if (__do_register_ext_module(module))
+		ts_err("failed register module: %s", module->name);
+	else
+		ts_info("success register module: %s", module->name);
+}
+
+static void goodix_core_module_init(void)
+{
+	if (goodix_modules.initilized)
+		return;
+	goodix_modules.initilized = true;
+	INIT_LIST_HEAD(&goodix_modules.head);
+	mutex_init(&goodix_modules.mutex);
+}
+
+/**
+ * goodix_register_ext_module - interface for register external module
+ * to the core. This will create a workqueue to finish the real register
+ * work and return immediately. The user need to check the final result
+ * to make sure registe is success or fail.
+ *
+ * @module: pointer to external module to be register
+ * return: 0 ok, <0 failed
+ */
+int goodix_register_ext_module(struct goodix_ext_module *module)
+{
+	if (!module)
+		return -EINVAL;
+
+	ts_info("IN");
+
+	goodix_core_module_init();
+	INIT_WORK(&module->work, goodix_register_ext_module_work);
+	schedule_work(&module->work);
+
+	ts_info("OUT");
+	return 0;
+}
+
+/**
+ * goodix_register_ext_module_no_wait
+ * return: 0 ok, <0 failed
+ */
+int goodix_register_ext_module_no_wait(struct goodix_ext_module *module)
+{
+	if (!module)
+		return -EINVAL;
+
+	ts_info("IN");
+	goodix_core_module_init();
+	/* driver probe failed */
+	if (core_module_prob_sate != CORE_MODULE_PROB_SUCCESS) {
+		ts_err("Can't register ext_module core error");
+		return -EINVAL;
+	}
+	return __do_register_ext_module(module);
+}
+
+/**
+ * goodix_unregister_ext_module - interface for external module
+ * to unregister external modules
+ *
+ * @module: pointer to external module
+ * return: 0 ok, <0 failed
+ */
+int goodix_unregister_ext_module(struct goodix_ext_module *module)
+{
+	struct goodix_ext_module *ext_module, *next;
+	bool found = false;
+
+	if (!module)
+		return -EINVAL;
+
+	if (!goodix_modules.initilized)
+		return -EINVAL;
+
+	if (!goodix_modules.core_data)
+		return -ENODEV;
+
+	mutex_lock(&goodix_modules.mutex);
+	if (!list_empty(&goodix_modules.head)) {
+		list_for_each_entry_safe(ext_module, next,
+					 &goodix_modules.head, list) {
+			if (ext_module == module) {
+				found = true;
+				break;
+			}
+		}
+	} else {
+		mutex_unlock(&goodix_modules.mutex);
+		return 0;
+	}
+
+	if (!found) {
+		ts_debug("Module [%s] never registed",
+				module->name);
+		mutex_unlock(&goodix_modules.mutex);
+		return 0;
+	}
+
+	list_del(&module->list);
+	mutex_unlock(&goodix_modules.mutex);
+
+	if (module->funcs && module->funcs->exit)
+		module->funcs->exit(goodix_modules.core_data, module);
+
+	ts_info("Moudle [%s] unregistered",
+		module->name ? module->name : " ");
+	return 0;
+}
+
+static void goodix_ext_sysfs_release(struct kobject *kobj)
+{
+	ts_info("Kobject released!");
+}
+
+#define to_ext_module(kobj)	container_of(kobj,\
+				struct goodix_ext_module, kobj)
+#define to_ext_attr(attr)	container_of(attr,\
+				struct goodix_ext_attribute, attr)
+
+static ssize_t goodix_ext_sysfs_show(struct kobject *kobj,
+		struct attribute *attr, char *buf)
+{
+	struct goodix_ext_module *module = to_ext_module(kobj);
+	struct goodix_ext_attribute *ext_attr = to_ext_attr(attr);
+
+	if (ext_attr->show)
+		return ext_attr->show(module, buf);
+
+	return -EIO;
+}
+
+static ssize_t goodix_ext_sysfs_store(struct kobject *kobj,
+		struct attribute *attr, const char *buf, size_t count)
+{
+	struct goodix_ext_module *module = to_ext_module(kobj);
+	struct goodix_ext_attribute *ext_attr = to_ext_attr(attr);
+
+	if (ext_attr->store)
+		return ext_attr->store(module, buf, count);
+
+	return -EIO;
+}
+
+static const struct sysfs_ops goodix_ext_ops = {
+	.show = goodix_ext_sysfs_show,
+	.store = goodix_ext_sysfs_store
+};
+
+static struct kobj_type goodix_ext_ktype = {
+	.release = goodix_ext_sysfs_release,
+	.sysfs_ops = &goodix_ext_ops,
+};
+
+struct kobj_type *goodix_get_default_ktype(void)
+{
+	return &goodix_ext_ktype;
+}
+
+struct kobject *goodix_get_default_kobj(void)
+{
+	struct kobject *kobj = NULL;
+
+	if (goodix_modules.core_data &&
+			goodix_modules.core_data->pdev)
+		kobj = &goodix_modules.core_data->pdev->dev.kobj;
+	return kobj;
+}
+
+/* show driver infomation */
+static ssize_t driver_info_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "DriverVersion:%s\n",
+			GOODIX_DRIVER_VERSION);
+}
+
+/* show chip infoamtion */
+static ssize_t chip_info_show(struct device  *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct goodix_ts_core *cd = dev_get_drvdata(dev);
+	struct goodix_ts_hw_ops *hw_ops = cd->hw_ops;
+	struct goodix_fw_version chip_ver;
+	struct goodix_ic_info ic_info;
+	u8 temp_pid[8] = {0};
+	int ret;
+	int cnt = -EINVAL;
+
+	if (hw_ops->read_version) {
+		ret = hw_ops->read_version(cd, &chip_ver);
+		if (!ret) {
+			memcpy(temp_pid, chip_ver.rom_pid,
+					sizeof(chip_ver.rom_pid));
+			cnt = snprintf(&buf[0], PAGE_SIZE,
+				"rom_pid:%s\nrom_vid:%02x%02x%02x\n",
+				temp_pid, chip_ver.rom_vid[0],
+				chip_ver.rom_vid[1], chip_ver.rom_vid[2]);
+			cnt += snprintf(&buf[cnt], PAGE_SIZE,
+				"patch_pid:%s\npatch_vid:%02x%02x%02x%02x\n",
+				chip_ver.patch_pid, chip_ver.patch_vid[0],
+				chip_ver.patch_vid[1], chip_ver.patch_vid[2],
+				chip_ver.patch_vid[3]);
+			cnt += snprintf(&buf[cnt], PAGE_SIZE,
+				"sensorid:%d\n", chip_ver.sensor_id);
+		}
+	}
+
+	if (hw_ops->get_ic_info) {
+		ret = hw_ops->get_ic_info(cd, &ic_info);
+		if (!ret) {
+			cnt += snprintf(&buf[cnt], PAGE_SIZE,
+					"config_id:%x\n",
+					ic_info.version.config_id);
+			cnt += snprintf(&buf[cnt], PAGE_SIZE,
+					"config_version:%x\n",
+					ic_info.version.config_version);
+		}
+	}
+
+	return cnt;
+}
+
+/* reset chip */
+static ssize_t goodix_ts_reset_store(struct device *dev,
+				     struct device_attribute *attr,
+				     const char *buf,
+				     size_t count)
+{
+	struct goodix_ts_core *core_data = dev_get_drvdata(dev);
+	struct goodix_ts_hw_ops *hw_ops = core_data->hw_ops;
+
+	if (!buf || count <= 0)
+		return -EINVAL;
+	if (buf[0] != '0')
+		hw_ops->reset(core_data, GOODIX_NORMAL_RESET_DELAY_MS);
+	return count;
+}
+
+/* read config */
+static ssize_t read_cfg_show(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct goodix_ts_core *core_data = dev_get_drvdata(dev);
+	struct goodix_ts_hw_ops *hw_ops = core_data->hw_ops;
+	int ret;
+	int i;
+	int offset;
+	char *cfg_buf = NULL;
+
+	cfg_buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!cfg_buf)
+		return -ENOMEM;
+
+	if (hw_ops->read_config)
+		ret = hw_ops->read_config(core_data, cfg_buf, PAGE_SIZE);
+	else
+		ret = -EINVAL;
+
+	if (ret > 0) {
+		offset = 0;
+		for (i = 0; i < 200; i++) { // only print 200 bytes
+			offset += snprintf(&buf[offset], PAGE_SIZE - offset,
+					"%02x,", cfg_buf[i]);
+			if ((i + 1) % 20 == 0)
+				buf[offset++] = '\n';
+		}
+	}
+
+	kfree(cfg_buf);
+	if (ret <= 0)
+		return ret;
+
+	return offset;
+}
+
+static u8 ascii2hex(u8 a)
+{
+	s8 value = 0;
+
+	if (a >= '0' && a <= '9')
+		value = a - '0';
+	else if (a >= 'A' && a <= 'F')
+		value = a - 'A' + 0x0A;
+	else if (a >= 'a' && a <= 'f')
+		value = a - 'a' + 0x0A;
+	else
+		value = 0xff;
+
+	return value;
+}
+
+static int goodix_ts_convert_0x_data(const u8 *buf, int buf_size,
+				     u8 *out_buf, int *out_buf_len)
+{
+	int i, m_size = 0;
+	int temp_index = 0;
+	u8 high, low;
+
+	for (i = 0; i < buf_size; i++) {
+		if (buf[i] == 'x' || buf[i] == 'X')
+			m_size++;
+	}
+
+	if (m_size <= 1) {
+		ts_err("cfg file ERROR, valid data count:%d", m_size);
+		return -EINVAL;
+	}
+	*out_buf_len = m_size;
+
+	for (i = 0; i < buf_size; i++) {
+		if (buf[i] != 'x' && buf[i] != 'X')
+			continue;
+
+		if (temp_index >= m_size) {
+			ts_err("exchange cfg data error, overflow, temp_index:%d,m_size:%d",
+					temp_index, m_size);
+			return -EINVAL;
+		}
+		high = ascii2hex(buf[i + 1]);
+		low = ascii2hex(buf[i + 2]);
+		if (high == 0xff || low == 0xff) {
+			ts_err("failed convert: 0x%x, 0x%x",
+				buf[i + 1], buf[i + 2]);
+			return -EINVAL;
+		}
+		out_buf[temp_index++] = (high << 4) + low;
+	}
+	return 0;
+}
+
+/* send config */
+static ssize_t goodix_ts_send_cfg_store(struct device *dev,
+						struct device_attribute *attr,
+						const char *buf, size_t count)
+{
+	struct goodix_ts_core *core_data = dev_get_drvdata(dev);
+	struct goodix_ts_hw_ops *hw_ops = core_data->hw_ops;
+	struct goodix_ic_config *config = NULL;
+	const struct firmware *cfg_img = NULL;
+	int ret;
+
+	if (buf[0] != '1')
+		return -EINVAL;
+
+	hw_ops->irq_enable(core_data, false);
+
+	ret = request_firmware(&cfg_img, GOODIX_DEFAULT_CFG_NAME, dev);
+	if (ret < 0) {
+		ts_err("cfg file [%s] not available,errno:%d",
+			GOODIX_DEFAULT_CFG_NAME, ret);
+		goto exit;
+	} else {
+		ts_info("cfg file [%s] is ready", GOODIX_DEFAULT_CFG_NAME);
+	}
+
+	config = kzalloc(sizeof(*config), GFP_KERNEL);
+	if (!config)
+		goto exit;
+
+	if (goodix_ts_convert_0x_data(cfg_img->data, cfg_img->size,
+			config->data, &config->len)) {
+		ts_err("convert config data FAILED");
+		goto exit;
+	}
+
+	if (hw_ops->send_config) {
+		ret = hw_ops->send_config(core_data, config->data, config->len);
+		if (ret < 0)
+			ts_err("send config failed");
+	}
+
+exit:
+	hw_ops->irq_enable(core_data, true);
+	kfree(config);
+	if (cfg_img)
+		release_firmware(cfg_img);
+
+	return count;
+}
+
+/* reg read/write */
+static u32 rw_addr;
+static u32 rw_len;
+static u8 rw_flag;
+static u8 store_buf[32];
+static u8 show_buf[PAGE_SIZE];
+static ssize_t goodix_ts_reg_rw_show(struct device *dev,
+			struct device_attribute *attr, char *buf)
+{
+	struct goodix_ts_core *core_data = dev_get_drvdata(dev);
+	struct goodix_ts_hw_ops *hw_ops = core_data->hw_ops;
+	int ret;
+
+	if (!rw_addr || !rw_len) {
+		ts_err("address(0x%x) and length(%d) can't be null",
+			rw_addr, rw_len);
+		return -EINVAL;
+	}
+
+	if (rw_flag != 1) {
+		ts_err("invalid rw flag %d, only support [1/2]", rw_flag);
+		return -EINVAL;
+	}
+
+	ret = hw_ops->read(core_data, rw_addr, show_buf, rw_len);
+	if (ret < 0) {
+		ts_err("failed read addr(%x) length(%d)", rw_addr, rw_len);
+		return snprintf(buf, PAGE_SIZE,
+			"failed read addr(%x), len(%d)\n",
+			rw_addr, rw_len);
+	}
+
+	return snprintf(buf, PAGE_SIZE, "0x%x,%d {%*ph}\n",
+		rw_addr, rw_len, rw_len, show_buf);
+}
+
+static ssize_t goodix_ts_reg_rw_store(struct device *dev,
+				      struct device_attribute *attr,
+				      const char *buf, size_t count)
+{
+	struct goodix_ts_core *core_data = dev_get_drvdata(dev);
+	struct goodix_ts_hw_ops *hw_ops = core_data->hw_ops;
+	char *pos = NULL;
+	char *token = NULL;
+	long result = 0;
+	int ret;
+	int i;
+
+	if (!buf || !count) {
+		ts_err("invalid parame");
+		goto err_out;
+	}
+
+	if (buf[0] == 'r') {
+		rw_flag = 1;
+	} else if (buf[0] == 'w') {
+		rw_flag = 2;
+	} else {
+		ts_err("string must start with 'r/w'");
+		goto err_out;
+	}
+
+	/* get addr */
+	pos = (char *)buf;
+	pos += 2;
+	token = strsep(&pos, ":");
+	if (!token) {
+		ts_err("invalid address info");
+		goto err_out;
+	} else {
+		if (kstrtol(token, 16, &result)) {
+			ts_err("failed get addr info");
+			goto err_out;
+		}
+		rw_addr = (u32)result;
+		ts_info("rw addr is 0x%x", rw_addr);
+	}
+
+	/* get length */
+	token = strsep(&pos, ":");
+	if (!token) {
+		ts_err("invalid length info");
+		goto err_out;
+	} else {
+		if (kstrtol(token, 0, &result)) {
+			ts_err("failed get length info");
+			goto err_out;
+		}
+		rw_len = (u32)result;
+		ts_info("rw length info is %d", rw_len);
+		if (rw_len > sizeof(store_buf)) {
+			ts_err("data len > %lu", sizeof(store_buf));
+			goto err_out;
+		}
+	}
+
+	if (rw_flag == 1)
+		return count;
+
+	for (i = 0; i < rw_len; i++) {
+		token = strsep(&pos, ":");
+		if (!token) {
+			ts_err("invalid data info");
+			goto err_out;
+		} else {
+			if (kstrtol(token, 16, &result)) {
+				ts_err("failed get data[%d] info", i);
+				goto err_out;
+			}
+			store_buf[i] = (u8)result;
+			ts_info("get data[%d]=0x%x", i, store_buf[i]);
+		}
+	}
+	ret = hw_ops->write(core_data, rw_addr, store_buf, rw_len);
+	if (ret < 0) {
+		ts_err("failed write addr(%x) data %*ph", rw_addr,
+			rw_len, store_buf);
+		goto err_out;
+	}
+
+	ts_info("%s write to addr (%x) with data %*ph",
+		"success", rw_addr, rw_len, store_buf);
+
+	return count;
+err_out:
+	snprintf(show_buf, PAGE_SIZE, "%s\n",
+		"invalid params, format{r/w:4100:length:[41:21:31]}");
+	return -EINVAL;
+
+}
+
+/* show irq infomation */
+static ssize_t goodix_ts_irq_info_show(struct device *dev,
+				       struct device_attribute *attr,
+				       char *buf)
+{
+	struct goodix_ts_core *core_data = dev_get_drvdata(dev);
+	struct irq_desc *desc;
+	size_t offset = 0;
+	int r;
+
+	r = snprintf(&buf[offset], PAGE_SIZE, "irq:%u\n", core_data->irq);
+	if (r < 0)
+		return -EINVAL;
+
+	offset += r;
+	r = snprintf(&buf[offset], PAGE_SIZE - offset, "state:%s\n",
+		     atomic_read(&core_data->irq_enabled) ?
+		     "enabled" : "disabled");
+	if (r < 0)
+		return -EINVAL;
+
+	desc = irq_to_desc(core_data->irq);
+	offset += r;
+	r = snprintf(&buf[offset], PAGE_SIZE - offset, "disable-depth:%d\n",
+		     desc->depth);
+	if (r < 0)
+		return -EINVAL;
+
+	offset += r;
+	r = snprintf(&buf[offset], PAGE_SIZE - offset, "trigger-count:%zu\n",
+		core_data->irq_trig_cnt);
+	if (r < 0)
+		return -EINVAL;
+
+	offset += r;
+	r = snprintf(&buf[offset], PAGE_SIZE - offset,
+		     "echo 0/1 > irq_info to disable/enable irq\n");
+	if (r < 0)
+		return -EINVAL;
+
+	offset += r;
+	return offset;
+}
+
+/* enable/disable irq */
+static ssize_t goodix_ts_irq_info_store(struct device *dev,
+					struct device_attribute *attr,
+					const char *buf, size_t count)
+{
+	struct goodix_ts_core *core_data = dev_get_drvdata(dev);
+	struct goodix_ts_hw_ops *hw_ops = core_data->hw_ops;
+
+	if (!buf || count <= 0)
+		return -EINVAL;
+
+	if (buf[0] != '0')
+		hw_ops->irq_enable(core_data, true);
+	else
+		hw_ops->irq_enable(core_data, false);
+	return count;
+}
+
+/* show esd status */
+static ssize_t goodix_ts_esd_info_show(struct device *dev,
+				       struct device_attribute *attr,
+				       char *buf)
+{
+	struct goodix_ts_core *core_data = dev_get_drvdata(dev);
+	struct goodix_ts_esd *ts_esd = &core_data->ts_esd;
+	int r = 0;
+
+	r = snprintf(buf, PAGE_SIZE, "state:%s\n",
+		     atomic_read(&ts_esd->esd_on) ?
+		     "enabled" : "disabled");
+
+	return r;
+}
+
+/* enable/disable esd */
+static ssize_t goodix_ts_esd_info_store(struct device *dev,
+					struct device_attribute *attr,
+					const char *buf, size_t count)
+{
+	if (!buf || count <= 0)
+		return -EINVAL;
+
+	if (buf[0] != '0')
+		goodix_ts_blocking_notify(NOTIFY_ESD_ON, NULL);
+	else
+		goodix_ts_blocking_notify(NOTIFY_ESD_OFF, NULL);
+	return count;
+}
+
+/* debug level show */
+static ssize_t goodix_ts_debug_log_show(struct device *dev,
+				       struct device_attribute *attr,
+				       char *buf)
+{
+	int r = 0;
+
+	r = snprintf(buf, PAGE_SIZE, "state:%s\n",
+		    debug_log_flag ?
+		    "enabled" : "disabled");
+
+	return r;
+}
+
+/* debug level store */
+static ssize_t goodix_ts_debug_log_store(struct device *dev,
+					struct device_attribute *attr,
+					const char *buf, size_t count)
+{
+	if (!buf || count <= 0)
+		return -EINVAL;
+
+	if (buf[0] != '0')
+		debug_log_flag = true;
+	else
+		debug_log_flag = false;
+	return count;
+}
+
+/* show die package site and mcu fabs */
+#define DIE_INFO_START_FLASH_ADDR 0x1F300
+static ssize_t die_info_show(struct device  *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct goodix_ts_core *cd = dev_get_drvdata(dev);
+	struct goodix_ts_hw_ops *hw_ops = cd->hw_ops;
+	u8 temp_buf[21];
+	u8 pkg_site;
+	u8 mcu_fab;
+	int ret;
+
+	ret = hw_ops->read_flash(cd, DIE_INFO_START_FLASH_ADDR, temp_buf, sizeof(temp_buf));
+	if (ret < 0) {
+		ts_err("read flash failed");
+		return 0;
+	}
+
+	ts_info("die info:%*ph", (int)sizeof(temp_buf), temp_buf);
+
+	pkg_site = temp_buf[1];
+	mcu_fab = temp_buf[20];
+	ret = snprintf(buf, PAGE_SIZE, "package_id:0x%02X mcu_fab:0x%02X\n", pkg_site, mcu_fab);
+
+	return ret;
+}
+
+static DEVICE_ATTR(driver_info, 0440,
+		driver_info_show, NULL);
+static DEVICE_ATTR(chip_info, 0440,
+		chip_info_show, NULL);
+static DEVICE_ATTR(reset, 0220,
+		NULL, goodix_ts_reset_store);
+static DEVICE_ATTR(send_cfg, 0220,
+		NULL, goodix_ts_send_cfg_store);
+static DEVICE_ATTR(read_cfg, 0440,
+		read_cfg_show, NULL);
+static DEVICE_ATTR(reg_rw, 0664,
+		goodix_ts_reg_rw_show, goodix_ts_reg_rw_store);
+static DEVICE_ATTR(irq_info, 0664,
+		goodix_ts_irq_info_show, goodix_ts_irq_info_store);
+static DEVICE_ATTR(esd_info, 0664,
+		goodix_ts_esd_info_show, goodix_ts_esd_info_store);
+static DEVICE_ATTR(debug_log, 0664,
+		goodix_ts_debug_log_show, goodix_ts_debug_log_store);
+static DEVICE_ATTR(die_info, 0440,
+		die_info_show, NULL);
+
+static struct attribute *sysfs_attrs[] = {
+	&dev_attr_driver_info.attr,
+	&dev_attr_chip_info.attr,
+	&dev_attr_reset.attr,
+	&dev_attr_send_cfg.attr,
+	&dev_attr_read_cfg.attr,
+	&dev_attr_reg_rw.attr,
+	&dev_attr_irq_info.attr,
+	&dev_attr_esd_info.attr,
+	&dev_attr_debug_log.attr,
+	&dev_attr_die_info.attr,
+	NULL,
+};
+
+static const struct attribute_group sysfs_group = {
+	.attrs = sysfs_attrs,
+};
+
+static int goodix_ts_sysfs_init(struct goodix_ts_core *core_data)
+{
+	int ret;
+
+	ret = sysfs_create_group(&core_data->pdev->dev.kobj, &sysfs_group);
+	if (ret) {
+		ts_err("failed create core sysfs group");
+		return ret;
+	}
+
+	return ret;
+}
+
+static void goodix_ts_sysfs_exit(struct goodix_ts_core *core_data)
+{
+	sysfs_remove_group(&core_data->pdev->dev.kobj, &sysfs_group);
+}
+
+/* prosfs create */
+static int rawdata_proc_show(struct seq_file *m, void *v)
+{
+	struct ts_rawdata_info *info;
+	struct goodix_ts_core *cd;
+	int tx;
+	int rx;
+	int ret;
+	int i;
+	int index;
+
+	if (!m || !v)
+		return -EIO;
+
+	cd = m->private;
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return -ENOMEM;
+
+	ret = cd->hw_ops->get_capacitance_data(cd, info);
+	if (ret < 0) {
+		ts_err("failed to get_capacitance_data, exit!");
+		goto exit;
+	}
+
+	rx = info->buff[0];
+	tx = info->buff[1];
+	seq_printf(m, "TX:%d  RX:%d\n", tx, rx);
+	seq_puts(m, "mutual_rawdata:\n");
+	index = 2;
+	for (i = 0; i < tx * rx; i++) {
+		seq_printf(m, "%5d,", info->buff[index + i]);
+		if ((i + 1) % tx == 0)
+			seq_puts(m, "\n");
+	}
+	seq_puts(m, "mutual_diffdata:\n");
+	index += tx * rx;
+	for (i = 0; i < tx * rx; i++) {
+		seq_printf(m, "%3d,", info->buff[index + i]);
+		if ((i + 1) % tx == 0)
+			seq_puts(m, "\n");
+	}
+
+exit:
+	kfree(info);
+	return ret;
+}
+
+static int rawdata_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open_size(file, rawdata_proc_show,
+			PDE_DATA(inode), PAGE_SIZE * 10);
+}
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0))
+static const struct proc_ops rawdata_proc_fops = {
+	.proc_open = rawdata_proc_open,
+	.proc_read = seq_read,
+	.proc_lseek = seq_lseek,
+	.proc_release = single_release,
+};
+#else
+static const struct file_operations rawdata_proc_fops = {
+	.open = rawdata_proc_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+#endif
+
+static void goodix_ts_procfs_init(struct goodix_ts_core *core_data)
+{
+	struct proc_dir_entry *proc_entry;
+
+	if (!proc_mkdir("goodix_ts", NULL))
+		return;
+	proc_entry = proc_create_data("goodix_ts/tp_capacitance_data",
+			0664, NULL, &rawdata_proc_fops, core_data);
+	if (!proc_entry)
+		ts_err("failed to create proc entry");
+}
+
+static void goodix_ts_procfs_exit(struct goodix_ts_core *core_data)
+{
+	remove_proc_entry("goodix_ts/tp_capacitance_data", NULL);
+	remove_proc_entry("goodix_ts", NULL);
+}
+
+/* event notifier */
+static BLOCKING_NOTIFIER_HEAD(ts_notifier_list);
+/**
+ * goodix_ts_register_client - register a client notifier
+ * @nb: notifier block to callback on events
+ *  see enum ts_notify_event in goodix_ts_core.h
+ */
+int goodix_ts_register_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_register(&ts_notifier_list, nb);
+}
+
+/**
+ * goodix_ts_unregister_client - unregister a client notifier
+ * @nb: notifier block to callback on events
+ *	see enum ts_notify_event in goodix_ts_core.h
+ */
+int goodix_ts_unregister_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_unregister(&ts_notifier_list, nb);
+}
+
+/**
+ * fb_notifier_call_chain - notify clients of fb_events
+ *	see enum ts_notify_event in goodix_ts_core.h
+ */
+int goodix_ts_blocking_notify(enum ts_notify_event evt, void *v)
+{
+	int ret;
+
+	ret = blocking_notifier_call_chain(&ts_notifier_list,
+			(unsigned long)evt, v);
+	return ret;
+}
+
+#if IS_ENABLED(CONFIG_OF)
+/**
+ * goodix_parse_dt_resolution - parse resolution from dt
+ * @node: devicetree node
+ * @board_data: pointer to board data structure
+ * return: 0 - no error, <0 error
+ */
+static int goodix_parse_dt_resolution(struct device_node *node,
+		struct goodix_ts_board_data *board_data)
+{
+	int ret;
+
+	ret = of_property_read_u32(node, "goodix,panel-max-x",
+				 &board_data->panel_max_x);
+	if (ret) {
+		ts_err("failed get panel-max-x");
+		return ret;
+	}
+
+	ret = of_property_read_u32(node, "goodix,panel-max-y",
+				 &board_data->panel_max_y);
+	if (ret) {
+		ts_err("failed get panel-max-y");
+		return ret;
+	}
+
+	ret = of_property_read_u32(node, "goodix,panel-max-w",
+				 &board_data->panel_max_w);
+	if (ret) {
+		ts_err("failed get panel-max-w");
+		return ret;
+	}
+
+	ret = of_property_read_u32(node, "goodix,panel-max-p",
+				 &board_data->panel_max_p);
+	if (ret) {
+		ts_err("failed get panel-max-p, use default");
+		board_data->panel_max_p = GOODIX_PEN_MAX_PRESSURE;
+	}
+
+	return 0;
+}
+
+/**
+ * goodix_parse_dt - parse board data from dt
+ * @dev: pointer to device
+ * @board_data: pointer to board data structure
+ * return: 0 - no error, <0 error
+ */
+static int goodix_parse_dt(struct device_node *node,
+	struct goodix_ts_board_data *board_data)
+{
+	const char *name_tmp;
+	int r;
+
+	if (!board_data) {
+		ts_err("invalid board data");
+		return -EINVAL;
+	}
+
+	r = of_get_named_gpio(node, "goodix,avdd-gpio", 0);
+	if (r < 0) {
+		ts_info("can't find avdd-gpio, use other power supply");
+		board_data->avdd_gpio = 0;
+	} else {
+		ts_info("get avdd-gpio[%d] from dt", r);
+		board_data->avdd_gpio = r;
+	}
+
+	r = of_get_named_gpio(node, "goodix,iovdd-gpio", 0);
+	if (r < 0) {
+		ts_info("can't find iovdd-gpio, use other power supply");
+		board_data->iovdd_gpio = 0;
+	} else {
+		ts_info("get iovdd-gpio[%d] from dt", r);
+		board_data->iovdd_gpio = r;
+	}
+
+	r = of_get_named_gpio(node, "goodix,reset-gpio", 0);
+	if (r < 0) {
+		ts_err("invalid reset-gpio in dt: %d", r);
+		return -EINVAL;
+	}
+	ts_info("get reset-gpio[%d] from dt", r);
+	board_data->reset_gpio = r;
+
+	r = of_get_named_gpio(node, "goodix,irq-gpio", 0);
+	if (r < 0) {
+		ts_err("invalid irq-gpio in dt: %d", r);
+		return -EINVAL;
+	}
+	ts_info("get irq-gpio[%d] from dt", r);
+	board_data->irq_gpio = r;
+
+	r = of_property_read_u32(node, "goodix,irq-flags",
+			&board_data->irq_flags);
+	if (r) {
+		ts_err("invalid irq-flags");
+		return -EINVAL;
+	}
+
+	memset(board_data->avdd_name, 0, sizeof(board_data->avdd_name));
+	r = of_property_read_string(node, "goodix,avdd-name", &name_tmp);
+	if (!r) {
+		ts_info("avdd name from dt: %s", name_tmp);
+		if (strlen(name_tmp) < sizeof(board_data->avdd_name))
+			strncpy(board_data->avdd_name,
+				name_tmp, sizeof(board_data->avdd_name));
+		else
+			ts_info("invalied avdd name length: %ld > %ld",
+				strlen(name_tmp),
+				sizeof(board_data->avdd_name));
+	}
+
+	memset(board_data->iovdd_name, 0, sizeof(board_data->iovdd_name));
+	r = of_property_read_string(node, "goodix,iovdd-name", &name_tmp);
+	if (!r) {
+		ts_info("iovdd name from dt: %s", name_tmp);
+		if (strlen(name_tmp) < sizeof(board_data->iovdd_name))
+			strncpy(board_data->iovdd_name,
+				name_tmp, sizeof(board_data->iovdd_name));
+		else
+			ts_info("invalied iovdd name length: %ld > %ld",
+				strlen(name_tmp),
+				sizeof(board_data->iovdd_name));
+	}
+
+	/* get firmware file name */
+	r = of_property_read_string(node, "goodix,firmware-name", &name_tmp);
+	if (!r) {
+		ts_info("firmware name from dt: %s", name_tmp);
+		strncpy(board_data->fw_name,
+				name_tmp, sizeof(board_data->fw_name));
+	} else {
+		ts_info("can't find firmware name, use default: %s",
+				TS_DEFAULT_FIRMWARE);
+		strncpy(board_data->fw_name,
+				TS_DEFAULT_FIRMWARE,
+				sizeof(board_data->fw_name));
+	}
+
+	/* get config file name */
+	r = of_property_read_string(node, "goodix,config-name", &name_tmp);
+	if (!r) {
+		ts_info("config name from dt: %s", name_tmp);
+		strncpy(board_data->cfg_bin_name, name_tmp,
+				sizeof(board_data->cfg_bin_name));
+	} else {
+		ts_info("can't find config name, use default: %s",
+				TS_DEFAULT_CFG_BIN);
+		strncpy(board_data->cfg_bin_name,
+				TS_DEFAULT_CFG_BIN,
+				sizeof(board_data->cfg_bin_name));
+	}
+
+	/* get xyz resolutions */
+	r = goodix_parse_dt_resolution(node, board_data);
+	if (r) {
+		ts_err("Failed to parse resolutions:%d", r);
+		return r;
+	}
+
+	/* get sleep mode flag */
+	board_data->sleep_enable = of_property_read_bool(node,
+			"goodix,sleep-enable");
+
+	/*get pen-enable switch and pen keys, must after "key map"*/
+	board_data->pen_enable = of_property_read_bool(node,
+			"goodix,pen-enable");
+
+	ts_info("[DT]x:%d, y:%d, w:%d, p:%d sleep_enable:%d pen_enable:%d",
+		board_data->panel_max_x, board_data->panel_max_y,
+		board_data->panel_max_w, board_data->panel_max_p,
+		board_data->sleep_enable, board_data->pen_enable);
+	return 0;
+}
+#endif
+
+static void goodix_ts_report_pen(struct input_dev *dev,
+		struct goodix_pen_data *pen_data)
+{
+	int i;
+
+	mutex_lock(&dev->mutex);
+
+	if (pen_data->coords.status == TS_TOUCH) {
+		input_report_key(dev, BTN_TOUCH, pen_data->is_hover ? 0 : 1);
+		input_report_key(dev, BTN_TOOL_PEN, 1);
+		input_report_abs(dev, ABS_X, pen_data->coords.x);
+		input_report_abs(dev, ABS_Y, pen_data->coords.y);
+		input_report_abs(dev, ABS_PRESSURE, pen_data->coords.p);
+		if (pen_data->coords.p == 0)
+			input_report_abs(dev, ABS_DISTANCE, 1);
+		else
+			input_report_abs(dev, ABS_DISTANCE, 0);
+		input_report_abs(dev, ABS_TILT_X, pen_data->coords.tilt_x);
+		input_report_abs(dev, ABS_TILT_Y, pen_data->coords.tilt_y);
+		ts_debug("pen_data:x %d, y %d, p %d, tilt_x %d tilt_y %d key[%d %d]",
+				pen_data->coords.x, pen_data->coords.y,
+				pen_data->coords.p, pen_data->coords.tilt_x,
+				pen_data->coords.tilt_y,
+				pen_data->keys[0].status == TS_TOUCH ? 1 : 0,
+				pen_data->keys[1].status == TS_TOUCH ? 1 : 0);
+	} else {
+		input_report_key(dev, BTN_TOUCH, 0);
+		input_report_key(dev, BTN_TOOL_PEN, 0);
+	}
+	/* report pen button */
+	for (i = 0; i < GOODIX_MAX_PEN_KEY; i++) {
+		if (pen_data->keys[i].status == TS_TOUCH)
+			input_report_key(dev, pen_data->keys[i].code, 1);
+		else
+			input_report_key(dev, pen_data->keys[i].code, 0);
+	}
+
+	input_sync(dev);
+	mutex_unlock(&dev->mutex);
+}
+
+static void goodix_ts_report_finger(struct input_dev *dev,
+		struct goodix_touch_data *touch_data)
+{
+	unsigned int touch_num = touch_data->touch_num;
+	int i;
+
+	mutex_lock(&dev->mutex);
+
+	for (i = 0; i < GOODIX_MAX_TOUCH; i++) {
+		if (touch_data->coords[i].status == TS_TOUCH) {
+			ts_debug("report: id[%d], x %d, y %d, w %d", i,
+				touch_data->coords[i].x,
+				touch_data->coords[i].y,
+				touch_data->coords[i].w);
+			input_mt_slot(dev, i);
+			input_mt_report_slot_state(dev, MT_TOOL_FINGER, true);
+			input_report_abs(dev, ABS_MT_POSITION_X,
+					touch_data->coords[i].x);
+			input_report_abs(dev, ABS_MT_POSITION_Y,
+					touch_data->coords[i].y);
+			input_report_abs(dev, ABS_MT_TOUCH_MAJOR,
+					touch_data->coords[i].w);
+		} else {
+			input_mt_slot(dev, i);
+			input_mt_report_slot_state(dev, MT_TOOL_FINGER, false);
+		}
+	}
+
+	if (touch_data->have_key) {
+		for (i = 0; i < GOODIX_MAX_KEY; i++) {
+			if (touch_data->keys[i].status == TS_TOUCH)
+				input_report_key(dev, touch_data->keys[i].code, 1);
+			else
+				input_report_key(dev, touch_data->keys[i].code, 0);
+		}
+	}
+
+	input_report_key(dev, BTN_TOUCH, touch_num > 0 ? 1 : 0);
+	input_sync(dev);
+
+	mutex_unlock(&dev->mutex);
+}
+
+static int goodix_ts_request_handle(struct goodix_ts_core *cd,
+	struct goodix_ts_event *ts_event)
+{
+	struct goodix_ts_hw_ops *hw_ops = cd->hw_ops;
+	int ret = -1;
+
+	if (ts_event->request_code == REQUEST_TYPE_CONFIG)
+		ret = goodix_send_ic_config(cd, CONFIG_TYPE_NORMAL);
+	else if (ts_event->request_code == REQUEST_TYPE_RESET)
+		ret = hw_ops->reset(cd, GOODIX_NORMAL_RESET_DELAY_MS);
+	else
+		ts_info("can not handle request type 0x%x",
+			  ts_event->request_code);
+	if (ret)
+		ts_err("failed handle request 0x%x",
+			 ts_event->request_code);
+	else
+		ts_info("success handle ic request 0x%x",
+			  ts_event->request_code);
+	return ret;
+}
+
+/**
+ * goodix_ts_threadirq_func - Bottom half of interrupt
+ * This functions is excuted in thread context,
+ * sleep in this function is permit.
+ *
+ * @data: pointer to touch core data
+ * return: 0 ok, <0 failed
+ */
+static irqreturn_t goodix_ts_threadirq_func(int irq, void *data)
+{
+	struct goodix_ts_core *core_data = data;
+	struct goodix_ts_hw_ops *hw_ops = core_data->hw_ops;
+	struct goodix_ext_module *ext_module, *next;
+	struct goodix_ts_event *ts_event = &core_data->ts_event;
+	struct goodix_ts_esd *ts_esd = &core_data->ts_esd;
+	int ret;
+
+	disable_irq_nosync(core_data->irq);
+
+	ts_esd->irq_status = true;
+	core_data->irq_trig_cnt++;
+	/* inform external module */
+	mutex_lock(&goodix_modules.mutex);
+	list_for_each_entry_safe(ext_module, next,
+				 &goodix_modules.head, list) {
+		if (!ext_module->funcs->irq_event)
+			continue;
+		ret = ext_module->funcs->irq_event(core_data, ext_module);
+		if (ret == EVT_CANCEL_IRQEVT) {
+			mutex_unlock(&goodix_modules.mutex);
+			enable_irq(core_data->irq);
+			return IRQ_HANDLED;
+		}
+	}
+	mutex_unlock(&goodix_modules.mutex);
+
+	/* read touch data from touch device */
+	ret = hw_ops->event_handler(core_data, ts_event);
+	if (likely(!ret)) {
+		if (ts_event->event_type & EVENT_TOUCH) {
+			/* report touch */
+			goodix_ts_report_finger(core_data->input_dev,
+					&ts_event->touch_data);
+		}
+		if (core_data->board_data.pen_enable &&
+				ts_event->event_type & EVENT_PEN) {
+			goodix_ts_report_pen(core_data->pen_dev,
+					&ts_event->pen_data);
+		}
+		if (ts_event->event_type & EVENT_REQUEST)
+			goodix_ts_request_handle(core_data, ts_event);
+	}
+
+	enable_irq(core_data->irq);
+	return IRQ_HANDLED;
+}
+
+/**
+ * goodix_ts_init_irq - Requset interrput line from system
+ * @core_data: pointer to touch core data
+ * return: 0 ok, <0 failed
+ */
+static int goodix_ts_irq_setup(struct goodix_ts_core *core_data)
+{
+	const struct goodix_ts_board_data *ts_bdata = board_data(core_data);
+	int ret;
+
+	/* if ts_bdata-> irq is invalid */
+	core_data->irq = gpio_to_irq(ts_bdata->irq_gpio);
+	if (core_data->irq < 0) {
+		ts_err("failed get irq num %d", core_data->irq);
+		return -EINVAL;
+	}
+
+	ts_info("IRQ:%u,flags:%d", core_data->irq, (int)ts_bdata->irq_flags);
+	ret = devm_request_threaded_irq(&core_data->pdev->dev,
+				      core_data->irq, NULL,
+				      goodix_ts_threadirq_func,
+				      ts_bdata->irq_flags | IRQF_ONESHOT,
+				      GOODIX_CORE_DRIVER_NAME,
+				      core_data);
+	if (ret < 0)
+		ts_err("Failed to requeset threaded irq:%d", ret);
+	else
+		atomic_set(&core_data->irq_enabled, 1);
+
+	return ret;
+}
+
+/**
+ * goodix_ts_power_init - Get regulator for touch device
+ * @core_data: pointer to touch core data
+ * return: 0 ok, <0 failed
+ */
+static int goodix_ts_power_init(struct goodix_ts_core *core_data)
+{
+	struct goodix_ts_board_data *ts_bdata = board_data(core_data);
+	struct device *dev = core_data->bus->dev;
+	int ret = 0;
+
+	ts_err("Power init");
+	if (strlen(ts_bdata->avdd_name)) {
+		core_data->avdd = devm_regulator_get(dev,
+				 ts_bdata->avdd_name);
+		if (IS_ERR_OR_NULL(core_data->avdd)) {
+			ret = PTR_ERR(core_data->avdd);
+			ts_err("Failed to get regulator avdd:%d", ret);
+			core_data->avdd = NULL;
+			return ret;
+		}
+	} else {
+		ts_err("Avdd name is NULL");
+	}
+
+	if (strlen(ts_bdata->iovdd_name)) {
+		core_data->iovdd = devm_regulator_get(dev,
+				 ts_bdata->iovdd_name);
+		if (IS_ERR_OR_NULL(core_data->iovdd)) {
+			ret = PTR_ERR(core_data->iovdd);
+			ts_err("Failed to get regulator iovdd:%d", ret);
+			core_data->iovdd = NULL;
+		}
+	} else {
+		ts_err("iovdd name is NULL");
+	}
+
+	return ret;
+}
+
+/**
+ * goodix_ts_power_on - Turn on power to the touch device
+ * @core_data: pointer to touch core data
+ * return: 0 ok, <0 failed
+ */
+int goodix_ts_power_on(struct goodix_ts_core *cd)
+{
+	int ret = 0;
+
+	ts_info("Device power on");
+	if (cd->power_on)
+		return 0;
+
+	ret = cd->hw_ops->power_on(cd, true);
+	if (!ret)
+		cd->power_on = 1;
+	else
+		ts_err("failed power on, %d", ret);
+	return ret;
+}
+
+/**
+ * goodix_ts_power_off - Turn off power to the touch device
+ * @core_data: pointer to touch core data
+ * return: 0 ok, <0 failed
+ */
+int goodix_ts_power_off(struct goodix_ts_core *cd)
+{
+	int ret;
+
+	ts_info("Device power off");
+	if (!cd->power_on)
+		return 0;
+
+	ret = cd->hw_ops->power_on(cd, false);
+	if (!ret)
+		cd->power_on = 0;
+	else
+		ts_err("failed power off, %d", ret);
+
+	return ret;
+}
+
+/**
+ * goodix_ts_gpio_setup - Request gpio resources from GPIO subsysten
+ * @core_data: pointer to touch core data
+ * return: 0 ok, <0 failed
+ */
+static int goodix_ts_gpio_setup(struct goodix_ts_core *core_data)
+{
+	struct goodix_ts_board_data *ts_bdata = board_data(core_data);
+	int r = 0;
+
+	ts_info("GPIO setup,reset-gpio:%d, irq-gpio:%d",
+		ts_bdata->reset_gpio, ts_bdata->irq_gpio);
+	/*
+	 * after kenerl3.13, gpio_ api is deprecated, new
+	 * driver should use gpiod_ api.
+	 */
+	r = devm_gpio_request_one(&core_data->pdev->dev,
+			ts_bdata->reset_gpio,
+			GPIOF_OUT_INIT_LOW, "ts_reset_gpio");
+	if (r < 0) {
+		ts_err("Failed to request reset gpio, r:%d", r);
+		return r;
+	}
+
+	r = devm_gpio_request_one(&core_data->pdev->dev,
+			ts_bdata->irq_gpio,
+			GPIOF_IN, "ts_irq_gpio");
+	if (r < 0) {
+		ts_err("Failed to request irq gpio, r:%d", r);
+		return r;
+	}
+
+	if (ts_bdata->avdd_gpio > 0) {
+		r = devm_gpio_request_one(&core_data->pdev->dev,
+				ts_bdata->avdd_gpio,
+				GPIOF_OUT_INIT_LOW, "ts_avdd_gpio");
+		if (r < 0) {
+			ts_err("Failed to request avdd-gpio, r:%d", r);
+			return r;
+		}
+	}
+
+	if (ts_bdata->iovdd_gpio > 0) {
+		r = devm_gpio_request_one(&core_data->pdev->dev,
+				ts_bdata->iovdd_gpio,
+				GPIOF_OUT_INIT_LOW, "ts_iovdd_gpio");
+		if (r < 0) {
+			ts_err("Failed to request iovdd-gpio, r:%d", r);
+			return r;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * goodix_ts_input_dev_config - Requset and config a input device
+ *  then register it to input sybsystem.
+ * @core_data: pointer to touch core data
+ * return: 0 ok, <0 failed
+ */
+static int goodix_ts_input_dev_config(struct goodix_ts_core *core_data)
+{
+	struct goodix_ts_board_data *ts_bdata = board_data(core_data);
+	struct input_dev *input_dev = NULL;
+	static char ts_phys[32];
+	int r;
+
+	input_dev = input_allocate_device();
+	if (!input_dev) {
+		ts_err("Failed to allocated input device");
+		return -ENOMEM;
+	}
+
+	core_data->input_dev = input_dev;
+	input_set_drvdata(input_dev, core_data);
+
+	input_dev->name = GOODIX_CORE_DRIVER_NAME;
+	sprintf(ts_phys, "%s/input0", input_dev->name);
+	input_dev->phys = ts_phys;
+	input_dev->id.bustype = core_data->bus->bus_type;
+	input_dev->id.vendor = 0x27C6;
+	input_dev->id.product = 0x0001;
+	input_dev->id.version = 0x0100;
+
+	set_bit(EV_SYN, input_dev->evbit);
+	set_bit(EV_KEY, input_dev->evbit);
+	set_bit(EV_ABS, input_dev->evbit);
+	set_bit(BTN_TOUCH, input_dev->keybit);
+	set_bit(BTN_TOOL_FINGER, input_dev->keybit);
+	set_bit(INPUT_PROP_DIRECT, input_dev->propbit);
+
+	/* set input parameters */
+	input_set_abs_params(input_dev, ABS_MT_POSITION_X,
+			     0, ts_bdata->panel_max_x, 0, 0);
+	input_set_abs_params(input_dev, ABS_MT_POSITION_Y,
+			     0, ts_bdata->panel_max_y, 0, 0);
+	input_set_abs_params(input_dev, ABS_MT_TOUCH_MAJOR,
+			     0, ts_bdata->panel_max_w, 0, 0);
+#ifdef INPUT_TYPE_B_PROTOCOL
+#if LINUX_VERSION_CODE > KERNEL_VERSION(3, 7, 0)
+	input_mt_init_slots(input_dev, GOODIX_MAX_TOUCH,
+			    INPUT_MT_DIRECT);
+#else
+	input_mt_init_slots(input_dev, GOODIX_MAX_TOUCH);
+#endif
+#endif
+
+	input_set_capability(input_dev, EV_KEY, KEY_POWER);
+	input_set_capability(input_dev, EV_KEY, KEY_WAKEUP);
+	input_set_capability(input_dev, EV_KEY, KEY_GOTO);
+
+	r = input_register_device(input_dev);
+	if (r < 0) {
+		ts_err("Unable to register input device");
+		input_free_device(input_dev);
+		return r;
+	}
+
+	return 0;
+}
+
+static int goodix_ts_pen_dev_config(struct goodix_ts_core *core_data)
+{
+	struct goodix_ts_board_data *ts_bdata = board_data(core_data);
+	struct input_dev *pen_dev = NULL;
+	static char ts_phys[32];
+	int r;
+
+	pen_dev = input_allocate_device();
+	if (!pen_dev) {
+		ts_err("Failed to allocated pen device");
+		return -ENOMEM;
+	}
+
+	core_data->pen_dev = pen_dev;
+	input_set_drvdata(pen_dev, core_data);
+
+	pen_dev->name = GOODIX_PEN_DRIVER_NAME;
+	sprintf(ts_phys, "%s/input0", pen_dev->name);
+	pen_dev->phys = ts_phys;
+	pen_dev->id.bustype = core_data->bus->bus_type;
+	pen_dev->id.vendor = 0x27C6;
+	pen_dev->id.product = 0x0002;
+	pen_dev->id.version = 0x0100;
+
+	pen_dev->evbit[0] |= BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS);
+	set_bit(ABS_X, pen_dev->absbit);
+	set_bit(ABS_Y, pen_dev->absbit);
+	set_bit(ABS_TILT_X, pen_dev->absbit);
+	set_bit(ABS_TILT_Y, pen_dev->absbit);
+	set_bit(BTN_STYLUS, pen_dev->keybit);
+	set_bit(BTN_STYLUS2, pen_dev->keybit);
+	set_bit(BTN_TOUCH, pen_dev->keybit);
+	set_bit(BTN_TOOL_PEN, pen_dev->keybit);
+	set_bit(INPUT_PROP_DIRECT, pen_dev->propbit);
+	input_set_abs_params(pen_dev, ABS_X, 0, ts_bdata->panel_max_x, 0, 0);
+	input_set_abs_params(pen_dev, ABS_Y, 0, ts_bdata->panel_max_y, 0, 0);
+	input_set_abs_params(pen_dev, ABS_PRESSURE, 0,
+			     ts_bdata->panel_max_p, 0, 0);
+	input_set_abs_params(pen_dev, ABS_DISTANCE, 0, 255, 0, 0);
+	input_set_abs_params(pen_dev, ABS_TILT_X,
+			-GOODIX_PEN_MAX_TILT, GOODIX_PEN_MAX_TILT, 0, 0);
+	input_set_abs_params(pen_dev, ABS_TILT_Y,
+			-GOODIX_PEN_MAX_TILT, GOODIX_PEN_MAX_TILT, 0, 0);
+
+	r = input_register_device(pen_dev);
+	if (r < 0) {
+		ts_err("Unable to register pen device");
+		input_free_device(pen_dev);
+		return r;
+	}
+
+	return 0;
+}
+
+void goodix_ts_input_dev_remove(struct goodix_ts_core *core_data)
+{
+	if (!core_data->input_dev)
+		return;
+	input_unregister_device(core_data->input_dev);
+	input_free_device(core_data->input_dev);
+	core_data->input_dev = NULL;
+}
+
+void goodix_ts_pen_dev_remove(struct goodix_ts_core *core_data)
+{
+	if (!core_data->pen_dev)
+		return;
+	input_unregister_device(core_data->pen_dev);
+	input_free_device(core_data->pen_dev);
+	core_data->pen_dev = NULL;
+}
+
+/**
+ * goodix_ts_esd_work - check hardware status and recovery
+ *  the hardware if needed.
+ */
+static void goodix_ts_esd_work(struct work_struct *work)
+{
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct goodix_ts_esd *ts_esd = container_of(dwork,
+			struct goodix_ts_esd, esd_work);
+	struct goodix_ts_core *cd = container_of(ts_esd,
+			struct goodix_ts_core, ts_esd);
+	const struct goodix_ts_hw_ops *hw_ops = cd->hw_ops;
+	int ret = 0;
+
+	if (ts_esd->irq_status)
+		goto exit;
+
+	if (!atomic_read(&ts_esd->esd_on))
+		return;
+
+	if (!hw_ops->esd_check)
+		return;
+
+	ret = hw_ops->esd_check(cd);
+	if (ret) {
+		ts_err("esd check failed");
+		goodix_ts_power_off(cd);
+		usleep_range(5000, 5100);
+		goodix_ts_power_on(cd);
+	}
+
+exit:
+	ts_esd->irq_status = false;
+	if (atomic_read(&ts_esd->esd_on))
+		schedule_delayed_work(&ts_esd->esd_work, 2 * HZ);
+}
+
+/**
+ * goodix_ts_esd_on - turn on esd protection
+ */
+static void goodix_ts_esd_on(struct goodix_ts_core *cd)
+{
+	struct goodix_ic_info_misc *misc = &cd->ic_info.misc;
+	struct goodix_ts_esd *ts_esd = &cd->ts_esd;
+
+	if (!misc->esd_addr)
+		return;
+
+	if (atomic_read(&ts_esd->esd_on))
+		return;
+
+	atomic_set(&ts_esd->esd_on, 1);
+	if (!schedule_delayed_work(&ts_esd->esd_work, 2 * HZ))
+		ts_info("esd work already in workqueue");
+
+	ts_info("esd on");
+}
+
+/**
+ * goodix_ts_esd_off - turn off esd protection
+ */
+static void goodix_ts_esd_off(struct goodix_ts_core *cd)
+{
+	struct goodix_ts_esd *ts_esd = &cd->ts_esd;
+	int ret;
+
+	if (!atomic_read(&ts_esd->esd_on))
+		return;
+
+	atomic_set(&ts_esd->esd_on, 0);
+	ret = cancel_delayed_work_sync(&ts_esd->esd_work);
+	ts_info("Esd off, esd work state %d", ret);
+}
+
+/**
+ * goodix_esd_notifier_callback - notification callback
+ *  under certain condition, we need to turn off/on the esd
+ *  protector, we use kernel notify call chain to achieve this.
+ *
+ *  for example: before firmware update we need to turn off the
+ *  esd protector and after firmware update finished, we should
+ *  turn on the esd protector.
+ */
+static int goodix_esd_notifier_callback(struct notifier_block *nb,
+		unsigned long action, void *data)
+{
+	struct goodix_ts_esd *ts_esd = container_of(nb,
+			struct goodix_ts_esd, esd_notifier);
+
+	switch (action) {
+	case NOTIFY_FWUPDATE_START:
+	case NOTIFY_SUSPEND:
+ts_err("zmw---SUSPEND");	
+/*Add by T2M-mingwu.zhang for FP5-195 remarks: Double click on driver update.[Begin]*/	
+//		goodix_ts_power_off(ts_esd->ts_core);
+/*Add by T2M-mingwu.zhang [End]*/
+		break;		
+	case NOTIFY_ESD_OFF:
+		goodix_ts_esd_off(ts_esd->ts_core);
+		break;
+	case NOTIFY_FWUPDATE_FAILED:
+	case NOTIFY_FWUPDATE_SUCCESS:
+	case NOTIFY_RESUME:
+ts_err("zmw---RESUME");	
+		goodix_ts_power_on(ts_esd->ts_core);
+		break;	
+	case NOTIFY_ESD_ON:
+		goodix_ts_esd_on(ts_esd->ts_core);
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+/**
+ * goodix_ts_esd_init - initialize esd protection
+ */
+int goodix_ts_esd_init(struct goodix_ts_core *cd)
+{
+	struct goodix_ic_info_misc *misc = &cd->ic_info.misc;
+	struct goodix_ts_esd *ts_esd = &cd->ts_esd;
+
+	if (!cd->hw_ops->esd_check || !misc->esd_addr) {
+		ts_info("missing key info for esd check");
+		return 0;
+	}
+
+	INIT_DELAYED_WORK(&ts_esd->esd_work, goodix_ts_esd_work);
+	ts_esd->ts_core = cd;
+	atomic_set(&ts_esd->esd_on, 0);
+	ts_esd->esd_notifier.notifier_call = goodix_esd_notifier_callback;
+	goodix_ts_register_notifier(&ts_esd->esd_notifier);
+	goodix_ts_esd_on(cd);
+
+	return 0;
+}
+
+static void goodix_ts_release_connects(struct goodix_ts_core *core_data)
+{
+	struct input_dev *input_dev = core_data->input_dev;
+	struct input_dev *pen_dev = core_data->pen_dev;
+	int i;
+
+	mutex_lock(&input_dev->mutex);
+	for (i = 0; i < GOODIX_MAX_TOUCH; i++) {
+		input_mt_slot(input_dev, i);
+		input_mt_report_slot_state(input_dev,
+				MT_TOOL_FINGER,
+				false);
+	}
+	input_report_key(input_dev, BTN_TOUCH, 0);
+	input_mt_sync_frame(input_dev);
+	input_sync(input_dev);
+	mutex_unlock(&input_dev->mutex);
+
+	if (core_data->board_data.pen_enable) {
+		mutex_lock(&pen_dev->mutex);
+		input_report_key(pen_dev, BTN_TOUCH, 0);
+		input_report_key(pen_dev, BTN_TOOL_PEN, 0);
+		input_sync(pen_dev);
+		mutex_unlock(&pen_dev->mutex);
+	}
+
+	if (core_data->gesture_type)
+		core_data->hw_ops->after_event_handler(core_data);
+}
+
+/**
+ * goodix_ts_suspend - Touchscreen suspend function
+ * Called by PM/FB/EARLYSUSPEN module to put the device to sleep
+ */
+static int goodix_ts_suspend(struct goodix_ts_core *core_data)
+{
+	struct goodix_ext_module *ext_module, *next;
+	struct goodix_ts_hw_ops *hw_ops = core_data->hw_ops;
+	int ret;
+
+	if (core_data->init_stage < CORE_INIT_STAGE2 ||
+			atomic_read(&core_data->suspended))
+		return 0;
+
+	ts_err("Suspend start");
+	atomic_set(&core_data->suspended, 1);
+	/* disable irq */
+	hw_ops->irq_enable(core_data, false);
+
+	/*
+	 * notify suspend event, inform the esd protector
+	 * and charger detector to turn off the work
+	 */
+	goodix_ts_blocking_notify(NOTIFY_SUSPEND, NULL);
+
+	/* inform external module */
+	mutex_lock(&goodix_modules.mutex);
+	if (!list_empty(&goodix_modules.head)) {
+		list_for_each_entry_safe(ext_module, next,
+					 &goodix_modules.head, list) {
+			if (!ext_module->funcs->before_suspend)
+				continue;
+
+			ret = ext_module->funcs->before_suspend(core_data,
+							      ext_module);
+			if (ret == EVT_CANCEL_SUSPEND) {
+				mutex_unlock(&goodix_modules.mutex);
+				ts_info("Canceled by module:%s",
+					ext_module->name);
+				goto out;
+			}
+		}
+	}
+	mutex_unlock(&goodix_modules.mutex);
+
+	/* enter sleep mode or power off */
+	if (core_data->board_data.sleep_enable)
+		hw_ops->suspend(core_data);
+/*Add by T2M-mingwu.zhang for FP5-195 remarks: Double click on driver update.[Begin]*/	
+/* 	else
+		goodix_ts_power_off(core_data); */
+/*Add by T2M-mingwu.zhang [End]*/		
+
+	/* inform exteranl modules */
+	mutex_lock(&goodix_modules.mutex);
+	if (!list_empty(&goodix_modules.head)) {
+		list_for_each_entry_safe(ext_module, next,
+					&goodix_modules.head, list) {
+			if (!ext_module->funcs->after_suspend)
+				continue;
+
+			ret = ext_module->funcs->after_suspend(core_data,
+							     ext_module);
+			if (ret == EVT_CANCEL_SUSPEND) {
+				mutex_unlock(&goodix_modules.mutex);
+				ts_info("Canceled by module:%s",
+					ext_module->name);
+				goto out;
+			}
+		}
+	}
+	mutex_unlock(&goodix_modules.mutex);
+
+out:
+	goodix_ts_release_connects(core_data);
+	ts_info("Suspend end");
+	return 0;
+}
+
+/**
+ * goodix_ts_resume - Touchscreen resume function
+ * Called by PM/FB/EARLYSUSPEN module to wakeup device
+ */
+static int goodix_ts_resume(struct goodix_ts_core *core_data)
+{
+	struct goodix_ext_module *ext_module, *next;
+	struct goodix_ts_hw_ops *hw_ops = core_data->hw_ops;
+	int ret;
+
+	if (core_data->init_stage < CORE_INIT_STAGE2 ||
+			!atomic_read(&core_data->suspended))
+		return 0;
+
+	ts_err("Resume start");
+	atomic_set(&core_data->suspended, 0);
+	hw_ops->irq_enable(core_data, false);
+
+	mutex_lock(&goodix_modules.mutex);
+	if (!list_empty(&goodix_modules.head)) {
+		list_for_each_entry_safe(ext_module, next,
+					 &goodix_modules.head, list) {
+			if (!ext_module->funcs->before_resume)
+				continue;
+
+			ret = ext_module->funcs->before_resume(core_data,
+					ext_module);
+			if (ret == EVT_CANCEL_RESUME) {
+				mutex_unlock(&goodix_modules.mutex);
+				ts_info("Canceled by module:%s",
+					ext_module->name);
+				goto out;
+			}
+		}
+	}
+	mutex_unlock(&goodix_modules.mutex);
+
+	/* reset device or power on*/
+	if (core_data->board_data.sleep_enable)
+		hw_ops->resume(core_data);
+	else
+		goodix_ts_power_on(core_data);
+
+	mutex_lock(&goodix_modules.mutex);
+	if (!list_empty(&goodix_modules.head)) {
+		list_for_each_entry_safe(ext_module, next,
+					 &goodix_modules.head, list) {
+			if (!ext_module->funcs->after_resume)
+				continue;
+
+			ret = ext_module->funcs->after_resume(core_data,
+							    ext_module);
+			if (ret == EVT_CANCEL_RESUME) {
+				mutex_unlock(&goodix_modules.mutex);
+				ts_info("Canceled by module:%s",
+					ext_module->name);
+				goto out;
+			}
+		}
+	}
+	mutex_unlock(&goodix_modules.mutex);
+
+out:
+	/* enable irq */
+	hw_ops->irq_enable(core_data, true);
+	/* open esd */
+	goodix_ts_blocking_notify(NOTIFY_RESUME, NULL);
+	ts_info("Resume end");
+	return 0;
+}
+
+#if IS_ENABLED(CONFIG_FB)
+/**
+ * goodix_ts_fb_notifier_callback - Framebuffer notifier callback
+ * Called by kernel during framebuffer blanck/unblank phrase
+ */
+int goodix_ts_fb_notifier_callback(struct notifier_block *self,
+	unsigned long event, void *data)
+{
+	struct goodix_ts_core *core_data =
+		container_of(self, struct goodix_ts_core, fb_notifier);
+	struct fb_event *fb_event = data;
+
+	if (fb_event && fb_event->data && core_data) {
+		if (event == FB_EVENT_BLANK) {
+			int *blank = fb_event->data;
+
+			if (*blank == FB_BLANK_UNBLANK)
+				goodix_ts_resume(core_data);
+			else if (*blank == FB_BLANK_POWERDOWN)
+				goodix_ts_suspend(core_data);
+		}
+	}
+
+	return 0;
+}
+#endif
+
+#if IS_ENABLED(CONFIG_DRM)
+int goodix_ts_drm_notifier_callback(struct notifier_block *self,
+        unsigned long event, void *data)
+{
+    struct drm_panel_notifier *evdata = data;
+    int *blank = NULL;
+	struct goodix_ts_core *core_data =
+		container_of(self, struct goodix_ts_core, fb_notifier);
+
+    ts_info("in\n");
+
+    if (!evdata)
+        return 0;
+
+    if (!(event == DRM_PANEL_EARLY_EVENT_BLANK ||
+        event == DRM_PANEL_EVENT_BLANK)) {
+        ts_info("event(%lu) do not need process\n", event);
+        return 0;
+    }
+
+    blank = evdata->data;
+    ts_info("FB event:%lu,blank:%d", event, *blank);
+    switch (*blank) {
+    case DRM_PANEL_BLANK_UNBLANK:
+        if (event == DRM_PANEL_EARLY_EVENT_BLANK) {
+            ts_info("resume: event = %lu, not care\n", event);
+        } else if (event == DRM_PANEL_EVENT_BLANK) {
+            goodix_ts_resume(core_data);
+        }
+        break;
+
+    case DRM_PANEL_BLANK_POWERDOWN:
+        if (event == DRM_PANEL_EARLY_EVENT_BLANK) {
+           goodix_ts_suspend(core_data);
+        } else if (event == DRM_PANEL_EVENT_BLANK) {
+            ts_info("suspend: event = %lu, not care\n", event);
+        }
+        break;
+
+    default:
+        ts_err("FB BLANK(%d) do not need process\n", *blank);
+        break;
+    }
+
+    return 0;
+
+}
+#endif
+
+#if IS_ENABLED(CONFIG_PM)
+#if !IS_ENABLED(CONFIG_FB)&& !IS_ENABLED(CONFIG_DRM)&& !IS_ENABLED(CONFIG_HAS_EARLYSUSPEND)
+/**
+ * goodix_ts_pm_suspend - PM suspend function
+ * Called by kernel during system suspend phrase
+ */
+static int goodix_ts_pm_suspend(struct device *dev)
+{
+	struct goodix_ts_core *core_data =
+		dev_get_drvdata(dev);
+
+	return goodix_ts_suspend(core_data);
+}
+/**
+ * goodix_ts_pm_resume - PM resume function
+ * Called by kernel during system wakeup
+ */
+static int goodix_ts_pm_resume(struct device *dev)
+{
+	struct goodix_ts_core *core_data =
+		dev_get_drvdata(dev);
+
+	return goodix_ts_resume(core_data);
+}
+#endif
+#endif
+
+/**
+ * goodix_generic_noti_callback - generic notifier callback
+ *  for goodix touch notification event.
+ */
+static int goodix_generic_noti_callback(struct notifier_block *self,
+		unsigned long action, void *data)
+{
+	struct goodix_ts_core *cd = container_of(self,
+			struct goodix_ts_core, ts_notifier);
+	const struct goodix_ts_hw_ops *hw_ops = cd->hw_ops;
+
+	if (cd->init_stage < CORE_INIT_STAGE2)
+		return 0;
+
+	ts_info("notify event type 0x%x", (unsigned int)action);
+	switch (action) {
+	case NOTIFY_FWUPDATE_START:
+		hw_ops->irq_enable(cd, 0);
+		break;
+	case NOTIFY_FWUPDATE_SUCCESS:
+	case NOTIFY_FWUPDATE_FAILED:
+		if (hw_ops->read_version(cd, &cd->fw_version))
+			ts_info("failed read fw version info[ignore]");
+		hw_ops->irq_enable(cd, 1);
+		break;
+	default:
+		break;
+	}
+	return 0;
+}
+
+static void goodix_self_check(struct work_struct *work)
+{
+	struct goodix_ts_core *cd =
+			container_of(work, struct goodix_ts_core, self_check_work);
+	u32 fw_state_addr = cd->ic_info.misc.fw_state_addr;
+	int update_flag = UPDATE_MODE_BLOCK | UPDATE_MODE_SRC_REQUEST | UPDATE_MODE_FORCE;
+	u8 cur_cycle_cnt = 0;
+	u8 pre_cycle_cnt = 0;
+	int err_cnt = 0;
+	int retry = 5;
+
+	while (retry--) {
+		cd->hw_ops->read(cd, fw_state_addr, &cur_cycle_cnt, 1);
+		if (cur_cycle_cnt == pre_cycle_cnt)
+			err_cnt++;
+		pre_cycle_cnt = cur_cycle_cnt;
+		msleep(20);
+	}
+	if (err_cnt > 1) {
+		ts_err("Warning! The firmware maybe running abnormal, need upgrade.");
+		goodix_do_fw_update(cd->ic_configs[CONFIG_TYPE_NORMAL],
+				update_flag);
+	}
+}
+
+int goodix_ts_stage2_init(struct goodix_ts_core *cd)
+{
+	int ret;
+
+	/* alloc/config/register input device */
+	ret = goodix_ts_input_dev_config(cd);
+	if (ret < 0) {
+		ts_err("failed set input device");
+		return ret;
+	}
+
+	if (cd->board_data.pen_enable) {
+		ret = goodix_ts_pen_dev_config(cd);
+		if (ret < 0) {
+			ts_err("failed set pen device");
+			goto err_finger;
+		}
+	}
+	/* request irq line */
+	ret = goodix_ts_irq_setup(cd);
+	if (ret < 0) {
+		ts_info("failed set irq");
+		goto exit;
+	}
+	ts_info("success register irq");
+
+#if IS_ENABLED(CONFIG_FB)
+	cd->fb_notifier.notifier_call = goodix_ts_fb_notifier_callback;
+	if (fb_register_client(&cd->fb_notifier))
+		ts_err("Failed to register fb notifier client:%d", ret);
+#elif IS_ENABLED(CONFIG_DRM)	
+	cd->fb_notifier.notifier_call = goodix_ts_drm_notifier_callback;
+	if (gdix_active_panel) {	
+		ret = drm_panel_notifier_register(gdix_active_panel,
+			&cd->fb_notifier);
+		if (ret)
+			ts_err("Failed to register fb notifier client");
+	}
+	else {
+		ts_err("gdix_active_panel error\n");
+	}		
+#endif
+	/* create sysfs files */
+	goodix_ts_sysfs_init(cd);
+
+	/* create procfs files */
+	goodix_ts_procfs_init(cd);
+
+	/* esd protector */
+	goodix_ts_esd_init(cd);
+
+	/* gesture init */
+	gesture_module_init();
+
+	/* inspect init */
+	inspect_module_init(cd);
+
+	/* Do self check on first boot */
+	INIT_WORK(&cd->self_check_work, goodix_self_check);
+	schedule_work(&cd->self_check_work);
+
+	return 0;
+exit:
+	goodix_ts_pen_dev_remove(cd);
+err_finger:
+	goodix_ts_input_dev_remove(cd);
+	return ret;
+}
+
+/* try send the config specified with type */
+static int goodix_send_ic_config(struct goodix_ts_core *cd, int type)
+{
+	u32 config_id;
+	struct goodix_ic_config *cfg;
+
+	if (type >= GOODIX_MAX_CONFIG_GROUP) {
+		ts_err("unsupproted config type %d", type);
+		return -EINVAL;
+	}
+
+	cfg = cd->ic_configs[type];
+	if (!cfg || cfg->len <= 0) {
+		ts_info("no valid normal config found");
+		return -EINVAL;
+	}
+
+	config_id = goodix_get_file_config_id(cfg->data);
+	if (cd->ic_info.version.config_id == config_id) {
+		ts_info("config id is equal 0x%x, skiped", config_id);
+		return 0;
+	}
+
+	ts_info("try send config, id=0x%x", config_id);
+	return cd->hw_ops->send_config(cd, cfg->data, cfg->len);
+}
+
+/**
+ * goodix_later_init_thread - init IC fw and config
+ * @data: point to goodix_ts_core
+ *
+ * This function respond for get fw version and try upgrade fw and config.
+ * Note: when init encounter error, need release all resource allocated here.
+ */
+static int goodix_later_init_thread(void *data)
+{
+	int ret, i;
+	int update_flag = UPDATE_MODE_BLOCK | UPDATE_MODE_SRC_REQUEST;
+	struct goodix_ts_core *cd = data;
+	struct goodix_ts_hw_ops *hw_ops = cd->hw_ops;
+
+	/* step 1: read version */
+	ret = hw_ops->read_version(cd, &cd->fw_version);
+	if (ret < 0) {
+		ts_err("failed to get version info, try to upgrade");
+		update_flag |= UPDATE_MODE_FORCE;
+	}
+
+	/* step 2: read ic info */
+	ret = hw_ops->get_ic_info(cd, &cd->ic_info);
+	if (ret < 0) {
+		ts_err("failed to get ic info, try to upgrade");
+		update_flag |= UPDATE_MODE_FORCE;
+	}
+
+	/* step 3: get config data from config bin */
+	ret = goodix_get_config_proc(cd);
+	if (ret)
+		ts_info("no valid ic config found");
+	else
+		ts_info("success get valid ic config");
+
+	/* step 4: init fw struct add try do fw upgrade */
+	ret = goodix_fw_update_init(cd);
+	if (ret) {
+		ts_err("failed init fw update module");
+		goto err_out;
+	}
+
+	/* step 5: do upgrade */
+	ts_info("update flag: 0x%X", update_flag);
+	ret = goodix_do_fw_update(cd->ic_configs[CONFIG_TYPE_NORMAL],
+			update_flag);
+	if (ret)
+		ts_err("failed do fw update");
+
+	print_ic_info(&cd->ic_info);
+
+	/* the recomend way to update ic config is throuth ISP,
+	 * if not we will send config with interactive mode
+	 */
+	goodix_send_ic_config(cd, CONFIG_TYPE_NORMAL);
+
+	/* init other resources */
+	ret = goodix_ts_stage2_init(cd);
+	if (ret) {
+		ts_err("stage2 init failed");
+		goto uninit_fw;
+	}
+	cd->init_stage = CORE_INIT_STAGE2;
+
+	return 0;
+
+uninit_fw:
+	goodix_fw_update_uninit();
+err_out:
+	ts_err("stage2 init failed");
+	cd->init_stage = CORE_INIT_FAIL;
+	for (i = 0; i < GOODIX_MAX_CONFIG_GROUP; i++) {
+		kfree(cd->ic_configs[i]);
+		cd->ic_configs[i] = NULL;
+	}
+	return ret;
+}
+
+static int goodix_start_later_init(struct goodix_ts_core *ts_core)
+{
+	struct task_struct *init_thrd;
+	/* create and run update thread */
+	init_thrd = kthread_run(goodix_later_init_thread,
+				ts_core, "goodix_init_thread");
+	if (IS_ERR_OR_NULL(init_thrd)) {
+		ts_err("Failed to create update thread:%ld",
+		       PTR_ERR(init_thrd));
+		return -EFAULT;
+	}
+	return 0;
+}
+
+/* goodix fb test */
+// static void test_suspend(void)
+// {
+// 	goodix_ts_suspend(goodix_modules.core_data);
+// }
+
+// static void test_resume(void)
+// {
+// 	goodix_ts_resume(goodix_modules.core_data);
+// }
+
+/**
+ * goodix_ts_probe - called by kernel when Goodix touch
+ *  platform driver is added.
+ */
+static int goodix_ts_probe(struct platform_device *pdev)
+{
+	struct goodix_ts_core *core_data = NULL;
+	struct goodix_bus_interface *bus_interface;
+	int ret;
+
+	ts_err("IN_GT9897");
+
+	bus_interface = pdev->dev.platform_data;
+	if (!bus_interface) {
+		ts_err("Invalid touch device");
+		core_module_prob_sate = CORE_MODULE_PROB_FAILED;
+		return -ENODEV;
+	}
+
+	core_data = devm_kzalloc(&pdev->dev,
+			sizeof(struct goodix_ts_core), GFP_KERNEL);
+	if (!core_data) {
+		core_module_prob_sate = CORE_MODULE_PROB_FAILED;
+		return -ENOMEM;
+	}
+
+	if (IS_ENABLED(CONFIG_OF) && bus_interface->dev->of_node) {
+		/* parse devicetree property */
+		ret = goodix_parse_dt(bus_interface->dev->of_node,
+					&core_data->board_data);
+		if (ret) {
+			ts_err("failed parse device info form dts, %d", ret);
+			return -EINVAL;
+		}
+	} else {
+		ts_err("no valid device tree node found");
+		return -ENODEV;
+	}
+
+	if (check_dt(bus_interface->dev->of_node)) {
+		if (!check_default_tp(bus_interface->dev->of_node, "qcom,spi-touch-active"))
+			ret = -EPROBE_DEFER;
+		else
+			ret = -ENODEV;
+        	ts_err("check_dt failed, error=%d", ret);
+		return ret;
+	}
+
+	core_data->hw_ops = goodix_get_hw_ops();
+	if (!core_data->hw_ops) {
+		ts_err("hw ops is NULL");
+		core_module_prob_sate = CORE_MODULE_PROB_FAILED;
+		return -EINVAL;
+	}
+	goodix_core_module_init();
+	/* touch core layer is a platform driver */
+	core_data->pdev = pdev;
+	core_data->bus = bus_interface;
+	platform_set_drvdata(pdev, core_data);
+
+	/* get GPIO resource */
+	ret = goodix_ts_gpio_setup(core_data);
+	if (ret) {
+		ts_err("failed init gpio");
+		goto err_out;
+	}
+
+	ret = goodix_ts_power_init(core_data);
+	if (ret) {
+		ts_err("failed init power");
+		goto err_out;
+	}
+
+	ret = goodix_ts_power_on(core_data);
+	if (ret) {
+		ts_err("failed power on");
+		goto err_out;
+	}
+
+	/* generic notifier callback */
+	core_data->ts_notifier.notifier_call = goodix_generic_noti_callback;
+	goodix_ts_register_notifier(&core_data->ts_notifier);
+
+	/* debug node init */
+	goodix_tools_init();
+
+	/* goodix fb test */
+	// fb_firefly_register(test_suspend, test_resume);
+
+	core_data->init_stage = CORE_INIT_STAGE1;
+	goodix_modules.core_data = core_data;
+	core_module_prob_sate = CORE_MODULE_PROB_SUCCESS;
+
+	/* Try start a thread to get config-bin info */
+	goodix_start_later_init(core_data);
+
+	ts_info("goodix_ts_core probe success");
+	return 0;
+
+err_out:
+	core_data->init_stage = CORE_INIT_FAIL;
+	core_module_prob_sate = CORE_MODULE_PROB_FAILED;
+	ts_err("goodix_ts_core failed, ret:%d", ret);
+	return ret;
+}
+
+static int goodix_ts_remove(struct platform_device *pdev)
+{
+	struct goodix_ts_core *core_data = platform_get_drvdata(pdev);
+	struct goodix_ts_hw_ops *hw_ops = core_data->hw_ops;
+	struct goodix_ts_esd *ts_esd = &core_data->ts_esd;
+	int ret = 0;
+
+	goodix_ts_unregister_notifier(&core_data->ts_notifier);
+	goodix_tools_exit();
+
+	if (core_data->init_stage >= CORE_INIT_STAGE2) {
+		gesture_module_exit();
+		inspect_module_exit();
+		hw_ops->irq_enable(core_data, false);
+	#if IS_ENABLED(CONFIG_FB)
+		fb_unregister_client(&core_data->fb_notifier);
+	#elif IS_ENABLED(CONFIG_DRM)
+		if (gdix_active_panel) {			
+			ret = drm_panel_notifier_unregister(gdix_active_panel,
+					&core_data->fb_notifier);
+			if (ret < 0)
+				ts_err("Failed to unregister fb notifier client");
+		}	
+	#endif
+		core_module_prob_sate = CORE_MODULE_REMOVED;
+		if (atomic_read(&core_data->ts_esd.esd_on))
+			goodix_ts_esd_off(core_data);
+		goodix_ts_unregister_notifier(&ts_esd->esd_notifier);
+
+		goodix_fw_update_uninit();
+		goodix_ts_input_dev_remove(core_data);
+		goodix_ts_pen_dev_remove(core_data);
+		goodix_ts_sysfs_exit(core_data);
+		goodix_ts_procfs_exit(core_data);
+		goodix_ts_power_off(core_data);
+	}
+
+	return 0;
+}
+
+#if IS_ENABLED(CONFIG_PM)
+static const struct dev_pm_ops dev_pm_ops = {
+#if !IS_ENABLED(CONFIG_FB)&& !IS_ENABLED(CONFIG_DRM)&& !IS_ENABLED(CONFIG_HAS_EARLYSUSPEND)
+	.suspend = goodix_ts_pm_suspend,
+	.resume = goodix_ts_pm_resume,
+#endif
+};
+#endif
+
+static const struct platform_device_id ts_core_ids[] = {
+	{.name = GOODIX_CORE_DRIVER_NAME},
+	{}
+};
+MODULE_DEVICE_TABLE(platform, ts_core_ids);
+
+static struct platform_driver goodix_ts_driver = {
+	.driver = {
+		.name = GOODIX_CORE_DRIVER_NAME,
+		.owner = THIS_MODULE,
+#if IS_ENABLED(CONFIG_PM)
+		.pm = &dev_pm_ops,
+#endif
+	},
+	.probe = goodix_ts_probe,
+	.remove = goodix_ts_remove,
+	.id_table = ts_core_ids,
+};
+
+static int __init goodix_ts_core_init(void)
+{
+	int ret;
+
+	ts_info("Core layer init:%s", GOODIX_DRIVER_VERSION);
+#ifdef CONFIG_TOUCHSCREEN_GOODIX_BRL_SPI
+	ret = goodix_spi_bus_init();
+#else
+	ret = goodix_i2c_bus_init();
+#endif
+	if (ret) {
+		ts_err("failed add bus driver");
+		return ret;
+	}
+	return platform_driver_register(&goodix_ts_driver);
+}
+
+static void __exit goodix_ts_core_exit(void)
+{
+	ts_info("Core layer exit");
+	platform_driver_unregister(&goodix_ts_driver);
+#ifdef CONFIG_TOUCHSCREEN_GOODIX_BRL_SPI
+	goodix_spi_bus_exit();
+#else
+	goodix_i2c_bus_exit();
+#endif
+}
+
+late_initcall(goodix_ts_core_init);
+module_exit(goodix_ts_core_exit);
+
+MODULE_DESCRIPTION("Goodix Touchscreen Core Module");
+MODULE_AUTHOR("Goodix, Inc.");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_core.h b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_core.h
new file mode 100644
index 00000000000000..7af4bd9848171b
--- /dev/null
+++ b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_core.h
@@ -0,0 +1,850 @@
+/*
+ * Goodix Gesture Module
+ *
+ * Copyright (C) 2019 - 2020 Goodix, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be a reference
+ * to you, when you are integrating the GOODiX's CTP IC into your system,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#ifndef _GOODIX_TS_CORE_H_
+#define _GOODIX_TS_CORE_H_
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/firmware.h>
+#include <linux/slab.h>
+#include <asm/unaligned.h>
+#include <linux/vmalloc.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
+#include <linux/mutex.h>
+#include <linux/platform_device.h>
+#include <linux/input.h>
+#include <linux/interrupt.h>
+#include <linux/completion.h>
+#include <linux/of_irq.h>
+#if IS_ENABLED(CONFIG_OF)
+#include <linux/of_gpio.h>
+#include <linux/regulator/consumer.h>
+#endif
+#if IS_ENABLED(CONFIG_FB)
+#include <linux/notifier.h>
+#include <linux/fb.h>
+#endif
+
+/*Add by T2M-mingwu.zhang for FP5-538 remarks: TP/LCD Device Information Development.[Begin]*/	
+#ifdef CONFIG_EMKIT_INFO
+#include <emkit/emkit_info.h>
+#define GOODIX_NAME "GOODIX"
+extern char emkit_buf[256];
+#endif
+/*Add by T2M-mingwu.zhang [End]*/
+
+#define GOODIX_CORE_DRIVER_NAME			"goodix_ts"
+#define GOODIX_PEN_DRIVER_NAME			"goodix_ts,pen"
+#define GOODIX_DRIVER_VERSION			"v1.3.7"
+#define GOODIX_MAX_TOUCH				10
+#define GOODIX_MAX_KEY					10
+#define GOODIX_PEN_MAX_PRESSURE			4096
+#define GOODIX_MAX_PEN_KEY				2
+#define GOODIX_PEN_MAX_TILT				90
+#define GOODIX_CFG_MAX_SIZE				4096
+#define GOODIX_FW_MAX_SIEZE				(300 * 1024)
+#define GOODIX_MAX_STR_LABLE_LEN		32
+#define GOODIX_MAX_FRAMEDATA_LEN		2000
+#define GOODIX_GESTURE_DATA_LEN			16
+
+#define GOODIX_NORMAL_RESET_DELAY_MS	100
+#define GOODIX_HOLD_CPU_RESET_DELAY_MS  5
+
+#define GOODIX_RETRY_3					3
+#define GOODIX_RETRY_5					5
+#define GOODIX_RETRY_10					10
+
+#define TS_DEFAULT_FIRMWARE				"goodix_firmware.bin"
+#define TS_DEFAULT_CFG_BIN				"goodix_cfg_group.bin"
+
+enum GOODIX_GESTURE_TYP {
+	GESTURE_C			= (1 << 0),
+	GESTURE_E			= (1 << 1),
+	GESTURE_F			= (1 << 2),
+	GESTURE_O			= (1 << 3),
+	GESTURE_M			= (1 << 4),
+	GESTURE_W			= (1 << 5),
+	GESTURE_DOUBLE_TAP	= (1 << 7),
+	GESTURE_BA			= (1 << 8),
+	GESTURE_AB			= (1 << 9),
+	GESTURE_AA			= (1 << 10),
+	GESTURE_BB			= (1 << 11),
+	GESTURE_SINGLE_TAP	= (1 << 12),
+	GESTURE_FOD_PRESS	= (1 << 13)
+};
+
+enum CORD_PROB_STA {
+	CORE_MODULE_UNPROBED = 0,
+	CORE_MODULE_PROB_SUCCESS = 1,
+	CORE_MODULE_PROB_FAILED = -1,
+	CORE_MODULE_REMOVED = -2,
+};
+
+enum GOODIX_ERR_CODE {
+	GOODIX_EBUS      = (1<<0),
+	GOODIX_ECHECKSUM = (1<<1),
+	GOODIX_EVERSION  = (1<<2),
+	GOODIX_ETIMEOUT  = (1<<3),
+	GOODIX_EMEMCMP   = (1<<4),
+
+	GOODIX_EOTHER    = (1<<7)
+};
+
+/* MAIN-ID */
+enum IC_TYPE_ID {
+	IC_TYPE_NONE,
+	IC_TYPE_NORMANDY,
+	IC_TYPE_NANJING,
+	IC_TYPE_YELLOWSTONE,
+	IC_TYPE_BERLIN_A,
+	IC_TYPE_BERLIN_B,
+	IC_TYPE_BERLIN_D,
+	IC_TYPE_NOTTINGHAM
+};
+
+/* SUB-ID
+ * sub type of berlinB serial IC.
+ * for convenience we put the MAIN-ID on the hith bits,
+ * hith 8 bits is MAIN-ID, low 8 bits is MIN-ID
+ */
+enum BERLIN_B_SUB_ID {
+	IC_TYPE_SUB_B2 = (IC_TYPE_BERLIN_B << 8) | 0x2,
+};
+
+
+enum GOODIX_IC_CONFIG_TYPE {
+	CONFIG_TYPE_TEST = 0,
+	CONFIG_TYPE_NORMAL = 1,
+	CONFIG_TYPE_HIGHSENSE = 2,
+	CONFIG_TYPE_CHARGER = 3,
+	CONFIG_TYPE_CHARGER_HS = 4,
+	CONFIG_TYPE_HOLSTER = 5,
+	CONFIG_TYPE_HOSTER_CH = 6,
+	CONFIG_TYPE_OTHER = 7,
+	/* keep this at the last */
+	GOODIX_MAX_CONFIG_GROUP = 8,
+};
+
+enum CHECKSUM_MODE {
+	CHECKSUM_MODE_U8_LE,
+	CHECKSUM_MODE_U16_LE,
+};
+
+#define MAX_SCAN_FREQ_NUM            8
+#define MAX_SCAN_RATE_NUM            8
+#define MAX_FREQ_NUM_STYLUS          8
+#define MAX_STYLUS_SCAN_FREQ_NUM     6
+#pragma pack(1)
+struct flash_head {
+    uint32_t checksum;
+    uint32_t address;
+    uint32_t length;
+};
+
+struct frame_head {
+	uint8_t sync;
+	uint16_t frame_index;
+	uint16_t cur_frame_len;
+	uint16_t next_frame_len;
+	uint32_t data_en; /* 0- 7 for pack_en; 8 - 31 for type en */
+	uint8_t touch_pack_index;
+	uint8_t stylus_pack_index;
+	uint8_t res;
+	uint16_t checksum;
+};
+
+struct goodix_fw_version {
+	u8 rom_pid[6];               /* rom PID */
+	u8 rom_vid[3];               /* Mask VID */
+	u8 rom_vid_reserved;
+	u8 patch_pid[8];              /* Patch PID */
+	u8 patch_vid[4];              /* Patch VID */
+	u8 patch_vid_reserved;
+	u8 sensor_id;
+	u8 reserved[2];
+	u16 checksum;
+};
+
+struct goodix_ic_info_version {
+	u8 info_customer_id;
+	u8 info_version_id;
+	u8 ic_die_id;
+	u8 ic_version_id;
+	u32 config_id;
+	u8 config_version;
+	u8 frame_data_customer_id;
+	u8 frame_data_version_id;
+	u8 touch_data_customer_id;
+	u8 touch_data_version_id;
+	u8 reserved[3];
+};
+
+struct goodix_ic_info_feature { /* feature info*/
+	u16 freqhop_feature;
+	u16 calibration_feature;
+	u16 gesture_feature;
+	u16 side_touch_feature;
+	u16 stylus_feature;
+};
+
+struct goodix_ic_info_param { /* param */
+	u8 drv_num;
+	u8 sen_num;
+	u8 button_num;
+	u8 force_num;
+	u8 active_scan_rate_num;
+	u16 active_scan_rate[MAX_SCAN_RATE_NUM];
+	u8 mutual_freq_num;
+	u16 mutual_freq[MAX_SCAN_FREQ_NUM];
+	u8 self_tx_freq_num;
+	u16 self_tx_freq[MAX_SCAN_FREQ_NUM];
+	u8 self_rx_freq_num;
+	u16 self_rx_freq[MAX_SCAN_FREQ_NUM];
+	u8 stylus_freq_num;
+	u16 stylus_freq[MAX_FREQ_NUM_STYLUS];
+};
+
+struct goodix_ic_info_misc { /* other data */
+	u32 cmd_addr;
+	u16 cmd_max_len;
+	u32 cmd_reply_addr;
+	u16 cmd_reply_len;
+	u32 fw_state_addr;
+	u16 fw_state_len;
+	u32 fw_buffer_addr;
+	u16 fw_buffer_max_len;
+	u32 frame_data_addr;
+	u16 frame_data_head_len;
+	u16 fw_attr_len;
+	u16 fw_log_len;
+	u8 pack_max_num;
+	u8 pack_compress_version;
+	u16 stylus_struct_len;
+	u16 mutual_struct_len;
+	u16 self_struct_len;
+	u16 noise_struct_len;
+	u32 touch_data_addr;
+	u16 touch_data_head_len;
+	u16 point_struct_len;
+	u16 screen_real_max_x;
+	u16 screen_real_max_y;
+	u32 mutual_rawdata_addr;
+	u32 mutual_diffdata_addr;
+	u32 mutual_refdata_addr;
+	u32 self_rawdata_addr;
+	u32 self_diffdata_addr;
+	u32 self_refdata_addr;
+	u32 iq_rawdata_addr;
+	u32 iq_refdata_addr;
+	u32 im_rawdata_addr;
+	u16 im_rawdata_len;
+	u32 noise_rawdata_addr;
+	u16 noise_rawdata_len;
+	u32 stylus_rawdata_addr;
+	u16 stylus_rawdata_len;
+	u32 noise_data_addr;
+	u32 esd_addr;
+	u32 auto_scan_cmd_addr;
+	u32 auto_scan_info_addr;
+};
+
+struct goodix_ic_info_other {
+	u16 normalize_k_version;
+	u32 irrigation_data_addr;
+	u32 algo_debug_data_addr;
+	u16 algo_debug_data_len;
+	u32 update_sync_data_addr;
+	u16 screen_max_x;
+	u16 screen_max_y;
+};
+
+struct goodix_ic_info {
+	u16 length;
+	struct goodix_ic_info_version version;
+	struct goodix_ic_info_feature feature;
+	struct goodix_ic_info_param parm;
+	struct goodix_ic_info_misc misc;
+	struct goodix_ic_info_other other;
+};
+
+// goodix_ic_info V2
+struct goodix_ic_info_sub_version {
+	u16 length;
+	u8 ic_die_id;
+	u8 ic_version_id;
+	u8 frame_data_customer_id;
+	u8 frame_data_version_id;
+	u8 touch_data_customer_id;
+	u8 touch_data_version_id;
+	u16 normalize_k_version;
+	u32 algorithm_version;
+	u32 short_test_version;
+	u32 lsp_version;
+	u32 config_id;
+	u8 config_version;
+};
+
+struct goodix_ic_info_sub_sample {
+	u16 length;
+	u16 screen_real_max_x;
+	u16 screen_real_max_y;
+	u16 screen_max_x;
+	u16 screen_max_y;
+	u8 stylus_feature;
+	union {
+		u8 freqhop_feature:1;
+		u8 calibration_feature:1;
+		u8 gesture_feature:1;
+		u8 stylus_freqhop_feature:1;
+	};
+	u8 drv_num;
+	u8 sen_num;
+	u8 button_num;
+	u8 force_num;
+	u8 active_scan_rate_num;
+	u16 active_scan_rate[MAX_SCAN_RATE_NUM];
+	u8 mutual_freq_num;
+	u16 mutual_freq[MAX_SCAN_FREQ_NUM];
+	u8 self_tx_freq_num;
+	u16 self_tx_freq[MAX_SCAN_FREQ_NUM];
+	u8 self_rx_freq_num;
+	u16 self_rx_freq[MAX_SCAN_FREQ_NUM];
+	u8 stylus_freq_num;
+	u16 stylus_freq[MAX_FREQ_NUM_STYLUS];
+	u8 stylus_tx2_freq_num;
+	u16 stylus_tx2_freq[MAX_FREQ_NUM_STYLUS];
+};
+
+struct goodix_ic_info_sub_address {
+	u16 length;
+	u32 cmd_addr;
+	u16 cmd_max_len;
+	u32 cmd_reply_addr;
+	u16 cmd_reply_len;
+	u32 fw_state_addr;
+	u8 fw_state_len;
+	u8 cmd_state_len;
+	u32 fw_buffer_addr;
+	u16 fw_buffer_max_len;
+	u32 frame_data_addr;
+	u16 frame_data_head_len;
+	u16 fw_attr_len;
+	u16 fw_log_len;
+	u8 pack_max_num;
+	u8 pack_compress_version;
+	u16 stylus_struct_len;
+	u16 mutual_struct_len;
+	u16 self_struct_len;
+	u16 noise_struct_len;
+	u16 im_struct_len;
+	u32 touch_data_addr;
+	u16 touch_data_head_len;
+	u16 point_struct_len;
+	u32 mutual_rawdata_addr;
+	u32 mutual_diffdata_addr;
+	u32 mutual_refdata_addr;
+	u32 self_rawdata_addr;
+	u32 self_diffdata_addr;
+	u32 self_refdata_addr;
+	u32 iq_rawdata_addr;
+	u32 iq_refdata_addr;
+	u32 im_rawdata_addr;
+	u16 im_rawdata_len;
+	u32 noise_rawdata_addr;
+	u16 noise_rawdata_len;
+	u32 stylus_rawdata_addr;
+	u16 stylus_rawdata_len;
+	u32 noise_data_addr;
+	u32 esd_addr;
+	u32 auto_scan_cmd_addr;
+	u32 auto_scan_info_addr;
+	u32 irrigation_data_addr;
+	u32 algo_debug_data_addr;
+	u16 algo_debug_data_len;
+	u32 update_sync_data_addr;
+};
+
+struct goodix_ic_info_sub_customer {
+	u16 length;
+	u32 customer;
+};
+
+struct goodix_ic_info_v2 {
+	u16 length;
+	u8 info_customer_id;
+	u8 info_version_id;
+	struct goodix_ic_info_sub_version version;
+	struct goodix_ic_info_sub_sample sample;
+	struct goodix_ic_info_sub_address address;
+	struct goodix_ic_info_sub_customer customer;
+};
+#pragma pack()
+
+/*
+ * struct ts_rawdata_info
+ *
+ */
+#define TS_RAWDATA_BUFF_MAX             7000
+#define TS_RAWDATA_RESULT_MAX           100
+struct ts_rawdata_info {
+	int used_size; //fill in rawdata size
+	s16 buff[TS_RAWDATA_BUFF_MAX];
+	char result[TS_RAWDATA_RESULT_MAX];
+};
+
+/*
+ * struct goodix_module - external modules container
+ * @head: external modules list
+ * @initilized: whether this struct is initilized
+ * @mutex: mutex lock
+ * @wq: workqueue to do register work
+ * @core_data: core_data pointer
+ */
+struct goodix_module {
+	struct list_head head;
+	bool initilized;
+	struct mutex mutex;
+	struct workqueue_struct *wq;
+	struct goodix_ts_core *core_data;
+};
+
+/*
+ * struct goodix_ts_board_data -  board data
+ * @avdd_name: name of analoy regulator
+ * @iovdd_name: name of analoy regulator
+ * @reset_gpio: reset gpio number
+ * @irq_gpio: interrupt gpio number
+ * @irq_flag: irq trigger type
+ * @swap_axis: whether swaw x y axis
+ * @panel_max_x/y/w/p: resolution and size
+ * @pannel_key_map: key map
+ * @fw_name: name of the firmware image
+ */
+struct goodix_ts_board_data {
+	char avdd_name[GOODIX_MAX_STR_LABLE_LEN];
+	char iovdd_name[GOODIX_MAX_STR_LABLE_LEN];
+	int reset_gpio;
+	int irq_gpio;
+	int avdd_gpio;
+	int iovdd_gpio;
+	unsigned int  irq_flags;
+
+	unsigned int swap_axis;
+	unsigned int panel_max_x;
+	unsigned int panel_max_y;
+	unsigned int panel_max_w; /*major and minor*/
+	unsigned int panel_max_p; /*pressure*/
+
+	bool pen_enable;
+	bool sleep_enable;
+	char fw_name[GOODIX_MAX_STR_LABLE_LEN];
+	char cfg_bin_name[GOODIX_MAX_STR_LABLE_LEN];
+};
+
+enum goodix_fw_update_mode {
+	UPDATE_MODE_DEFAULT = 0,
+	UPDATE_MODE_FORCE = (1<<0), /* force update mode */
+	UPDATE_MODE_BLOCK = (1<<1), /* update in block mode */
+	UPDATE_MODE_FLASH_CFG = (1<<2), /* reflash config */
+	UPDATE_MODE_SRC_SYSFS = (1<<4), /* firmware file from sysfs */
+	UPDATE_MODE_SRC_HEAD = (1<<5), /* firmware file from head file */
+	UPDATE_MODE_SRC_REQUEST = (1<<6), /* request firmware */
+	UPDATE_MODE_SRC_ARGS = (1<<7), /* firmware data from function args */
+};
+
+#define MAX_CMD_DATA_LEN 10
+#define MAX_CMD_BUF_LEN  16
+#pragma pack(1)
+struct goodix_ts_cmd {
+	union {
+		struct {
+			u8 state;
+			u8 ack;
+			u8 len;
+			u8 cmd;
+			u8 data[MAX_CMD_DATA_LEN];
+		};
+		u8 buf[MAX_CMD_BUF_LEN];
+	};
+};
+#pragma pack()
+
+/* interrupt event type */
+enum ts_event_type {
+	EVENT_INVALID = 0,
+	EVENT_TOUCH = (1 << 0), /* finger touch event */
+	EVENT_PEN = (1 << 1),   /* pen event */
+	EVENT_REQUEST = (1 << 2),
+	EVENT_GESTURE = (1 << 3),
+};
+
+enum ts_point_type {
+	POINT_TYPE_NULL = 0,
+	POINT_TYPE_STYLUS_HOVER = 1,
+	POINT_TYPE_FINGER = 2,
+	POINT_TYPE_STYLUS = 3,
+	POINT_TYPE_GLOVE = 4,
+	POINT_TYPE_KEY = 5,
+};
+
+enum ts_request_type {
+	REQUEST_TYPE_CONFIG = 1,
+	REQUEST_TYPE_RESET = 3,
+};
+
+/* notifier event */
+enum ts_notify_event {
+	NOTIFY_FWUPDATE_START,
+	NOTIFY_FWUPDATE_FAILED,
+	NOTIFY_FWUPDATE_SUCCESS,
+	NOTIFY_SUSPEND,
+	NOTIFY_RESUME,
+	NOTIFY_ESD_OFF,
+	NOTIFY_ESD_ON,
+	NOTIFY_CFG_BIN_FAILED,
+	NOTIFY_CFG_BIN_SUCCESS,
+};
+
+enum touch_point_status {
+	TS_NONE,
+	TS_RELEASE,
+	TS_TOUCH,
+};
+/* coordinate package */
+struct goodix_ts_coords {
+	int status; /* NONE, RELEASE, TOUCH */
+	unsigned int x, y, w, p;
+};
+
+struct goodix_pen_coords {
+	int status; /* NONE, RELEASE, TOUCH */
+	int tool_type;  /* BTN_TOOL_RUBBER BTN_TOOL_PEN */
+	unsigned int x, y, p;
+	signed char tilt_x;
+	signed char tilt_y;
+};
+
+struct goodix_ts_key {
+	int status;
+	int code;
+};
+
+/* touch event data */
+struct goodix_touch_data {
+	int touch_num;
+	bool have_key;
+	struct goodix_ts_key keys[GOODIX_MAX_KEY];
+	struct goodix_ts_coords coords[GOODIX_MAX_TOUCH];
+};
+
+struct goodix_pen_data {
+	bool is_hover;
+	struct goodix_pen_coords coords;
+	struct goodix_ts_key keys[GOODIX_MAX_PEN_KEY];
+};
+
+/*
+ * struct goodix_ts_event - touch event struct
+ * @event_type: touch event type, touch data or
+ *	request event
+ * @event_data: event data
+ */
+struct goodix_ts_event {
+	u8 event_type;
+	u8 fp_flag;	 /* finger print DOWN flag */
+	u8 request_code; /* represent the request type */
+	u8 gesture_type;
+	u8 gesture_data[GOODIX_GESTURE_DATA_LEN];
+	struct goodix_touch_data touch_data;
+	struct goodix_pen_data pen_data;
+};
+
+enum goodix_ic_bus_type {
+	GOODIX_BUS_TYPE_NONE,
+	GOODIX_BUS_TYPE_I2C,
+	GOODIX_BUS_TYPE_SPI,
+	GOODIX_BUS_TYPE_I3C,
+};
+
+struct goodix_bus_interface {
+	int bus_type;
+	int ic_type;
+	int sub_ic_type;
+	struct device *dev;
+	int (*read)(struct device *dev, unsigned int addr,
+			 unsigned char *data, unsigned int len);
+	int (*write)(struct device *dev, unsigned int addr,
+			unsigned char *data, unsigned int len);
+};
+
+struct goodix_ts_hw_ops {
+	int (*power_on)(struct goodix_ts_core *cd, bool on);
+	int (*resume)(struct goodix_ts_core *cd);
+	int (*suspend)(struct goodix_ts_core *cd);
+	int (*gesture)(struct goodix_ts_core *cd, int gesture_type);
+	int (*reset)(struct goodix_ts_core *cd, int delay_ms);
+	int (*irq_enable)(struct goodix_ts_core *cd, bool enable);
+	int (*read)(struct goodix_ts_core *cd, unsigned int addr,
+		    unsigned char *data, unsigned int len);
+	int (*write)(struct goodix_ts_core *cd, unsigned int addr,
+		     unsigned char *data, unsigned int len);
+	int (*read_flash)(struct goodix_ts_core *cd, unsigned int addr,
+		     unsigned char *data, unsigned int len);
+	int (*send_cmd)(struct goodix_ts_core *cd,
+			struct goodix_ts_cmd *cmd);
+	int (*send_config)(struct goodix_ts_core *cd,
+			u8 *config, int len);
+	int (*read_config)(struct goodix_ts_core *cd,
+			u8 *config_data, int size);
+	int (*read_version)(struct goodix_ts_core *cd,
+			struct goodix_fw_version *version);
+	int (*get_ic_info)(struct goodix_ts_core *cd,
+			struct goodix_ic_info *ic_info);
+	int (*esd_check)(struct goodix_ts_core *cd);
+	int (*event_handler)(struct goodix_ts_core *cd,
+			struct goodix_ts_event *ts_event);
+	int (*after_event_handler)(struct goodix_ts_core *cd);
+	int (*get_capacitance_data)(struct goodix_ts_core *cd,
+			struct ts_rawdata_info *info);
+};
+
+/*
+ * struct goodix_ts_esd - esd protector structure
+ * @esd_work: esd delayed work
+ * @esd_on: 1 - turn on esd protection, 0 - turn
+ *  off esd protection
+ */
+struct goodix_ts_esd {
+	bool irq_status;
+	atomic_t esd_on;
+	struct delayed_work esd_work;
+	struct notifier_block esd_notifier;
+	struct goodix_ts_core *ts_core;
+};
+
+enum goodix_core_init_stage {
+	CORE_UNINIT,
+	CORE_INIT_FAIL,
+	CORE_INIT_STAGE1,
+	CORE_INIT_STAGE2
+};
+
+struct goodix_ic_config {
+	int len;
+	u8 data[GOODIX_CFG_MAX_SIZE];
+};
+
+struct goodix_ts_core {
+	int init_stage;
+	struct platform_device *pdev;
+	struct goodix_fw_version fw_version;
+	struct goodix_ic_info ic_info;
+	struct goodix_ic_info_v2 ic_info_v2;
+	struct goodix_bus_interface *bus;
+	struct goodix_ts_board_data board_data;
+	struct goodix_ts_hw_ops *hw_ops;
+	struct input_dev *input_dev;
+	struct input_dev *pen_dev;
+	/* TODO counld we remove this from core data? */
+	struct goodix_ts_event ts_event;
+
+	struct work_struct self_check_work;
+
+	/* every pointer of this array represent a kind of config */
+	struct goodix_ic_config *ic_configs[GOODIX_MAX_CONFIG_GROUP];
+	struct regulator *avdd;
+	struct regulator *iovdd;
+	u32 gesture_type;
+
+	int power_on;
+	int irq;
+	size_t irq_trig_cnt;
+
+	atomic_t irq_enabled;
+	atomic_t suspended;
+	/* when this flag is true, driver should not clean the sync flag */
+	bool tools_ctrl_sync;
+
+	struct notifier_block ts_notifier;
+	struct goodix_ts_esd ts_esd;
+
+#if (IS_ENABLED(CONFIG_FB) || IS_ENABLED(CONFIG_DRM))
+	struct notifier_block fb_notifier;
+#endif
+};
+
+/* external module structures */
+enum goodix_ext_priority {
+	EXTMOD_PRIO_RESERVED = 0,
+	EXTMOD_PRIO_FWUPDATE,
+	EXTMOD_PRIO_GESTURE,
+	EXTMOD_PRIO_HOTKNOT,
+	EXTMOD_PRIO_DBGTOOL,
+	EXTMOD_PRIO_DEFAULT,
+};
+
+#define EVT_HANDLED				0
+#define EVT_CONTINUE			0
+#define EVT_CANCEL				1
+#define EVT_CANCEL_IRQEVT		1
+#define EVT_CANCEL_SUSPEND		1
+#define EVT_CANCEL_RESUME		1
+#define EVT_CANCEL_RESET		1
+
+struct goodix_ext_module;
+/* external module's operations callback */
+struct goodix_ext_module_funcs {
+	int (*init)(struct goodix_ts_core *core_data,
+		    struct goodix_ext_module *module);
+	int (*exit)(struct goodix_ts_core *core_data,
+		    struct goodix_ext_module *module);
+	int (*before_reset)(struct goodix_ts_core *core_data,
+			    struct goodix_ext_module *module);
+	int (*after_reset)(struct goodix_ts_core *core_data,
+			   struct goodix_ext_module *module);
+	int (*before_suspend)(struct goodix_ts_core *core_data,
+			      struct goodix_ext_module *module);
+	int (*after_suspend)(struct goodix_ts_core *core_data,
+			     struct goodix_ext_module *module);
+	int (*before_resume)(struct goodix_ts_core *core_data,
+			     struct goodix_ext_module *module);
+	int (*after_resume)(struct goodix_ts_core *core_data,
+			    struct goodix_ext_module *module);
+	int (*irq_event)(struct goodix_ts_core *core_data,
+			 struct goodix_ext_module *module);
+};
+
+/*
+ * struct goodix_ext_module - external module struct
+ * @list: list used to link into modules manager
+ * @name: name of external module
+ * @priority: module priority vlaue, zero is invalid
+ * @funcs: operations callback
+ * @priv_data: private data region
+ * @kobj: kobject
+ * @work: used to queue one work to do registration
+ */
+struct goodix_ext_module {
+	struct list_head list;
+	char *name;
+	enum goodix_ext_priority priority;
+	const struct goodix_ext_module_funcs *funcs;
+	void *priv_data;
+	struct kobject kobj;
+	struct work_struct work;
+};
+
+/*
+ * struct goodix_ext_attribute - exteranl attribute struct
+ * @attr: attribute
+ * @show: show interface of external attribute
+ * @store: store interface of external attribute
+ */
+struct goodix_ext_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct goodix_ext_module *module, char *buf);
+	ssize_t (*store)(struct goodix_ext_module *module,
+			const char *buf, size_t len);
+};
+
+/* external attrs helper macro */
+#define __EXTMOD_ATTR(_name, _mode, _show, _store)	{	\
+	.attr = {.name = __stringify(_name), .mode = _mode },	\
+	.show   = _show,	\
+	.store  = _store,	\
+}
+
+/* external attrs helper macro, used to define external attrs */
+#define DEFINE_EXTMOD_ATTR(_name, _mode, _show, _store)	\
+static struct goodix_ext_attribute ext_attr_##_name = \
+	__EXTMOD_ATTR(_name, _mode, _show, _store)
+
+/* log macro */
+extern bool debug_log_flag;
+#define ts_info(fmt, arg...) \
+		pr_info("[GTP-INF][%s] "fmt"\n", __func__, ##arg)
+#define	ts_err(fmt, arg...) \
+		pr_err("[GTP-ERR][%s] "fmt"\n", __func__, ##arg)
+#define ts_debug(fmt, arg...) \
+		{if (debug_log_flag) \
+		pr_info("[GTP-DBG][%s] "fmt"\n", __func__, ##arg);}
+
+/*
+ * get board data pointer
+ */
+static inline struct goodix_ts_board_data *board_data(
+		struct goodix_ts_core *core)
+{
+	if (!core)
+		return NULL;
+	return &(core->board_data);
+}
+
+/**
+ * goodix_register_ext_module - interface for external module
+ * to register into touch core modules structure
+ *
+ * @module: pointer to external module to be register
+ * return: 0 ok, <0 failed
+ */
+int goodix_register_ext_module(struct goodix_ext_module *module);
+/* register module no wait */
+int goodix_register_ext_module_no_wait(struct goodix_ext_module *module);
+/**
+ * goodix_unregister_ext_module - interface for external module
+ * to unregister external modules
+ *
+ * @module: pointer to external module
+ * return: 0 ok, <0 failed
+ */
+int goodix_unregister_ext_module(struct goodix_ext_module *module);
+/* remove all registered ext module
+ * return 0 on success, otherwise return < 0
+ */
+int goodix_ts_blocking_notify(enum ts_notify_event evt, void *v);
+struct kobj_type *goodix_get_default_ktype(void);
+struct kobject *goodix_get_default_kobj(void);
+
+struct goodix_ts_hw_ops *goodix_get_hw_ops(void);
+int goodix_get_config_proc(struct goodix_ts_core *cd);
+
+int goodix_spi_bus_init(void);
+void goodix_spi_bus_exit(void);
+int goodix_i2c_bus_init(void);
+void goodix_i2c_bus_exit(void);
+
+u32 goodix_append_checksum(u8 *data, int len, int mode);
+int checksum_cmp(const u8 *data, int size, int mode);
+int is_risk_data(const u8 *data, int size);
+u32 goodix_get_file_config_id(u8 *ic_config);
+void goodix_rotate_abcd2cbad(int tx, int rx, s16 *data);
+void print_ic_info(struct goodix_ic_info *ic_info);
+
+int goodix_fw_update_init(struct goodix_ts_core *core_data);
+void goodix_fw_update_uninit(void);
+int goodix_do_fw_update(struct goodix_ic_config *ic_config, int mode);
+
+int goodix_get_ic_type(struct device_node *node, struct goodix_bus_interface *bus_inf);
+int gesture_module_init(void);
+void gesture_module_exit(void);
+int inspect_module_init(struct goodix_ts_core *core_data);
+void inspect_module_exit(void);
+int goodix_tools_init(void);
+void goodix_tools_exit(void);
+
+#endif
diff --git a/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_gesture.c b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_gesture.c
new file mode 100644
index 00000000000000..77fcbd70f382f4
--- /dev/null
+++ b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_gesture.c
@@ -0,0 +1,454 @@
+/*
+ * Goodix Gesture Module
+ *
+ * Copyright (C) 2019 - 2020 Goodix, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be a reference
+ * to you, when you are integrating the GOODiX's CTP IC into your system,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/input.h>
+#include <linux/platform_device.h>
+#include <linux/version.h>
+#include <linux/delay.h>
+#include <linux/atomic.h>
+#include <linux/input/mt.h>
+#include "goodix_ts_core.h"
+
+
+#define GOODIX_GESTURE_DOUBLE_TAP		0xCC
+#define GOODIX_GESTURE_SINGLE_TAP		0x4C
+#define GOODIX_GESTURE_FOD_DOWN			0x46
+#define GOODIX_GESTURE_FOD_UP			0x55
+
+/*
+ * struct gesture_module - gesture module data
+ * @registered: module register state
+ * @sysfs_node_created: sysfs node state
+ * @gesture_type: valid gesture type, each bit represent one gesture type
+ * @gesture_data: store latest gesture code get from irq event
+ * @gesture_ts_cmd: gesture command data
+ */
+struct gesture_module {
+	atomic_t registered;
+	struct goodix_ts_core *ts_core;
+	struct goodix_ext_module module;
+};
+
+static struct gesture_module *gsx_gesture; /*allocated in gesture init module*/
+static bool module_initialized;
+
+static ssize_t gsx_double_type_show(struct goodix_ext_module *module,
+		char *buf)
+{
+	struct gesture_module *gsx = module->priv_data;
+	u32 type;
+
+	if (!gsx)
+		return -EIO;
+
+	type = gsx->ts_core->gesture_type;
+
+	if (atomic_read(&gsx->registered) == 0) {
+		ts_err("gesture module is not registered");
+		return 0;
+	}
+
+	return sprintf(buf, "%s\n",
+			(type & GESTURE_DOUBLE_TAP) ? "enable" : "disable");
+}
+
+static ssize_t gsx_double_type_store(struct goodix_ext_module *module,
+		const char *buf, size_t count)
+{
+	struct gesture_module *gsx = module->priv_data;
+
+	if (!gsx)
+		return -EIO;
+
+	if (atomic_read(&gsx->registered) == 0) {
+		ts_err("gesture module is not registered");
+		return 0;
+	}
+
+	if (buf[0] == '1') {
+		ts_info("enable double tap");
+		gsx->ts_core->gesture_type |= GESTURE_DOUBLE_TAP;
+	} else if (buf[0] == '0') {
+		ts_info("disable double tap");
+		gsx->ts_core->gesture_type &= ~GESTURE_DOUBLE_TAP;
+	} else
+		ts_err("invalid cmd[%d]", buf[0]);
+
+	return count;
+}
+
+static ssize_t gsx_single_type_show(struct goodix_ext_module *module,
+		char *buf)
+{
+	struct gesture_module *gsx = module->priv_data;
+	u32 type;
+
+	if (!gsx)
+		return -EIO;
+
+	type = gsx->ts_core->gesture_type;
+
+	if (atomic_read(&gsx->registered) == 0) {
+		ts_err("gesture module is not registered");
+		return 0;
+	}
+
+	return sprintf(buf, "%s\n",
+			(type & GESTURE_SINGLE_TAP) ? "enable" : "disable");
+}
+
+static ssize_t gsx_single_type_store(struct goodix_ext_module *module,
+		const char *buf, size_t count)
+{
+	struct gesture_module *gsx = module->priv_data;
+
+	if (!gsx)
+		return -EIO;
+
+	if (atomic_read(&gsx->registered) == 0) {
+		ts_err("gesture module is not registered");
+		return 0;
+	}
+
+	if (buf[0] == '1') {
+		ts_info("enable single tap");
+		gsx->ts_core->gesture_type |= GESTURE_SINGLE_TAP;
+	} else if (buf[0] == '0') {
+		ts_info("disable single tap");
+		gsx->ts_core->gesture_type &= ~GESTURE_SINGLE_TAP;
+	} else
+		ts_err("invalid cmd[%d]", buf[0]);
+
+	return count;
+}
+
+static ssize_t gsx_fod_type_show(struct goodix_ext_module *module,
+		char *buf)
+{
+	struct gesture_module *gsx = module->priv_data;
+	u32 type;
+
+	if (!gsx)
+		return -EIO;
+
+	type = gsx->ts_core->gesture_type;
+
+	if (atomic_read(&gsx->registered) == 0) {
+		ts_err("gesture module is not registered");
+		return 0;
+	}
+
+	return sprintf(buf, "%s\n",
+			(type & GESTURE_FOD_PRESS) ? "enable" : "disable");
+}
+
+static ssize_t gsx_fod_type_store(struct goodix_ext_module *module,
+		const char *buf, size_t count)
+{
+	struct gesture_module *gsx = module->priv_data;
+
+	if (!gsx)
+		return -EIO;
+
+	if (atomic_read(&gsx->registered) == 0) {
+		ts_err("gesture module is not registered");
+		return 0;
+	}
+
+	if (buf[0] == '1') {
+		ts_info("enable fod");
+		gsx->ts_core->gesture_type |= GESTURE_FOD_PRESS;
+	} else if (buf[0] == '0') {
+		ts_info("disable fod");
+		gsx->ts_core->gesture_type &= ~GESTURE_FOD_PRESS;
+	} else
+		ts_err("invalid cmd[%d]", buf[0]);
+
+	return count;
+}
+
+
+const struct goodix_ext_attribute gesture_attrs[] = {
+	__EXTMOD_ATTR(double_en, 0664,
+			gsx_double_type_show, gsx_double_type_store),
+	__EXTMOD_ATTR(single_en, 0664,
+			gsx_single_type_show, gsx_single_type_store),
+	__EXTMOD_ATTR(fod_en, 0664,
+			gsx_fod_type_show, gsx_fod_type_store),
+};
+
+static int gsx_gesture_init(struct goodix_ts_core *cd,
+		struct goodix_ext_module *module)
+{
+	struct gesture_module *gsx = module->priv_data;
+
+	if (!cd || !cd->hw_ops->gesture) {
+		ts_err("gesture unsupported");
+		return -EINVAL;
+	}
+
+	gsx->ts_core = cd;
+	gsx->ts_core->gesture_type = 0;
+	atomic_set(&gsx->registered, 1);
+
+	return 0;
+}
+
+static int gsx_gesture_exit(struct goodix_ts_core *cd,
+		struct goodix_ext_module *module)
+{
+	struct gesture_module *gsx = module->priv_data;
+
+	if (!cd || !cd->hw_ops->gesture) {
+		ts_err("gesture unsupported");
+		return -EINVAL;
+	}
+
+	atomic_set(&gsx->registered, 0);
+
+	return 0;
+}
+
+/**
+ * gsx_gesture_ist - Gesture Irq handle
+ * This functions is excuted when interrupt happended and
+ * ic in doze mode.
+ *
+ * @cd: pointer to touch core data
+ * @module: pointer to goodix_ext_module struct
+ * return: 0 goon execute, EVT_CANCEL_IRQEVT  stop execute
+ */
+static int gsx_gesture_ist(struct goodix_ts_core *cd,
+	struct goodix_ext_module *module)
+{
+	struct goodix_ts_hw_ops *hw_ops = cd->hw_ops;
+	struct goodix_ts_event gs_event = {0};
+	int fodx, fody, overlay_area;
+	int ret;
+
+	if (atomic_read(&cd->suspended) == 0 || cd->gesture_type == 0)
+		return EVT_CONTINUE;
+
+	ret = hw_ops->event_handler(cd, &gs_event);
+	if (ret) {
+		ts_err("failed get gesture data");
+		goto re_send_ges_cmd;
+	}
+
+	if (!(gs_event.event_type & EVENT_GESTURE)) {
+		ts_err("invalid event type: 0x%x",
+			cd->ts_event.event_type);
+		goto re_send_ges_cmd;
+	}
+
+	switch (gs_event.gesture_type) {
+	case GOODIX_GESTURE_SINGLE_TAP:
+		if (cd->gesture_type & GESTURE_SINGLE_TAP) {
+			ts_info("get SINGLE-TAP gesture");
+			input_report_key(cd->input_dev, KEY_WAKEUP, 1);
+			// input_report_key(cd->input_dev, KEY_GOTO, 1);
+			input_sync(cd->input_dev);
+			input_report_key(cd->input_dev, KEY_WAKEUP, 0);
+			// input_report_key(cd->input_dev, KEY_GOTO, 0);
+			input_sync(cd->input_dev);
+		} else {
+			ts_debug("not enable SINGLE-TAP");
+		}
+		break;
+	case GOODIX_GESTURE_DOUBLE_TAP:
+		if (cd->gesture_type & GESTURE_DOUBLE_TAP) {
+			ts_info("get DOUBLE-TAP gesture");
+			input_report_key(cd->input_dev, KEY_WAKEUP, 1);
+			input_sync(cd->input_dev);
+			input_report_key(cd->input_dev, KEY_WAKEUP, 0);
+			input_sync(cd->input_dev);
+		} else {
+			ts_debug("not enable DOUBLE-TAP");
+		}
+		break;
+	case GOODIX_GESTURE_FOD_DOWN:
+		if (cd->gesture_type & GESTURE_FOD_PRESS) {
+			ts_info("get FOD-DOWN gesture");
+			fodx = le16_to_cpup((__le16 *)gs_event.gesture_data);
+			fody = le16_to_cpup((__le16 *)(gs_event.gesture_data + 2));
+			overlay_area = gs_event.gesture_data[4];
+			ts_debug("fodx:%d fody:%d overlay_area:%d", fodx, fody, overlay_area);
+			input_report_key(cd->input_dev, BTN_TOUCH, 1);
+			input_mt_slot(cd->input_dev, 0);
+			input_mt_report_slot_state(cd->input_dev, MT_TOOL_FINGER, 1);
+			input_report_abs(cd->input_dev, ABS_MT_POSITION_X, fodx);
+			input_report_abs(cd->input_dev, ABS_MT_POSITION_Y, fody);
+			input_report_abs(cd->input_dev, ABS_MT_WIDTH_MAJOR, overlay_area);
+			input_sync(cd->input_dev);
+		} else {
+			ts_debug("not enable FOD-DOWN");
+		}
+		break;
+	case GOODIX_GESTURE_FOD_UP:
+		if (cd->gesture_type & GESTURE_FOD_PRESS) {
+			ts_info("get FOD-UP gesture");
+			// fodx = le16_to_cpup((__le16 *)gs_event.gesture_data);
+			// fody = le16_to_cpup((__le16 *)(gs_event.gesture_data + 2));
+			// overlay_area = gs_event.gesture_data[4];
+			input_report_key(cd->input_dev, BTN_TOUCH, 0);
+			input_mt_slot(cd->input_dev, 0);
+			input_mt_report_slot_state(cd->input_dev,
+					MT_TOOL_FINGER, 0);
+			input_sync(cd->input_dev);
+		} else {
+			ts_debug("not enable FOD-UP");
+		}
+		break;
+	default:
+		ts_err("not support gesture type[%02X]", gs_event.gesture_type);
+		break;
+	}
+
+	return EVT_CANCEL_IRQEVT;
+
+re_send_ges_cmd:
+	if (hw_ops->gesture(cd, 0))
+		ts_info("warning: failed re_send gesture cmd");
+	return EVT_CANCEL_IRQEVT;
+}
+
+/**
+ * gsx_gesture_before_suspend - execute gesture suspend routine
+ * This functions is excuted to set ic into doze mode
+ *
+ * @cd: pointer to touch core data
+ * @module: pointer to goodix_ext_module struct
+ * return: 0 goon execute, EVT_IRQCANCLED  stop execute
+ */
+static int gsx_gesture_before_suspend(struct goodix_ts_core *cd,
+	struct goodix_ext_module *module)
+{
+	int ret;
+	const struct goodix_ts_hw_ops *hw_ops = cd->hw_ops;
+
+	if (cd->gesture_type == 0)
+		return EVT_CONTINUE;
+
+	ret = hw_ops->gesture(cd, 0);
+	if (ret)
+		ts_err("failed enter gesture mode");
+	else
+		ts_info("enter gesture mode, type[0x%02X]", cd->gesture_type);
+
+	hw_ops->irq_enable(cd, true);
+	enable_irq_wake(cd->irq);
+
+	return EVT_CANCEL_SUSPEND;
+}
+
+static int gsx_gesture_before_resume(struct goodix_ts_core *cd,
+	struct goodix_ext_module *module)
+{
+	const struct goodix_ts_hw_ops *hw_ops = cd->hw_ops;
+
+	if (cd->gesture_type == 0)
+		return EVT_CONTINUE;
+
+	disable_irq_wake(cd->irq);
+	hw_ops->reset(cd, GOODIX_NORMAL_RESET_DELAY_MS);
+
+	return EVT_CANCEL_RESUME;
+}
+
+static struct goodix_ext_module_funcs gsx_gesture_funcs = {
+	.irq_event = gsx_gesture_ist,
+	.init = gsx_gesture_init,
+	.exit = gsx_gesture_exit,
+	.before_suspend = gsx_gesture_before_suspend,
+	.before_resume = gsx_gesture_before_resume,
+};
+
+int gesture_module_init(void)
+{
+	int ret;
+	int i;
+	struct kobject *def_kobj = goodix_get_default_kobj();
+	struct kobj_type *def_kobj_type = goodix_get_default_ktype();
+
+	gsx_gesture = kzalloc(sizeof(struct gesture_module), GFP_KERNEL);
+	if (!gsx_gesture)
+		return -ENOMEM;
+
+	gsx_gesture->module.funcs = &gsx_gesture_funcs;
+	gsx_gesture->module.priority = EXTMOD_PRIO_GESTURE;
+	gsx_gesture->module.name = "Goodix_gsx_gesture";
+	gsx_gesture->module.priv_data = gsx_gesture;
+
+	atomic_set(&gsx_gesture->registered, 0);
+
+	/* gesture sysfs init */
+	ret = kobject_init_and_add(&gsx_gesture->module.kobj,
+			def_kobj_type, def_kobj, "gesture");
+	if (ret) {
+		ts_err("failed create gesture sysfs node!");
+		goto err_out;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(gesture_attrs) && !ret; i++)
+		ret = sysfs_create_file(&gsx_gesture->module.kobj,
+				&gesture_attrs[i].attr);
+	if (ret) {
+		ts_err("failed create gst sysfs files");
+		while (--i >= 0)
+			sysfs_remove_file(&gsx_gesture->module.kobj,
+					&gesture_attrs[i].attr);
+
+		kobject_put(&gsx_gesture->module.kobj);
+		goto err_out;
+	}
+
+	module_initialized = true;
+	goodix_register_ext_module_no_wait(&gsx_gesture->module);
+	ts_info("gesture module init success");
+
+	return 0;
+
+err_out:
+	ts_err("gesture module init failed!");
+	kfree(gsx_gesture);
+	return ret;
+}
+
+void gesture_module_exit(void)
+{
+	int i;
+
+	ts_info("gesture module exit");
+	if (!module_initialized)
+		return;
+
+	goodix_unregister_ext_module(&gsx_gesture->module);
+
+	/* deinit sysfs */
+	for (i = 0; i < ARRAY_SIZE(gesture_attrs); i++)
+		sysfs_remove_file(&gsx_gesture->module.kobj,
+					&gesture_attrs[i].attr);
+
+	kobject_put(&gsx_gesture->module.kobj);
+	kfree(gsx_gesture);
+	module_initialized = false;
+}
diff --git a/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_inspect.c b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_inspect.c
new file mode 100644
index 00000000000000..67eff2ac4caf72
--- /dev/null
+++ b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_inspect.c
@@ -0,0 +1,2954 @@
+ /*
+  * Goodix Touchscreen Driver
+  * Copyright (C) 2020 - 2021 Goodix, Inc.
+  *
+  * This program is free software; you can redistribute it and/or modify
+  * it under the terms of the GNU General Public License as published by
+  * the Free Software Foundation; either version 2 of the License, or
+  * (at your option) any later version.
+  *
+  * This program is distributed in the hope that it will be a reference
+  * to you, when you are integrating the GOODiX's CTP IC into your system,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * General Public License for more details.
+  *
+  */
+
+#include "goodix_ts_core.h"
+#include <linux/rtc.h>
+#include <linux/timer.h>
+#include <linux/version.h>
+#include <linux/proc_fs.h>
+#include <linux/fs.h>
+#include <asm/uaccess.h>
+
+
+/* test config */
+#define TOTAL_FRAME_NUM 					16 /* rawdata test frames */
+#define NOISEDATA_TEST_TIMES				1  /* noise test frames */
+
+#define GOODIX_TEST_FILE_NAME				"goodix_test_limits"
+#define DEFAULT_SEQ_FILE_SIZE				800 * 1024
+#define MAX_DATA_BUFFER						30000
+#define MAX_SHORT_NUM						15
+#define MAX_LINE_LEN                		(1024 * 3 * 7)
+#define MAX_DRV_NUM							52
+#define MAX_SEN_NUM							75
+
+#define STATISTICS_DATA_LEN					32
+#define MAX_STR_LEN				 			32
+#define MAX_TEST_ITEMS			    		10 /* 0P-1P-2P-3P-5P total test items */
+#define GTP_CAP_TEST						1
+#define GTP_DELTA_TEST						2
+#define GTP_NOISE_TEST						3
+#define GTP_SHORT_TEST						5
+#define GTP_SELFCAP_TEST					6
+#define GTP_SELFNOISE_TEST					7
+
+#define GTP_TEST_PASS						1 
+#define GTP_PANEL_REASON					2
+#define SYS_SOFTWARE_REASON					3
+
+#define CHN_VDD								0xFF
+#define CHN_GND								0x7F
+#define DRV_CHANNEL_FLAG		    		0x80
+
+#define CSV_TP_SPECIAL_RAW_MIN				"special_raw_min"
+#define CSV_TP_SPECIAL_RAW_MAX				"special_raw_max"
+#define CSV_TP_SPECIAL_RAW_DELTA			"special_raw_delta"
+#define CSV_TP_SHORT_THRESHOLD				"shortciurt_threshold"
+#define CSV_TP_SPECIAL_SELFRAW_MAX			"special_selfraw_max"
+#define CSV_TP_SPECIAL_SELFRAW_MIN			"special_selfraw_min"
+#define CSV_TP_NOISE_LIMIT					"noise_data_limit"
+#define CSV_TP_SELFNOISE_LIMIT				"noise_selfdata_limit"
+#define CSV_TP_TEST_CONFIG					"test_config"
+
+#define MAX_TEST_TIME_MS            		15000
+#define DEFAULT_TEST_TIME_MS				7000
+
+/* berlin A */
+#define MAX_DRV_NUM_BRA				    	21
+#define MAX_SEN_NUM_BRA				    	42
+#define SHORT_TEST_TIME_REG_BRA				0x11FF2
+#define DFT_ADC_DUMP_NUM_BRA				1396
+#define DFT_SHORT_THRESHOLD_BRA  			16
+#define DFT_DIFFCODE_SHORT_THRESHOLD_BRA	16
+#define SHORT_TEST_STATUS_REG_BRA			0x10400
+#define SHORT_TEST_RESULT_REG_BRA			0x10410
+#define DRV_DRV_SELFCODE_REG_BRA			0x1045E
+#define SEN_SEN_SELFCODE_REG_BRA 			0x1084E
+#define DRV_SEN_SELFCODE_REG_BRA			0x11712
+#define DIFF_CODE_DATA_REG_BRA				0x11F72
+
+/* berlin B */
+#define MAX_DRV_NUM_BRB				    	52
+#define MAX_SEN_NUM_BRB				    	75
+#define SHORT_TEST_TIME_REG_BRB				0x26AE0
+#define DFT_ADC_DUMP_NUM_BRB				762
+#define DFT_SHORT_THRESHOLD_BRB				100
+#define DFT_DIFFCODE_SHORT_THRESHOLD_BRB	32
+#define SHORT_TEST_STATUS_REG_BRB			0x20400
+#define SHORT_TEST_RESULT_REG_BRB			0x20410
+#define DRV_DRV_SELFCODE_REG_BRB			0x2049A
+#define SEN_SEN_SELFCODE_REG_BRB 			0x21AF2
+#define DRV_SEN_SELFCODE_REG_BRB			0x248A6
+#define DIFF_CODE_DATA_REG_BRB				0x269E0
+
+/* berlinD */
+#define MAX_DRV_NUM_BRD				    	20
+#define MAX_SEN_NUM_BRD				    	40
+#define SHORT_TEST_TIME_REG_BRD				0x14D7A
+#define DFT_ADC_DUMP_NUM_BRD				762
+#define DFT_SHORT_THRESHOLD_BRD				100
+#define DFT_DIFFCODE_SHORT_THRESHOLD_BRD	32
+#define SHORT_TEST_STATUS_REG_BRD			0x13400
+#define SHORT_TEST_RESULT_REG_BRD			0x13408
+#define DRV_DRV_SELFCODE_REG_BRD			0x1344E
+#define SEN_SEN_SELFCODE_REG_BRD 			0x137E6
+#define DRV_SEN_SELFCODE_REG_BRD			0x14556
+#define DIFF_CODE_DATA_REG_BRD				0x14D00
+
+/* nottingham */
+#define MAX_DRV_NUM_NOT				    	17
+#define MAX_SEN_NUM_NOT				    	35
+#define SHORT_TEST_TIME_REG_NOT				0x1479E
+#define SHORT_TEST_STATUS_REG_NOT			0x13400
+#define SHORT_TEST_RESULT_REG_NOT			0x13408
+#define DRV_DRV_SELFCODE_REG_NOT			0x13446
+#define SEN_SEN_SELFCODE_REG_NOT 			0x136EE
+#define DRV_SEN_SELFCODE_REG_NOT			0x14152
+#define DIFF_CODE_DATA_REG_NOT				0x14734
+
+
+#define ABS(val)			((val < 0)? -(val) : val)
+#define MAX(a, b)			((a > b)? a : b)
+
+static bool module_initialized;
+static struct seq_file *g_seq;
+
+/* berlin A drv-sen map */
+static u8 brl_a_drv_map[] = {
+    42, 43, 44, 45, 46, 47, 48, 49,
+    50, 51, 52, 53, 54, 55, 56, 57,
+    58, 59, 60, 61, 62
+};
+
+static u8 brl_a_sen_map[] = {
+    0, 1, 2, 3, 4, 5, 6, 7,
+	8, 9, 10, 11, 12, 13, 14, 15,
+	16, 17, 18, 19, 20, 21, 22, 23,
+	24, 25, 26, 27, 28, 29, 30, 31,
+	32, 33, 34, 35, 36, 37, 38, 39,
+	40, 41
+};
+
+/* berlin B drv-sen map */
+static u8 brl_b_drv_map[] = {
+	75, 76, 77, 78, 79, 80, 81, 82,
+	83, 84, 85, 86, 87, 88, 89, 90,
+	91, 92, 93, 94, 95, 96, 97, 98,
+	99, 100, 101, 102, 103, 104, 105,
+	106, 107, 108, 109, 110, 111, 112,
+	113, 114, 115, 116, 117, 118, 119,
+	120, 121, 122, 123, 124, 125, 126
+};
+
+static u8 brl_b_sen_map[] = {
+	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+	11, 12, 13, 14, 15, 16, 17, 18,
+	19, 20, 21, 22, 23, 24, 25, 26,
+	27, 28, 29, 30, 31, 32, 33, 34,
+	35, 36, 37, 38, 39, 40, 41, 42,
+	43, 44, 45, 46, 47, 48, 49, 50,
+	51, 52, 53, 54, 55, 56, 57, 58,
+	59, 60, 61, 62, 63, 64, 65, 66,
+	67, 68, 69, 70, 71, 72, 73, 74
+};
+
+/* berlin D drv-sen map */
+static u8 brl_d_drv_map[] = {
+	40, 41, 42, 43, 44, 45, 46, 47,
+	48, 49, 50, 51, 52, 53, 54, 55,
+	56, 57, 58, 59,
+};
+
+static u8 brl_d_sen_map[] = {
+    0, 1, 2, 3, 4, 5, 6, 7,
+	8, 9, 10, 11, 12, 13, 14, 15,
+	16, 17, 18, 19, 20, 21, 22, 23,
+	24, 25, 26, 27, 28, 29, 30, 31,
+	32, 33, 34, 35, 36, 37, 38, 39,
+};
+
+/* nottingham drv-sen map */
+static u8 not_drv_map[] = {
+	35, 36, 37, 38, 39, 40, 41, 42, 43,
+	44, 45, 46, 47, 48, 49, 50, 51
+};
+
+static u8 not_sen_map[] = {
+	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+	10, 11, 12, 13, 14, 15, 16, 17, 18,
+	19, 20, 21, 22, 23, 24, 25, 26, 27,
+	28, 29, 30, 31, 32, 33, 34
+};
+
+typedef struct __attribute__((packed)) {
+    u8 result;
+	u8 drv_drv_num;
+	u8 sen_sen_num;
+	u8 drv_sen_num;
+	u8 drv_gnd_avdd_num;
+	u8 sen_gnd_avdd_num;
+	u16 checksum;
+} test_result_t;
+
+struct params_info_t {
+	u32 max_drv_num;
+	u32 max_sen_num;
+	u8 *drv_map;
+	u8 *sen_map;
+	u32 short_test_time_reg;
+	u32 short_test_status_reg;
+	u32 short_test_result_reg;
+	u32 drv_drv_selfcode_reg;
+	u32 sen_sen_selfcode_reg;
+	u32 drv_sen_selfcode_reg;
+	u32 diffcode_data_reg;
+	u16 short_test_dump_num;
+	u16 dft_short_threshold;
+	u16 short_diffcode_threshold;
+};
+
+struct params_info_t params_bra = {
+	MAX_DRV_NUM_BRA,
+	MAX_SEN_NUM_BRA,
+	brl_a_drv_map,
+	brl_a_sen_map,
+	SHORT_TEST_TIME_REG_BRA,
+	SHORT_TEST_STATUS_REG_BRA,
+	SHORT_TEST_RESULT_REG_BRA,
+	DRV_DRV_SELFCODE_REG_BRA,
+	SEN_SEN_SELFCODE_REG_BRA,
+	DRV_SEN_SELFCODE_REG_BRA,
+	DIFF_CODE_DATA_REG_BRA,
+	DFT_ADC_DUMP_NUM_BRA,
+	DFT_SHORT_THRESHOLD_BRA,
+	DFT_DIFFCODE_SHORT_THRESHOLD_BRA,
+};
+
+struct params_info_t params_brb = {
+	MAX_DRV_NUM_BRB,
+	MAX_SEN_NUM_BRB,
+	brl_b_drv_map,
+	brl_b_sen_map,
+	SHORT_TEST_TIME_REG_BRB,
+	SHORT_TEST_STATUS_REG_BRB,
+	SHORT_TEST_RESULT_REG_BRB,
+	DRV_DRV_SELFCODE_REG_BRB,
+	SEN_SEN_SELFCODE_REG_BRB,
+	DRV_SEN_SELFCODE_REG_BRB,
+	DIFF_CODE_DATA_REG_BRB,
+	DFT_ADC_DUMP_NUM_BRB,
+	DFT_SHORT_THRESHOLD_BRB,
+	DFT_DIFFCODE_SHORT_THRESHOLD_BRB,
+};
+
+struct params_info_t params_brd = {
+	MAX_DRV_NUM_BRD,
+	MAX_SEN_NUM_BRD,
+	brl_d_drv_map,
+	brl_d_sen_map,
+	SHORT_TEST_TIME_REG_BRD,
+	SHORT_TEST_STATUS_REG_BRD,
+	SHORT_TEST_RESULT_REG_BRD,
+	DRV_DRV_SELFCODE_REG_BRD,
+	SEN_SEN_SELFCODE_REG_BRD,
+	DRV_SEN_SELFCODE_REG_BRD,
+	DIFF_CODE_DATA_REG_BRD,
+	DFT_ADC_DUMP_NUM_BRD,
+	DFT_SHORT_THRESHOLD_BRD,
+	DFT_DIFFCODE_SHORT_THRESHOLD_BRD,
+};
+
+struct params_info_t params_not = {
+	MAX_DRV_NUM_NOT,
+	MAX_SEN_NUM_NOT,
+	not_drv_map,
+	not_sen_map,
+	SHORT_TEST_TIME_REG_NOT,
+	SHORT_TEST_STATUS_REG_NOT,
+	SHORT_TEST_RESULT_REG_NOT,
+	DRV_DRV_SELFCODE_REG_NOT,
+	SEN_SEN_SELFCODE_REG_NOT,
+	DRV_SEN_SELFCODE_REG_NOT,
+	DIFF_CODE_DATA_REG_NOT,
+	0,
+	0,
+	0,
+};
+
+struct ts_test_params {
+	bool test_items[MAX_TEST_ITEMS];
+
+	u32 rawdata_addr;
+	u32 noisedata_addr;
+	u32 self_rawdata_addr;
+	u32 self_noisedata_addr;
+
+	u32 drv_num;
+	u32 sen_num;
+
+	struct params_info_t *params_info;
+
+	s32 cfg_buf[GOODIX_CFG_MAX_SIZE];
+	s32 max_limits[MAX_DRV_NUM * MAX_SEN_NUM];
+	s32 min_limits[MAX_DRV_NUM * MAX_SEN_NUM];
+	s32 deviation_limits[MAX_DRV_NUM * MAX_SEN_NUM];
+	s32 self_max_limits[MAX_DRV_NUM + MAX_SEN_NUM];
+	s32 self_min_limits[MAX_DRV_NUM + MAX_SEN_NUM];
+	s32 noise_threshold;
+	s32 self_noise_threshold;
+
+	u32 short_threshold;
+	u32 r_drv_drv_threshold;
+	u32 r_drv_sen_threshold;
+	u32 r_sen_sen_threshold;
+	u32 r_drv_gnd_threshold;
+	u32 r_sen_gnd_threshold;
+	u32 avdd_value;
+};
+
+struct ts_test_rawdata {
+	s16 data[MAX_DRV_NUM * MAX_SEN_NUM];
+	u32 size;
+};
+
+struct ts_test_self_rawdata {
+	s16 data[MAX_DRV_NUM + MAX_SEN_NUM];
+	u32 size;
+};
+
+struct ts_short_res {
+	u8 short_num;
+	s16 short_msg[4 * MAX_SHORT_NUM];
+};
+
+struct ts_open_res {
+	u8 beyond_max_limit_cnt[MAX_DRV_NUM * MAX_SEN_NUM];
+	u8 beyond_min_limit_cnt[MAX_DRV_NUM * MAX_SEN_NUM];
+	u8 beyond_accord_limit_cnt[MAX_DRV_NUM * MAX_SEN_NUM];
+};
+
+struct goodix_ts_test {
+	struct goodix_ts_core *ts;
+	struct ts_test_params test_params;
+	struct ts_test_rawdata rawdata[TOTAL_FRAME_NUM];
+	struct ts_test_rawdata accord_arr[TOTAL_FRAME_NUM];
+	struct ts_test_rawdata noisedata[NOISEDATA_TEST_TIMES];
+	struct goodix_ic_config test_config;
+	struct ts_test_self_rawdata self_rawdata;
+	struct ts_test_self_rawdata self_noisedata;
+	struct ts_short_res short_res;
+	struct ts_open_res open_res;
+
+	/*[0][0][0][0][0]..  0 without test; 1 pass, 2 panel failed; 3 software failed */
+	char test_result[MAX_TEST_ITEMS];
+	char test_info[TS_RAWDATA_RESULT_MAX];
+};
+
+static int cal_cha_to_cha_res(struct goodix_ts_test *ts_test, int v1, int v2)
+{
+	if (ts_test->ts->bus->ic_type == IC_TYPE_BERLIN_A)
+		return (v1 - v2) * 63 / v2;
+	else if (ts_test->ts->bus->ic_type == IC_TYPE_BERLIN_B)
+		return (v1 - v2) * 74 / v2 + 20;
+	else if (ts_test->ts->bus->ic_type == IC_TYPE_BERLIN_D)
+		return (v1 / v2 - 1) * 70 + 59;
+	else
+		return (v1 / v2 - 1) * 55 + 45;
+}
+
+static int cal_cha_to_avdd_res(struct goodix_ts_test *ts_test, int v1, int v2)
+{
+	if (ts_test->ts->bus->ic_type == IC_TYPE_BERLIN_A)
+		return 64 * (2 * v2 - 25) * 40 / v1 - 40;
+	else if (ts_test->ts->bus->ic_type == IC_TYPE_BERLIN_B)
+		return 64 * (2 * v2 - 25) * 99 / v1 - 60;
+	else if (ts_test->ts->bus->ic_type == IC_TYPE_BERLIN_D)
+		return 64 * (2 * v2 - 25) * 93 / v1 - 20;
+	else
+		return 64 * (2 * v2 - 25) * 76 / v1 - 15;
+}
+
+static int cal_cha_to_gnd_res(struct goodix_ts_test *ts_test, int v) 
+{
+	if (ts_test->ts->bus->ic_type == IC_TYPE_BERLIN_A)
+		return 64148 / v - 40;
+	else if (ts_test->ts->bus->ic_type == IC_TYPE_BERLIN_B)
+		return 150500 / v - 60;
+	else if (ts_test->ts->bus->ic_type == IC_TYPE_BERLIN_D)
+		return 145000 / v - 15;
+	else
+		return 120000 / v - 16;
+}
+
+static int ts_test_reset(struct goodix_ts_test *ts_test,
+    u32 delay_ms)
+{
+    return ts_test->ts->hw_ops->reset(ts_test->ts, delay_ms);
+}
+
+static int ts_test_read(struct goodix_ts_test *ts_test,
+    u32 addr, u8 *data, u32 len)
+{
+    return ts_test->ts->hw_ops->read(ts_test->ts, addr, data, len);
+}
+
+static int ts_test_write(struct goodix_ts_test *ts_test,
+    u32 addr, u8 *data, u32 len)
+{
+    return ts_test->ts->hw_ops->write(ts_test->ts, addr, data, len);
+}
+
+static int ts_test_send_cmd(struct goodix_ts_test *ts_test,
+    struct goodix_ts_cmd *cmd)
+{
+    return ts_test->ts->hw_ops->send_cmd(ts_test->ts, cmd);
+}
+
+static int ts_test_irq_enable(struct goodix_ts_test *ts_test,
+	bool flag)
+{
+	return ts_test->ts->hw_ops->irq_enable(ts_test->ts, flag);
+}
+
+static int ts_test_send_config(struct goodix_ts_test *ts_test,
+    int type)
+{
+    struct goodix_ic_config *cfg;
+
+	if (type >= GOODIX_MAX_CONFIG_GROUP) {
+		ts_err("unsupproted config type %d", type);
+		return -EINVAL;
+	}
+    cfg = ts_test->ts->ic_configs[type];
+	if (!cfg || cfg->len <= 0) {
+		ts_err("no valid normal config found");
+		return -EINVAL;
+	}
+    
+    return ts_test->ts->hw_ops->send_config(ts_test->ts, cfg->data, cfg->len);
+}
+
+static int ts_test_read_version(struct goodix_ts_test *ts_test,
+    struct goodix_fw_version *version)
+{
+    return ts_test->ts->hw_ops->read_version(ts_test->ts, version);
+}
+
+static void goto_next_line(char **ptr)
+{
+    do {
+        *ptr = *ptr + 1;
+    } while (**ptr != '\n' && **ptr != '\0');
+    if (**ptr == '\0') {
+        return;
+    }
+    *ptr = *ptr + 1;
+}
+
+static void copy_this_line(char *dest, char *src)
+{
+	char *copy_from;
+	char *copy_to;
+
+	copy_from = src;
+	copy_to = dest;
+	do {
+		*copy_to = *copy_from;
+		copy_from++;
+		copy_to++;
+	} while((*copy_from != '\n') && (*copy_from != '\r') && (*copy_from != '\0'));
+	*copy_to = '\0';
+}
+
+static int getrid_space(s8* data, s32 len)
+{
+	u8* buf = NULL;
+	s32 i;
+	u32 count = 0;
+
+	buf = (char*)vzalloc(len + 5);
+	if (buf == NULL){
+		ts_err("get space alloc error");
+		return -ESRCH;
+	}
+
+	for (i = 0; i < len; i++)
+	{
+		if (data[i] == ' ' || data[i] == '\r' || data[i] == '\n')
+		{
+			continue;
+		}
+		buf[count++] = data[i];
+	}
+
+	buf[count++] = '\0';
+
+	memcpy(data, buf, count);
+	vfree(buf);
+
+	return count;
+}
+
+static int parse_valid_data(char *buf_start, loff_t buf_size,
+    char *ptr, s32 *data, s32 rows)
+{
+    int i = 0;
+    int j = 0;
+    char *token = NULL;
+    char *tok_ptr = NULL;
+    char *row_data = NULL;
+	long temp_val;
+
+    if (!ptr) {
+        ts_err("ptr is NULL");
+        return -EINVAL;
+    }
+	if (!data) {
+		ts_err("data is NULL");
+		return -EINVAL;
+	}
+
+    row_data = (char *)vzalloc(MAX_LINE_LEN);
+    if (!row_data) {
+        ts_err("alloc bytes %d failed.", MAX_LINE_LEN);
+        return -ENOMEM;
+    }
+
+    for (i = 0; i < rows; i++) {
+        memset(row_data, 0, MAX_LINE_LEN);
+        copy_this_line(row_data, ptr);
+        getrid_space(row_data, strlen(row_data));
+        tok_ptr = row_data;
+		while ((token = strsep(&tok_ptr,","))) {
+			if (strlen(token) == 0)
+				continue;
+			if (kstrtol(token, 0, &temp_val)) {
+				vfree(row_data);
+				return -EINVAL;
+			}
+			data[j++] = (s32)temp_val;
+		}
+		if (i == rows - 1)
+			break;
+		goto_next_line(&ptr);				//next row
+		if(!ptr || (0 == strlen(ptr)) || (ptr >= (buf_start + buf_size))) {
+			ts_info("invalid ptr, return");
+			vfree(row_data);
+			row_data = NULL;
+			return -EPERM;
+		}        
+    }
+    vfree(row_data);
+    return j;
+}
+
+static int parse_csvfile(char *buf, size_t size, char *target_name,
+        s32 *data, s32 rows, s32 col)
+{
+    int ret = 0;
+    char *ptr = NULL;
+    int read_ret;
+
+    read_ret = size;
+    if (read_ret > 0) {
+        ptr = buf;
+        ptr = strstr(ptr, target_name);
+        if (!ptr) {
+			ts_info("load %s failed 1, maybe not this item", target_name);
+            return -EINTR;
+        }
+
+        goto_next_line(&ptr);
+        if (!ptr || (0 == strlen(ptr))) {
+            ts_err("load %s failed 2!", target_name);
+            return -EIO;
+        }
+        
+        if (data) {
+            ret = parse_valid_data(buf, size, ptr, data, rows);
+        } else {
+            ts_err("load %s failed 3!", target_name);
+            return -EINTR;
+        }
+    } else {
+        ts_err("ret=%d, read_ret=%d", ret, read_ret);
+        ret = -ENXIO;
+    }
+
+    return ret;
+}
+
+
+static void goodix_init_params(struct goodix_ts_test *ts_test)
+{
+	struct goodix_ts_core *ts = ts_test->ts;
+	struct ts_test_params *test_params = &ts_test->test_params;
+
+	test_params->rawdata_addr = ts->ic_info.misc.mutual_rawdata_addr;
+	test_params->noisedata_addr = ts->ic_info.misc.mutual_diffdata_addr;
+	test_params->self_rawdata_addr = ts->ic_info.misc.self_rawdata_addr;
+	test_params->self_noisedata_addr = ts->ic_info.misc.self_diffdata_addr;
+
+	test_params->drv_num = ts->ic_info.parm.drv_num;
+	test_params->sen_num = ts->ic_info.parm.sen_num;
+
+	if (ts_test->ts->bus->ic_type == IC_TYPE_BERLIN_A)
+		test_params->params_info = &params_bra;
+	else if (ts_test->ts->bus->ic_type == IC_TYPE_BERLIN_B)
+		test_params->params_info = &params_brb;
+	else if (ts_test->ts->bus->ic_type == IC_TYPE_BERLIN_D)
+		test_params->params_info = &params_brd;
+	else if (ts_test->ts->bus->ic_type == IC_TYPE_NOTTINGHAM)
+		test_params->params_info = &params_not;
+}
+
+static int goodix_init_testlimits(struct goodix_ts_test *ts_test)
+{
+    int ret;
+	int i;
+    u32 data_buf[10] = {0};
+    char *temp_buf = NULL;
+    struct ts_test_params *test_params = &ts_test->test_params;
+    struct goodix_ts_core *ts_core = ts_test->ts;
+    const struct firmware *firmware = NULL;
+    struct device *dev = &ts_core->pdev->dev;
+    char limit_file[100] = {0};
+    u32 tx = test_params->drv_num;
+    u32 rx = test_params->sen_num;
+
+    sprintf(limit_file, "%s_%d.csv", GOODIX_TEST_FILE_NAME,
+			ts_core->fw_version.sensor_id);
+    ts_info("limit_file_name:%s", limit_file);
+
+    ret = request_firmware(&firmware, limit_file, dev);
+    if (ret < 0) {
+        ts_err("limits file [%s] not available", limit_file);
+        return -EINVAL;
+    }
+    if (firmware->size <= 0) {
+        ts_err("request_firmware, limits param length error,len:%zu",
+            firmware->size);
+        ret = -EINVAL;
+        goto exit_free;
+    }
+    temp_buf = vzalloc(firmware->size + 1);
+    if (!temp_buf) {
+        ts_err("alloc bytes failed.");
+        ret = -ENOMEM;
+        goto exit_free;
+    }
+    memcpy(temp_buf, firmware->data, firmware->size);
+
+	/* obtain config data */
+	ret = parse_csvfile(temp_buf, firmware->size, CSV_TP_TEST_CONFIG,
+		test_params->cfg_buf, 1, GOODIX_CFG_MAX_SIZE);
+	if (ret < 0) {
+		ts_info("Can't find %s", CSV_TP_TEST_CONFIG);
+	} else {
+		ts_info("parse_csvfile %s OK, cfg_len:%d", CSV_TP_TEST_CONFIG, ret);
+		for (i = 0; i < ret; i++)
+			ts_test->test_config.data[i] = (u8)test_params->cfg_buf[i];
+		ts_test->test_config.len = ret;
+	}
+
+    /* obtain mutual_raw min */
+    ret = parse_csvfile(temp_buf, firmware->size, CSV_TP_SPECIAL_RAW_MIN,
+        test_params->min_limits, rx, tx);
+    if (ret < 0) {
+        ts_err("Failed get min_limits");
+        goto exit_free;
+    } else {
+        ts_info("parse_csvfile %s OK", CSV_TP_SPECIAL_RAW_MIN);
+    }
+    /* obtain mutual_raw max */
+    ret = parse_csvfile(temp_buf, firmware->size, CSV_TP_SPECIAL_RAW_MAX,
+        test_params->max_limits, rx, tx);
+    if (ret < 0) {
+        ts_err("Failed get max_limits");
+        goto exit_free;
+    } else {
+        ts_info("parse_csvfile %s OK", CSV_TP_SPECIAL_RAW_MAX);
+    }
+	/* obtain delta limit */
+	ret = parse_csvfile(temp_buf, firmware->size, CSV_TP_SPECIAL_RAW_DELTA,
+		test_params->deviation_limits, rx, tx);
+	if (ret < 0) {
+		ts_err("Failed get delta limit");
+		goto exit_free;
+	} else {
+		ts_info("parse_csvfile %s OK", CSV_TP_SPECIAL_RAW_DELTA);
+	}
+
+    /* obtain self_raw min */
+    ret = parse_csvfile(temp_buf, firmware->size, CSV_TP_SPECIAL_SELFRAW_MIN,
+        test_params->self_min_limits, 1, tx + rx);
+	/* obtain self_raw max */
+	ret |= parse_csvfile(temp_buf, firmware->size, CSV_TP_SPECIAL_SELFRAW_MAX,
+		test_params->self_max_limits, 1, tx + rx);
+    if (ret < 0) {
+        ts_info("Can't find self_min_max_limits, ship this item");
+		ret = 0;
+        test_params->test_items[GTP_SELFCAP_TEST] = false;
+    } else {
+        ts_info("parse_csvfile %s OK", CSV_TP_SPECIAL_SELFRAW_MIN);
+		ts_info("parse_csvfile %s OK", CSV_TP_SPECIAL_SELFRAW_MAX);
+		test_params->test_items[GTP_SELFCAP_TEST] = true;
+    }
+
+    /* obtain noise_threshold */
+    ret = parse_csvfile(temp_buf, firmware->size, CSV_TP_NOISE_LIMIT,
+        &test_params->noise_threshold, 1, 1);
+    if (ret < 0) {
+        ts_info("Can't find noise_threshold, skip this item");
+		ret = 0;
+        test_params->test_items[GTP_NOISE_TEST] = false;
+    } else {
+        ts_info("parse_csvfile %s OK", CSV_TP_NOISE_LIMIT);
+		test_params->test_items[GTP_NOISE_TEST] = true;
+    }
+
+    /* obtain self_noise_threshold */
+    ret = parse_csvfile(temp_buf, firmware->size, CSV_TP_SELFNOISE_LIMIT,
+        &test_params->self_noise_threshold, 1, 1);
+    if (ret < 0) {
+        ts_info("Can't find self_noise_threshold, skip this item");
+		ret = 0;
+        test_params->test_items[GTP_SELFNOISE_TEST] = false;
+    } else {
+        ts_info("parse_csvfile %s OK", CSV_TP_SELFNOISE_LIMIT);
+		test_params->test_items[GTP_SELFNOISE_TEST] = true;
+    }
+
+	 /* obtain short_params */
+    ret = parse_csvfile(temp_buf, firmware->size, CSV_TP_SHORT_THRESHOLD,
+        (s32 *)data_buf, 1, 7);
+    if (ret < 0) {
+		ts_info("Can't find short shortciurt_threshold, skip this item");
+		ret = 0;
+		test_params->test_items[GTP_SHORT_TEST] = false;
+    } else {
+        ts_info("parse_csvfile %s OK", CSV_TP_SHORT_THRESHOLD);
+		test_params->test_items[GTP_SHORT_TEST] = true;
+        test_params->short_threshold = data_buf[0];
+        test_params->r_drv_drv_threshold = data_buf[1];
+        test_params->r_drv_sen_threshold = data_buf[2];
+        test_params->r_sen_sen_threshold = data_buf[3];
+        test_params->r_drv_gnd_threshold = data_buf[4];
+        test_params->r_sen_gnd_threshold = data_buf[5];
+        test_params->avdd_value = data_buf[6];
+    }
+
+exit_free:
+    vfree(temp_buf);
+    if (firmware)
+        release_firmware(firmware);
+    return ret;
+}
+
+static int goodix_tptest_prepare(struct goodix_ts_test *ts_test)
+{
+    int ret;
+	struct goodix_ic_config *cfg = &ts_test->test_config;
+
+	ts_info("TP test prepare IN");
+
+    goodix_init_params(ts_test);
+    /* parse test limits from csv */
+    ret = goodix_init_testlimits(ts_test);
+    if (ret < 0) {
+        ts_err("Failed to init testlimits from csv.");
+        return ret;
+    }
+
+    /* disable irq */
+    ts_test_irq_enable(ts_test, false);
+    /* close esd */
+    goodix_ts_blocking_notify(NOTIFY_ESD_OFF, NULL);
+
+    /* send test config if exist */
+	if (cfg->len > 0) {
+		ts_info("Test config exists and send it");
+		ret = ts_test->ts->hw_ops->send_config(ts_test->ts, cfg->data, cfg->len);
+		if (ret < 0) {
+			ts_err("Send test config failed, exit");
+			goodix_ts_blocking_notify(NOTIFY_ESD_ON, NULL);
+			ts_test_irq_enable(ts_test, true);
+			return ret;
+		}
+	}
+
+    return 0;
+}
+
+static void goodix_tptest_finish(struct goodix_ts_test *ts_test)
+{
+	ts_info("TP test finish IN");
+    /* reset chip */
+    ts_test_reset(ts_test, 100);
+    /* send normal config */
+	if (ts_test->test_config.len > 0) {
+		if (ts_test_send_config(ts_test, CONFIG_TYPE_NORMAL))
+			ts_err("Send normal config failed");
+	}
+
+    /* open esd */
+    goodix_ts_blocking_notify(NOTIFY_ESD_ON, NULL);
+    /* enable irq */
+    ts_test_irq_enable(ts_test, true);
+}
+
+#define SHORT_TEST_RUN_REG			0x10400
+#define SHORT_TEST_RUN_FLAG			0xAA
+#define INSPECT_FW_SWITCH_CMD		0x85
+#define TEST_FW_PID  				"OST"
+static int goodix_short_test_prepare(struct goodix_ts_test *ts_test)
+{
+    struct goodix_ts_cmd tmp_cmd;
+    struct goodix_fw_version fw_ver;
+    int ret;
+    int retry;
+	int resend = 3;
+	u8 status;
+
+	ts_info("short test prepare IN");
+	ts_test->test_result[GTP_SHORT_TEST] = SYS_SOFTWARE_REASON;
+    tmp_cmd.len = 4;
+    tmp_cmd.cmd = INSPECT_FW_SWITCH_CMD;
+
+resend_cmd:
+    ret = ts_test_send_cmd(ts_test, &tmp_cmd);
+    if (ret < 0) {
+        ts_err("send test mode failed");
+        return ret;
+    }
+
+	retry = 3;
+    while (retry--) {
+        msleep(40);
+		if (ts_test->ts->bus->ic_type == IC_TYPE_BERLIN_A) {
+			ret = ts_test_read_version(ts_test, &fw_ver);
+			if (ret < 0) {
+				ts_err("read test version failed");
+				return ret;
+			}
+			ret = memcmp(&(fw_ver.patch_pid[3]), TEST_FW_PID, strlen(TEST_FW_PID));
+			if (ret == 0)
+				return 0;
+			else
+				ts_info("patch ID dismatch %s != %s", fw_ver.patch_pid, TEST_FW_PID);
+		} else {
+			ret = ts_test_read(ts_test, SHORT_TEST_RUN_REG, &status, 1);
+			if (!ret && status == SHORT_TEST_RUN_FLAG)
+				return 0;
+			ts_info("short_mode_status=0x%02x ret=%d", status, ret);
+		}
+    }
+
+	if (resend--) {
+		ts_test_reset(ts_test, 100);
+		goto resend_cmd;
+	}
+
+    return -EINVAL;
+}
+
+static u32 map_die2pin(struct ts_test_params *test_params, u32 chn_num)
+{
+	int i = 0;
+	u32 res = 255;
+
+	if (chn_num & DRV_CHANNEL_FLAG)
+		chn_num = (chn_num & ~DRV_CHANNEL_FLAG) + test_params->params_info->max_sen_num;
+
+	for (i = 0; i < test_params->params_info->max_sen_num; i++) {
+		if (test_params->params_info->sen_map[i] == chn_num) {
+			res = i;
+			break;
+		}
+	}
+	/* res != 255 mean found the corresponding channel num */
+	if (res != 255)
+		return res;
+	/* if cannot find in SenMap try find in DrvMap */
+	for (i = 0; i < test_params->params_info->max_drv_num; i++) {
+		if (test_params->params_info->drv_map[i] == chn_num) {
+			res = i;
+			break;
+		}
+	}
+	if (i >= test_params->params_info->max_drv_num)
+		ts_err("Faild found corrresponding channel num:%d", chn_num);
+	else
+		res |= DRV_CHANNEL_FLAG;
+
+	return res;   
+}
+
+static void goodix_save_short_res(struct ts_test_params *params,
+	u16 chn1, u16 chn2, int r)
+{
+	int i;
+	u8 repeat_cnt = 0;
+	u8 repeat = 0;
+	struct goodix_ts_test *ts_test = container_of(params, 
+		struct goodix_ts_test, test_params);
+	struct ts_short_res *short_res = &ts_test->short_res;
+
+	if (chn1 == chn2 || short_res->short_num >= MAX_SHORT_NUM)
+		return;
+	
+	for (i = 0; i < short_res->short_num; i++) {
+		repeat_cnt = 0;
+		if (short_res->short_msg[4 * i] == chn1)
+			repeat_cnt++;
+		if (short_res->short_msg[4 * i] == chn2)
+			repeat_cnt++;
+		if (short_res->short_msg[4 * i + 1] == chn1)
+			repeat_cnt++;
+		if (short_res->short_msg[4 * i + 1] == chn2)
+			repeat_cnt++;
+		if (repeat_cnt >= 2){
+			repeat = 1;
+			break;
+		}
+	}
+	if (repeat == 0) {
+		short_res->short_msg[4 * short_res->short_num + 0] = chn1;
+		short_res->short_msg[4 * short_res->short_num + 1] = chn2;
+		short_res->short_msg[4 * short_res->short_num + 2] = (r >> 8) & 0xFF;
+		short_res->short_msg[4 * short_res->short_num + 3] = r & 0xFF;
+		if (short_res->short_num < MAX_SHORT_NUM)
+			short_res->short_num++;
+	}
+}
+
+static int gdix_check_tx_tx_shortcircut(struct goodix_ts_test *ts_test,
+        u8 short_ch_num)
+{
+	int ret = 0, err = 0;
+	u32 r_threshold = 0, short_r = 0;
+	int size = 0, i = 0, j = 0;
+	u16 adc_signal = 0;
+	u8 master_pin_num, slave_pin_num;
+	u8 *data_buf;
+	u32 data_reg;
+	struct ts_test_params *test_params = &ts_test->test_params;
+	int max_drv_num = test_params->params_info->max_drv_num;
+	int max_sen_num = test_params->params_info->max_sen_num;
+	u16 self_capdata, short_die_num = 0;
+
+	size = 4 + max_drv_num * 2 + 2;
+	data_buf = kzalloc(size, GFP_KERNEL);
+	if (!data_buf) {
+		ts_err("Failed to alloc memory");
+		return -ENOMEM;
+	}
+    /* drv&drv shortcircut check */
+	data_reg = test_params->params_info->drv_drv_selfcode_reg;
+	for (i = 0; i < short_ch_num; i++) {
+		ret = ts_test_read(ts_test, data_reg, data_buf, size);
+		if (ret < 0) {
+			ts_err("Failed read Drv-to-Drv short rawdata");
+			err = -EINVAL;
+			break;
+		}
+
+		if (checksum_cmp(data_buf, size, CHECKSUM_MODE_U8_LE)) {
+			ts_err("Drv-to-Drv adc data checksum error");
+			err = -EINVAL;
+			break;
+		}
+
+		r_threshold = test_params->r_drv_drv_threshold;
+		short_die_num = le16_to_cpup((__le16 *)&data_buf[0]);
+		short_die_num -= max_sen_num;
+		if (short_die_num >= max_drv_num) {
+			ts_info("invalid short pad num:%d",
+				short_die_num + max_sen_num);
+			continue;
+		}
+
+		/* TODO: j start position need recheck */
+		self_capdata = le16_to_cpup((__le16 *)&data_buf[2]);
+		if (self_capdata == 0xffff || self_capdata == 0) {
+			ts_info("invalid self_capdata:0x%x", self_capdata);
+			continue;
+		}
+
+		for (j = short_die_num + 1; j < max_drv_num; j++) {
+			adc_signal = le16_to_cpup((__le16 *)&data_buf[4 + j * 2]);
+
+			if (adc_signal < test_params->short_threshold)
+				continue;
+
+			short_r = (u32)cal_cha_to_cha_res(ts_test, self_capdata, adc_signal);
+			if (short_r < r_threshold) {
+				master_pin_num =
+					map_die2pin(test_params, short_die_num + max_sen_num);
+				slave_pin_num =
+					map_die2pin(test_params, j + max_sen_num);
+				if (master_pin_num == 0xFF || slave_pin_num == 0xFF) {
+					ts_info("WARNNING invalid pin");
+					continue;
+				}
+				goodix_save_short_res(test_params, master_pin_num,
+					slave_pin_num, short_r);
+				ts_err("short circut:R=%dK,R_Threshold=%dK",
+							short_r, r_threshold);
+				ts_err("%s%d--%s%d shortcircut",
+					(master_pin_num & DRV_CHANNEL_FLAG) ? "DRV" : "SEN",
+					(master_pin_num & ~DRV_CHANNEL_FLAG),
+					(slave_pin_num & DRV_CHANNEL_FLAG) ? "DRV" : "SEN",
+					(slave_pin_num & ~DRV_CHANNEL_FLAG));
+				err = -EINVAL;
+			}
+		}
+		data_reg += size;
+	}
+
+	kfree(data_buf);
+	return err;
+}
+
+static int gdix_check_rx_rx_shortcircut(struct goodix_ts_test *ts_test,
+        u8 short_ch_num)
+{
+	int ret = 0, err = 0;
+	u32 r_threshold = 0, short_r = 0;
+	int size = 0, i = 0, j = 0;
+	u16 adc_signal = 0;
+	u8 master_pin_num, slave_pin_num;
+	u8 *data_buf;
+	u32 data_reg;
+	struct ts_test_params *test_params = &ts_test->test_params;
+	int max_sen_num = test_params->params_info->max_sen_num;
+	u16 self_capdata, short_die_num = 0;
+
+	size = 4 + max_sen_num * 2 + 2;
+	data_buf = kzalloc(size, GFP_KERNEL);
+	if (!data_buf) {
+		ts_err("Failed to alloc memory");
+		return -ENOMEM;
+	}
+	/* drv&drv shortcircut check */
+	data_reg = test_params->params_info->sen_sen_selfcode_reg;
+    for (i = 0; i < short_ch_num; i++) {
+		ret = ts_test_read(ts_test, data_reg, data_buf, size);
+		if (ret) {
+			ts_err("Failed read Sen-to-Sen short rawdata");
+			err = -EINVAL;
+			break;
+		}
+
+		if (checksum_cmp(data_buf, size, CHECKSUM_MODE_U8_LE)) {
+			ts_err("Sen-to-Sen adc data checksum error");				
+			err = -EINVAL;
+			break;		
+		}
+
+		r_threshold = test_params->r_sen_sen_threshold;
+		short_die_num = le16_to_cpup((__le16 *)&data_buf[0]);
+		if (short_die_num >= max_sen_num) {
+			ts_info("invalid short pad num:%d",	short_die_num);
+			continue;
+		}
+
+		/* TODO: j start position need recheck */
+		self_capdata = le16_to_cpup((__le16 *)&data_buf[2]);
+		if (self_capdata == 0xffff || self_capdata == 0) {
+			ts_info("invalid self_capdata:0x%x", self_capdata);
+			continue;
+		}
+
+		for (j = short_die_num + 1; j < max_sen_num; j++) {
+			adc_signal = le16_to_cpup((__le16 *)&data_buf[4 + j * 2]);
+			
+			if (adc_signal < test_params->short_threshold)
+				continue;
+
+			short_r = (u32)cal_cha_to_cha_res(ts_test, self_capdata, adc_signal);
+			if (short_r < r_threshold) {
+				master_pin_num = map_die2pin(test_params, short_die_num);
+				slave_pin_num = map_die2pin(test_params, j);
+				if (master_pin_num == 0xFF || slave_pin_num == 0xFF) {
+					ts_info("WARNNING invalid pin");
+					continue;
+				}
+				goodix_save_short_res(test_params, master_pin_num,
+					slave_pin_num, short_r);
+				ts_err("short circut:R=%dK,R_Threshold=%dK",
+							short_r, r_threshold);
+				ts_err("%s%d--%s%d shortcircut",
+					(master_pin_num & DRV_CHANNEL_FLAG) ? "DRV" : "SEN",
+					(master_pin_num & ~DRV_CHANNEL_FLAG),
+					(slave_pin_num & DRV_CHANNEL_FLAG) ? "DRV" : "SEN",
+					(slave_pin_num & ~DRV_CHANNEL_FLAG));
+				err = -EINVAL;
+			}
+		}
+		data_reg += size;
+	}
+
+	kfree(data_buf);
+	return err;    
+}
+
+static int gdix_check_tx_rx_shortcircut(struct goodix_ts_test *ts_test,
+        u8 short_ch_num)
+{
+	int ret = 0, err = 0;
+	u32 r_threshold = 0, short_r = 0;	
+	int size = 0, i = 0, j = 0;
+	u16 adc_signal = 0;
+	u8 master_pin_num, slave_pin_num;	
+	u8 *data_buf = NULL;
+	u32 data_reg;
+	struct ts_test_params *test_params = &ts_test->test_params;
+	int max_drv_num = test_params->params_info->max_drv_num;
+	int max_sen_num = test_params->params_info->max_sen_num;
+	u16 self_capdata, short_die_num = 0;
+
+	size = 4 + max_drv_num * 2 + 2;
+	data_buf = kzalloc(size, GFP_KERNEL);
+	if (!data_buf) {
+		ts_err("Failed to alloc memory");
+		return -ENOMEM;
+	}
+	/* drv&sen shortcircut check */
+	data_reg = test_params->params_info->drv_sen_selfcode_reg;
+	for (i = 0; i < short_ch_num; i++) {
+		ret = ts_test_read(ts_test, data_reg, data_buf, size);
+		if (ret) {
+			ts_err("Failed read Drv-to-Sen short rawdata");
+			err = -EINVAL;
+			break;
+		}
+
+		if (checksum_cmp(data_buf, size, CHECKSUM_MODE_U8_LE)) {
+			ts_err("Drv-to-Sen adc data checksum error");
+			err = -EINVAL;
+			break;
+		}
+
+		r_threshold = test_params->r_drv_sen_threshold;
+		short_die_num = le16_to_cpup((__le16 *)&data_buf[0]);
+		if (short_die_num >= max_sen_num) {
+			ts_info("invalid short pad num:%d",	short_die_num);
+			continue;
+		}
+
+		/* TODO: j start position need recheck */
+		self_capdata = le16_to_cpup((__le16 *)&data_buf[2]);
+		if (self_capdata == 0xffff || self_capdata == 0) {
+			ts_info("invalid self_capdata:0x%x", self_capdata);
+			continue;
+		}
+
+		for (j = 0; j < max_drv_num; j++) {
+			adc_signal = le16_to_cpup((__le16 *)&data_buf[4 + j * 2]);
+
+			if (adc_signal < test_params->short_threshold)
+				continue;
+
+			short_r = (u32)cal_cha_to_cha_res(ts_test, self_capdata, adc_signal);
+			if (short_r < r_threshold) {
+				master_pin_num = map_die2pin(test_params, short_die_num);
+				slave_pin_num = map_die2pin(test_params, j + max_sen_num);
+				if (master_pin_num == 0xFF || slave_pin_num == 0xFF) {
+					ts_info("WARNNING invalid pin");
+					continue;
+				}
+				goodix_save_short_res(test_params, master_pin_num,
+					slave_pin_num, short_r);
+				ts_err("short circut:R=%dK,R_Threshold=%dK",
+							short_r, r_threshold);
+				ts_err("%s%d--%s%d shortcircut",
+					(master_pin_num & DRV_CHANNEL_FLAG) ? "DRV" : "SEN",
+					(master_pin_num & ~DRV_CHANNEL_FLAG),
+					(slave_pin_num & DRV_CHANNEL_FLAG) ? "DRV" : "SEN",
+					(slave_pin_num & ~DRV_CHANNEL_FLAG));
+				err = -EINVAL;
+			}
+		}
+		data_reg += size;
+	}
+
+	kfree(data_buf);
+	return err;    
+}
+
+static int gdix_check_resistance_to_gnd(struct ts_test_params *test_params,
+        u16 adc_signal, u32 pos)
+{
+	long r = 0;
+	u16 r_th = 0, avdd_value = 0;
+	u16 chn_id_tmp = 0;
+	u8 pin_num = 0;
+	unsigned short short_type;
+	struct goodix_ts_test *ts_test = container_of(test_params,
+		struct goodix_ts_test, test_params);
+	int max_drv_num = test_params->params_info->max_drv_num;
+	int max_sen_num = test_params->params_info->max_sen_num;
+
+	avdd_value = test_params->avdd_value;
+	short_type = adc_signal & 0x8000;
+	adc_signal &= ~0x8000;
+	if (adc_signal == 0)
+		adc_signal = 1;
+
+	if (short_type == 0) {
+		/* short to GND */
+		r = cal_cha_to_gnd_res(ts_test, adc_signal);
+	} else {
+		/* short to VDD */
+		r = cal_cha_to_avdd_res(ts_test, adc_signal, avdd_value);
+	}
+
+	if (pos < max_drv_num)
+		r_th = test_params->r_drv_gnd_threshold;
+	else
+		r_th = test_params->r_sen_gnd_threshold;
+
+	chn_id_tmp = pos;
+	if (chn_id_tmp < max_drv_num)
+		chn_id_tmp += max_sen_num;
+	else
+		chn_id_tmp -= max_drv_num;
+
+	if (r < r_th) {
+		pin_num = map_die2pin(test_params, chn_id_tmp);
+		goodix_save_short_res(test_params, pin_num,
+				short_type ? CHN_VDD : CHN_GND, r);
+		ts_err("%s%d shortcircut to %s,R=%ldK,R_Threshold=%dK",
+				(pin_num & DRV_CHANNEL_FLAG) ? "DRV" : "SEN",
+				(pin_num & ~DRV_CHANNEL_FLAG),
+				short_type ? "VDD" : "GND",
+				r, r_th);
+
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int gdix_check_gndvdd_shortcircut(struct goodix_ts_test *ts_test)
+{
+	int ret = 0, err = 0;
+	int size = 0, i = 0;
+	u16 adc_signal = 0;
+    u32 data_reg;
+	u8 *data_buf = NULL;
+	int max_drv_num = ts_test->test_params.params_info->max_drv_num;
+	int max_sen_num = ts_test->test_params.params_info->max_sen_num;
+
+	size = (max_drv_num + max_sen_num) * 2 + 2;
+	data_buf = kzalloc(size, GFP_KERNEL);
+	if (!data_buf) {
+		ts_err("Failed to alloc memory");
+		return -ENOMEM;
+	}
+	/* read diff code, diff code will be used to calculate
+		* resistance between channel and GND */
+	data_reg = ts_test->test_params.params_info->diffcode_data_reg;
+	ret = ts_test_read(ts_test, data_reg, data_buf, size);
+	if (ret < 0) {
+		ts_err("Failed read to-gnd rawdata");
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	if (checksum_cmp(data_buf, size, CHECKSUM_MODE_U8_LE)) {
+		ts_err("diff code checksum error");
+		err = -EINVAL;
+		goto err_out;		
+	}
+
+	for (i = 0; i < max_drv_num + max_sen_num; i++) {
+		adc_signal = le16_to_cpup((__le16 *)&data_buf[i * 2]);
+		ret = gdix_check_resistance_to_gnd(&ts_test->test_params,
+					adc_signal, i);
+		if (ret != 0) {
+			ts_err("Resistance to-gnd/vdd short");
+			err = ret;
+		}
+	}
+
+err_out:
+	kfree(data_buf);
+	return err;
+}
+
+static int goodix_shortcircut_analysis(struct goodix_ts_test *ts_test)
+{
+    int ret;
+    int err = 0;
+    test_result_t test_result;
+
+    ret = ts_test_read(ts_test, ts_test->test_params.params_info->short_test_result_reg,
+        (u8 *)&test_result, sizeof(test_result));
+    if (ret < 0) {
+        ts_err("Read TEST_RESULT_REG failed");
+        return ret;
+    }
+
+	if (checksum_cmp((u8 *)&test_result, sizeof(test_result),
+		CHECKSUM_MODE_U8_LE)) {
+		ts_err("shrot result checksum err");
+		return -EINVAL;
+	}
+
+	if (!(test_result.result & 0x0F)) {
+		ts_info(">>>>> No shortcircut");
+		return 0;
+	}
+	ts_info("short flag 0x%02x, drv&drv:%d, sen&sen:%d, drv&sen:%d, drv/GNDVDD:%d, sen/GNDVDD:%d",
+		test_result.result, test_result.drv_drv_num, test_result.sen_sen_num,
+		test_result.drv_sen_num, test_result.drv_gnd_avdd_num, test_result.sen_gnd_avdd_num);
+
+	if (test_result.drv_drv_num)
+		err |= gdix_check_tx_tx_shortcircut(ts_test, test_result.drv_drv_num);
+	if (test_result.sen_sen_num)
+		err |= gdix_check_rx_rx_shortcircut(ts_test, test_result.sen_sen_num);
+	if (test_result.drv_sen_num)
+		err |= gdix_check_tx_rx_shortcircut(ts_test, test_result.drv_sen_num);
+	if (test_result.drv_gnd_avdd_num || test_result.sen_gnd_avdd_num)
+		err |= gdix_check_gndvdd_shortcircut(ts_test);
+
+    ts_info(">>>>> short check return 0x%x", err);
+
+    return err;
+}
+
+#define SHORT_FW_CMD_REG				0x10400
+static int send_test_cmd(struct goodix_ts_test *ts_test,
+    struct goodix_ts_cmd *cmd)
+{
+    int ret;
+    u32 reg = SHORT_FW_CMD_REG;
+    cmd->state = 0;
+    cmd->ack = 0;
+    goodix_append_checksum(&(cmd->buf[2]), cmd->len - 2,
+            CHECKSUM_MODE_U8_LE);
+    ret = ts_test_write(ts_test, reg, cmd->buf, cmd->len + 2);
+    if (ret < 0)
+        return ret;
+    usleep_range(10000, 11000);
+    return ret;
+}
+
+
+#define INSPECT_PARAM_CMD				0xAA
+#define SHORT_TEST_FINISH_FLAG  		0x88
+#define SHORT_TEST_THRESHOLD_REG		0x20402
+static void goodix_shortcircut_test(struct goodix_ts_test *ts_test)
+{
+    int ret = 0;
+    int retry;
+    u16 test_time;
+	u8 status;
+	int ic_type = ts_test->ts->bus->ic_type;
+    struct goodix_ts_cmd test_parm_cmd;
+	// u8 test_param[6];
+
+	ts_info("---------------------- short_test begin ----------------------");
+    ret = goodix_short_test_prepare(ts_test);
+    if (ret < 0) {
+        ts_err("Failed enter short test mode");
+        return;
+    }
+
+	/* get short test time */
+    ret = ts_test_read(ts_test, ts_test->test_params.params_info->short_test_time_reg, (u8 *)&test_time, 2);
+    if (ret < 0) {
+        ts_err("Failed to get test_time, default %dms", DEFAULT_TEST_TIME_MS);
+        test_time = DEFAULT_TEST_TIME_MS;
+    } else {
+		if (ic_type == IC_TYPE_BERLIN_A)
+        	test_time /= 10;
+        if (test_time > MAX_TEST_TIME_MS) {
+            ts_info("test time too long %d > %d",
+                test_time, MAX_TEST_TIME_MS);
+            test_time = MAX_TEST_TIME_MS;
+        }
+        ts_info("get test time %dms", test_time);
+    }
+
+	/* start short test */
+	if (ic_type == IC_TYPE_BERLIN_A) {
+		test_parm_cmd.len = 0x0A;
+		test_parm_cmd.cmd = INSPECT_PARAM_CMD;
+		test_parm_cmd.data[0] = ts_test->test_params.params_info->dft_short_threshold & 0xFF;
+		test_parm_cmd.data[1] = (ts_test->test_params.params_info->dft_short_threshold >> 8) & 0xFF;
+		test_parm_cmd.data[2] = ts_test->test_params.params_info->short_diffcode_threshold & 0xFF;
+		test_parm_cmd.data[3] = (ts_test->test_params.params_info->short_diffcode_threshold >> 8) & 0xFF;
+		test_parm_cmd.data[4] = ts_test->test_params.params_info->short_test_dump_num & 0xFF;
+		test_parm_cmd.data[5] = (ts_test->test_params.params_info->short_test_dump_num >> 8) & 0xFF;
+		ret = send_test_cmd(ts_test, &test_parm_cmd);
+		if (ret < 0) {
+			ts_err("send INSPECT_PARAM_CMD failed");
+			return;
+		}
+	} else {
+		// test_param[0] = ts_test->test_params.params_info->dft_short_threshold & 0xFF;
+		// test_param[1] = (ts_test->test_params.params_info->dft_short_threshold >> 8) & 0xFF;
+		// test_param[2] = ts_test->test_params.params_info->short_diffcode_threshold & 0xFF;
+		// test_param[3] = (ts_test->test_params.params_info->short_diffcode_threshold >> 8) & 0xFF;
+		// test_param[4] = ts_test->test_params.params_info->short_test_dump_num & 0xFF;
+		// test_param[5] = (ts_test->test_params.params_info->short_test_dump_num >> 8) & 0xFF;
+		// ts_test_write(ts_test, SHORT_TEST_THRESHOLD_REG, test_param, sizeof(test_param));
+		status = 0;
+		ts_test_write(ts_test, SHORT_TEST_RUN_REG, &status, 1);
+	}
+
+	/* wait short test finish */
+    msleep(test_time);
+    retry = 50;
+	while (retry--) {
+		ret = ts_test_read(ts_test, ts_test->test_params.params_info->short_test_status_reg, &status, 1);
+		if (!ret && status == SHORT_TEST_FINISH_FLAG)
+			break;
+		msleep(50);
+	}
+	if (retry < 0) {
+		ts_err("short test failed, status:0x%02x", status);
+		return;
+	}
+
+	/* start analysis short result */
+	ts_info("short_test finished, start analysis");
+	ret = goodix_shortcircut_analysis(ts_test);
+	if (ret < 0)
+		ts_test->test_result[GTP_SHORT_TEST] = GTP_PANEL_REASON;
+	else
+		ts_test->test_result[GTP_SHORT_TEST] = GTP_TEST_PASS;
+}
+
+#define GOODIX_CMD_RAWDATA	2
+#define GOODIX_TOUCH_EVENT	0x80
+static int goodix_cap_test_prepare(struct goodix_ts_test *ts_test)
+{
+    int ret;
+    struct goodix_ts_cmd temp_cmd;
+
+	ts_info("cap test prepare IN");
+	ts_test->test_result[GTP_CAP_TEST] = SYS_SOFTWARE_REASON;
+	ts_test->test_result[GTP_DELTA_TEST] = SYS_SOFTWARE_REASON;
+	if (ts_test->test_params.test_items[GTP_SELFCAP_TEST])
+		ts_test->test_result[GTP_SELFCAP_TEST] = SYS_SOFTWARE_REASON;
+	if (ts_test->test_params.test_items[GTP_NOISE_TEST])
+		ts_test->test_result[GTP_NOISE_TEST] = SYS_SOFTWARE_REASON;
+	if (ts_test->test_params.test_items[GTP_SELFNOISE_TEST])
+		ts_test->test_result[GTP_SELFNOISE_TEST] = SYS_SOFTWARE_REASON;
+
+    /* switch rawdata mode */
+	if (ts_test->ts->bus->ic_type == IC_TYPE_BERLIN_D ||
+			ts_test->ts->bus->ic_type == IC_TYPE_NOTTINGHAM) {
+		temp_cmd.cmd = 0x90;
+		temp_cmd.data[0] = 0x81;
+		temp_cmd.len = 5;
+	} else {
+		temp_cmd.cmd = GOODIX_CMD_RAWDATA;
+		temp_cmd.len = 4;
+	}
+	ret = ts_test_send_cmd(ts_test, &temp_cmd);
+    if (ret < 0)
+        ts_err("Enter rawdata mode failed");
+
+    return ret;
+}
+
+static int goodix_cap_test_finish(struct goodix_ts_test *ts_test)
+{
+	ts_info("cap_test finished");
+    /* switch coor mode */
+	ts_test_reset(ts_test, 100);
+	return 0;
+}
+
+static int goodix_cache_rawdata(struct goodix_ts_test *ts_test)
+{
+	int ret;
+	int i;
+	int retry;
+	u8 val;
+	unsigned char frame_buf[GOODIX_MAX_FRAMEDATA_LEN];
+	struct goodix_ts_core *cd = ts_test->ts;
+	unsigned char *cur_ptr;
+	u32 sen_num = ts_test->test_params.sen_num;
+	u32 drv_num = ts_test->test_params.drv_num;
+	u32 data_size = sen_num * drv_num;
+	u32 data_addr = ts_test->test_params.rawdata_addr;
+	u32 flag_addr = ts_test->ts->ic_info.misc.touch_data_addr;
+
+	if (ts_test->ts->bus->ic_type == IC_TYPE_BERLIN_D ||
+			ts_test->ts->bus->ic_type == IC_TYPE_NOTTINGHAM)
+		flag_addr = ts_test->ts->ic_info.misc.frame_data_addr;
+
+	for (i = 0; i < TOTAL_FRAME_NUM; i++) {
+		val = 0;
+		ret = ts_test_write(ts_test, flag_addr, &val, 1);
+		if (ret < 0) {
+			ts_err("clean touch event failed, exit");
+			return -EAGAIN;
+		}
+		retry = 20;
+		while (retry--) {
+			usleep_range(5000, 5100);
+			ret = ts_test_read(ts_test, flag_addr, &val, 1);
+			if (!ret && (val & 0x80))
+				break;
+		}
+		if (retry < 0) {
+			ts_err("rawdata is not ready val:0x%02x i:%d, exit", val, i);
+			return -EAGAIN;
+		}
+
+		if (cd->bus->ic_type == IC_TYPE_BERLIN_D ||
+				cd->bus->ic_type == IC_TYPE_NOTTINGHAM) {
+			ret = ts_test_read(ts_test, flag_addr, frame_buf, sizeof(frame_buf));
+			if (ret < 0)
+				return ret;
+			if (checksum_cmp(frame_buf, cd->ic_info.misc.frame_data_head_len, CHECKSUM_MODE_U8_LE)) {
+				ts_err("frame head checksum error");
+				return -EINVAL;
+			}
+
+			cur_ptr = frame_buf;
+			cur_ptr += cd->ic_info.misc.frame_data_head_len;
+			cur_ptr += cd->ic_info.misc.fw_attr_len;
+			cur_ptr += cd->ic_info.misc.fw_log_len;
+			memcpy((u8 *)ts_test->rawdata[i].data, cur_ptr + 8,
+					data_size * 2);
+		} else {
+			ret = ts_test_read(ts_test, data_addr,
+				(u8 *)ts_test->rawdata[i].data, data_size * sizeof(s16));
+			if (ret < 0)
+				return ret;
+		}
+
+		ts_test->rawdata[i].size = data_size;
+		goodix_rotate_abcd2cbad(drv_num, sen_num, ts_test->rawdata[i].data);
+	}
+
+	return ret;
+}
+
+static void goodix_cache_deltadata(struct goodix_ts_test *ts_test)
+{
+	u32 data_size;
+	int tx = ts_test->test_params.drv_num;
+	int i;
+	int j;
+	int max_val;
+	int raw;
+	int temp;
+
+	for (i = 0; i < TOTAL_FRAME_NUM; i++) {
+		data_size = ts_test->rawdata[i].size;
+		if (data_size == 0)
+			continue;
+		for (j = 0; j < data_size; j++) {
+			raw = ts_test->rawdata[i].data[j];
+			max_val = 0;
+			/* calcu delta with above node */
+			if (j - tx >= 0) {
+				temp = ts_test->rawdata[i].data[j - tx];
+				temp = ABS(temp - raw);
+				max_val = MAX(max_val, temp);
+			}
+			/* calcu delta with bellow node */
+			if (j + tx < data_size) {
+				temp = ts_test->rawdata[i].data[j + tx];
+				temp = ABS(temp - raw);
+				max_val = MAX(max_val, temp);
+			}
+			/* calcu delta with left node */
+			if (j % tx) {
+				temp = ts_test->rawdata[i].data[j - 1];
+				temp = ABS(temp - raw);
+				max_val = MAX(max_val, temp);
+			}
+			/* calcu delta with right node */
+			if ((j + 1) % tx) {
+				temp = ts_test->rawdata[i].data[j + 1];
+				temp = ABS(temp - raw);
+				max_val = MAX(max_val, temp);
+			}
+			temp = max_val * 1000 / raw;
+			ts_test->accord_arr[i].data[j] = (temp > 32767) ? 32767 : temp;
+		}
+		ts_test->accord_arr[i].size = data_size;
+	}
+}
+
+static int goodix_cache_self_rawdata(struct goodix_ts_test *ts_test)
+{
+	int ret;
+	u32 sen_num = ts_test->test_params.sen_num;
+	u32 drv_num = ts_test->test_params.drv_num;
+	u32 data_size = sen_num + drv_num;
+	u32 data_addr = ts_test->test_params.self_rawdata_addr;
+	u32 flag_addr = ts_test->ts->ic_info.misc.frame_data_addr;
+
+	struct goodix_ts_core *cd = ts_test->ts;
+	unsigned char frame_buf[GOODIX_MAX_FRAMEDATA_LEN];
+	unsigned char *cur_ptr;	
+
+	if (cd->bus->ic_type == IC_TYPE_BERLIN_D ||
+			cd->bus->ic_type == IC_TYPE_NOTTINGHAM) {
+		ret = ts_test_read(ts_test, flag_addr, frame_buf, sizeof(frame_buf));
+		if (ret < 0)
+			return ret;
+		if (checksum_cmp(frame_buf, cd->ic_info.misc.frame_data_head_len, CHECKSUM_MODE_U8_LE)) {
+			ts_err("frame head checksum error");
+			return -EINVAL;
+		}
+
+		cur_ptr = frame_buf;
+		cur_ptr += cd->ic_info.misc.frame_data_head_len;
+		cur_ptr += cd->ic_info.misc.fw_attr_len;
+		cur_ptr += cd->ic_info.misc.fw_log_len;
+		cur_ptr += cd->ic_info.misc.mutual_struct_len;
+		memcpy((u8 *)ts_test->self_rawdata.data, cur_ptr + 10,
+				data_size * 2);
+	} else {
+		ret = ts_test_read(ts_test, data_addr,
+			(u8 *)ts_test->self_rawdata.data, data_size * sizeof(s16));
+		if (ret < 0)
+			return ret;
+	}
+	ts_test->self_rawdata.size = data_size;
+
+	return ret;
+}
+
+static int goodix_cache_noisedata(struct goodix_ts_test *ts_test)
+{
+	int ret;
+	int i;
+	int cnt;
+	int retry;
+	u8 val;
+	unsigned char frame_buf[GOODIX_MAX_FRAMEDATA_LEN];
+	unsigned char *cur_ptr;
+	struct goodix_ts_cmd temp_cmd;
+	struct goodix_ts_core *cd = ts_test->ts;		
+	u32 sen_num = ts_test->test_params.sen_num;
+	u32 drv_num = ts_test->test_params.drv_num;
+	u32 data_size = sen_num * drv_num;
+	u32 data_addr = ts_test->test_params.noisedata_addr;
+	u32 flag_addr = ts_test->ts->ic_info.misc.touch_data_addr;
+
+	if (cd->bus->ic_type == IC_TYPE_BERLIN_D ||
+			cd->bus->ic_type == IC_TYPE_NOTTINGHAM) {
+		flag_addr = ts_test->ts->ic_info.misc.frame_data_addr;
+		temp_cmd.cmd = 0x90;
+		temp_cmd.data[0] = 0x82;
+		temp_cmd.len = 5;
+		ret = ts_test_send_cmd(ts_test, &temp_cmd);
+		if (ret < 0) {
+			ts_err("switch diffdata mode failed, exit!");
+			return ret;
+		}
+	}
+
+	for (cnt = 0; cnt < NOISEDATA_TEST_TIMES; cnt++) {
+		val = 0;
+		ret = ts_test_write(ts_test, flag_addr, &val, 1);
+		if (ret < 0) {
+			ts_err("clean touch event failed, exit");
+			return -EAGAIN;
+		}
+		retry = 20;
+		while (retry--) {
+			usleep_range(5000, 5100);
+			ret = ts_test_read(ts_test, flag_addr, &val, 1);
+			if (!ret && (val & 0x80))
+				break;
+		}
+		if (retry < 0) {
+			ts_err("noisedata is not ready val:0x%02x i:%d, exit", val, cnt);
+			return -EAGAIN;
+		}
+
+		if (cd->bus->ic_type == IC_TYPE_BERLIN_D ||
+				cd->bus->ic_type == IC_TYPE_NOTTINGHAM) {
+			ret = ts_test_read(ts_test, flag_addr, frame_buf, sizeof(frame_buf));
+			if (ret < 0)
+				return ret;
+			if (checksum_cmp(frame_buf, cd->ic_info.misc.frame_data_head_len, CHECKSUM_MODE_U8_LE)) {
+				ts_err("frame head checksum error");
+				return -EINVAL; 
+			}
+
+			cur_ptr = frame_buf;
+			cur_ptr += cd->ic_info.misc.frame_data_head_len;
+			cur_ptr += cd->ic_info.misc.fw_attr_len;
+			cur_ptr += cd->ic_info.misc.fw_log_len;
+			memcpy((u8 *)ts_test->noisedata[cnt].data, cur_ptr + 8,
+					data_size * 2);
+		} else {
+			ret = ts_test_read(ts_test, data_addr,
+				(u8 *)ts_test->noisedata[cnt].data, data_size * sizeof(s16));
+			if (ret < 0)
+				return ret;
+		}
+
+		ts_test->noisedata[cnt].size = data_size;
+		goodix_rotate_abcd2cbad(drv_num, sen_num, ts_test->noisedata[cnt].data);
+		for (i = 0; i < data_size; i++)
+			ts_test->noisedata[cnt].data[i] = ABS(ts_test->noisedata[cnt].data[i]);
+	}
+
+	return ret;
+}
+
+static int goodix_cache_self_noisedata(struct goodix_ts_test *ts_test)
+{
+	int ret;
+	int i;
+	u32 sen_num = ts_test->test_params.sen_num;
+	u32 drv_num = ts_test->test_params.drv_num;
+	u32 data_size = sen_num + drv_num;
+	u32 data_addr = ts_test->test_params.self_noisedata_addr;
+	u32 flag_addr = ts_test->ts->ic_info.misc.frame_data_addr;
+
+	struct goodix_ts_core *cd = ts_test->ts;
+	unsigned char frame_buf[GOODIX_MAX_FRAMEDATA_LEN];
+	unsigned char *cur_ptr;
+
+	if (cd->bus->ic_type == IC_TYPE_BERLIN_D ||
+			cd->bus->ic_type == IC_TYPE_NOTTINGHAM) {
+		ret = ts_test_read(ts_test, flag_addr, frame_buf, sizeof(frame_buf));
+		if (ret < 0)
+			return ret;
+		if (checksum_cmp(frame_buf, cd->ic_info.misc.frame_data_head_len, CHECKSUM_MODE_U8_LE)) {
+			ts_err("frame head checksum error");
+			return -EINVAL;
+		}
+
+		cur_ptr = frame_buf;
+		cur_ptr += cd->ic_info.misc.frame_data_head_len;
+		cur_ptr += cd->ic_info.misc.fw_attr_len;
+		cur_ptr += cd->ic_info.misc.fw_log_len;
+		cur_ptr += cd->ic_info.misc.mutual_struct_len;
+		memcpy((u8 *)ts_test->self_noisedata.data, cur_ptr + 10,
+				data_size * 2);
+	} else {
+		ret = ts_test_read(ts_test, data_addr,
+			(u8 *)ts_test->self_noisedata.data, data_size * sizeof(s16));
+		if (ret < 0)
+			return ret;
+	}
+
+	ts_test->self_noisedata.size = data_size;
+	for (i = 0; i < data_size; i++) {
+		ts_test->self_noisedata.data[i] = ABS(ts_test->self_noisedata.data[i]);
+	}	
+
+	return ret;	
+}
+
+static int goodix_analysis_rawdata(struct goodix_ts_test *ts_test)
+{
+	int i;
+	int j;
+	bool fail_flag = false;
+	int err_cnt = 0;
+	int times = TOTAL_FRAME_NUM;
+	s16 val;
+	u32 data_size = ts_test->rawdata[0].size;
+
+	for (i = 0; i < times; i++) {
+		for (j = 0; j < data_size; j++) {
+			val = ts_test->rawdata[i].data[j];
+			if (val < ts_test->test_params.min_limits[j]) {
+				fail_flag = true;
+				ts_test->open_res.beyond_min_limit_cnt[j]++;
+			}
+			if (val > ts_test->test_params.max_limits[j]) {
+				fail_flag = true;
+				ts_test->open_res.beyond_max_limit_cnt[j]++;
+			}
+		}
+		if (fail_flag)
+			err_cnt++;
+		fail_flag = false;
+	}
+
+	if (err_cnt * 100 > times * 100 * 9 / 10) {
+		ts_err("rawdata more than 90%%(%d) fail, test fail", err_cnt);
+		return -1;
+	}
+	if (err_cnt * 100 > times * 100 * 1 / 10) {
+		ts_info("rawdata more than 10%%(%d) fail, need test again", err_cnt);
+		return 1;
+	}
+
+	ts_info("rawdata less than 10%%(%d) fail, test pass", err_cnt);
+	return 0;
+}
+
+static int goodix_analysis_deltadata(struct goodix_ts_test *ts_test)
+{
+	int i;
+	int j;
+	int ret = 0;
+	s32 val;
+	u32 data_size = ts_test->accord_arr[0].size;
+
+	for (i = 0; i < TOTAL_FRAME_NUM; i++) {
+		for (j = 0; j < data_size; j++) {
+			val = ts_test->accord_arr[i].data[j];
+			if (val > ts_test->test_params.deviation_limits[j]) {
+				ts_test->open_res.beyond_accord_limit_cnt[j]++;
+				ret = -EINVAL;
+			}
+		}
+	}
+
+	return ret;
+}
+
+static int goodix_analysis_self_rawdata(struct goodix_ts_test *ts_test)
+{
+	int i;
+	s16 val;
+	u32 data_size = ts_test->self_rawdata.size;
+
+	for (i = 0; i < data_size; i++) {
+		val = ts_test->self_rawdata.data[i];
+		if (val < ts_test->test_params.self_min_limits[i] ||
+			val > ts_test->test_params.self_max_limits[i]) {
+			ts_err("self_rawdata isn't in range, val:%d threshold:[%d,%d]",
+				val, ts_test->test_params.self_min_limits[i],
+				ts_test->test_params.self_max_limits[i]);
+			return -EINVAL;
+		}
+	}
+
+	return 0;	
+}
+
+static int goodix_analysis_noisedata(struct goodix_ts_test *ts_test)
+{
+	int cnt;
+	int i;
+	bool fail_flag = false;
+	int err_cnt = 0;
+	int times = NOISEDATA_TEST_TIMES;
+	s16 val;
+	u32 data_size = ts_test->noisedata[0].size;
+
+	for (cnt = 0; cnt < times; cnt++) {
+		for (i = 0; i < data_size; i++) {
+			val = ts_test->noisedata[cnt].data[i];
+			if (val > ts_test->test_params.noise_threshold)
+				fail_flag = true;
+		}
+		if (fail_flag)
+			err_cnt++;
+		fail_flag = false;
+	}
+
+	if (err_cnt > 0) {
+		ts_err("noisedata have %d frames out of range", err_cnt);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int goodix_analysis_self_noisedata(struct goodix_ts_test *ts_test)
+{
+	int i;
+	s16 val;
+	u32 data_size = ts_test->self_noisedata.size;
+
+	for (i = 0; i < data_size; i++) {
+		val = ts_test->self_noisedata.data[i];
+		if (val > ts_test->test_params.self_noise_threshold) {
+			ts_err("self noisedata isn't in range, val:%d threshold:[0,%d]",
+				val, ts_test->test_params.self_noise_threshold);
+			return -EINVAL;
+		}
+	}
+
+	return 0;		
+}
+
+static void goodix_capacitance_test(struct goodix_ts_test *ts_test)
+{
+	int ret;
+	int retry = 0;
+
+	ts_info("---------------------- cap_test begin ----------------------");
+    ret = goodix_cap_test_prepare(ts_test);
+    if (ret < 0) {
+		ts_err("cap_test prepare failed, exit");
+		goto exit;
+    }
+	ts_info("cap rawdata prepare OK");
+
+rawdata_test_start:
+    /* obtain rawdata */
+    ret = goodix_cache_rawdata(ts_test);
+    if (ret < 0) {
+		if (ret == -EAGAIN) {
+			ts_err("Capacitance exit");
+			goto exit;
+		} else {
+			ts_err("Failed to read capdata");
+		}
+    } else {
+		ts_info("get rawdata finish, start analysis");
+		ret = goodix_analysis_rawdata(ts_test);
+		if (ret < 0) {
+			ts_test->test_result[GTP_CAP_TEST] = GTP_PANEL_REASON;
+		} else if (ret == 0) {
+			ts_test->test_result[GTP_CAP_TEST] = GTP_TEST_PASS;
+		} else if (retry < 3) {
+			retry++;
+			ts_info("rawdata test retry[%d]", retry);
+			goto rawdata_test_start;
+		} else {
+			ts_test->test_result[GTP_CAP_TEST] = GTP_PANEL_REASON;
+		}
+    }
+
+	/* obtain delta_data */
+	goodix_cache_deltadata(ts_test);
+	ts_info("get deltadata finish, start analysis");
+	ret = goodix_analysis_deltadata(ts_test);
+	if (ret < 0)
+		ts_test->test_result[GTP_DELTA_TEST] = GTP_PANEL_REASON;
+	else
+		ts_test->test_result[GTP_DELTA_TEST] = GTP_TEST_PASS;
+
+    /* obtain self_rawdata */
+	if (ts_test->test_params.test_items[GTP_SELFCAP_TEST]) {
+		ret = goodix_cache_self_rawdata(ts_test);
+		if (ret < 0) {
+			ts_err("Failed to read self_capdata");
+		} else {
+			ts_info("get self_rawdata finish, start analysis");
+			ret = goodix_analysis_self_rawdata(ts_test);
+			if (ret < 0)
+				ts_test->test_result[GTP_SELFCAP_TEST] = GTP_PANEL_REASON;
+			else
+				ts_test->test_result[GTP_SELFCAP_TEST] = GTP_TEST_PASS;
+		}
+	}
+
+    /* obtain noisedata */
+	if (ts_test->test_params.test_items[GTP_NOISE_TEST]) {
+		ret = goodix_cache_noisedata(ts_test);
+		if (ret < 0) {
+			ts_err("Failed to read noisedata");
+		} else {
+			ts_info("get noisedata finish, start analysis");
+			ret = goodix_analysis_noisedata(ts_test);
+			if (ret < 0)
+				ts_test->test_result[GTP_NOISE_TEST] = GTP_PANEL_REASON;
+			else
+				ts_test->test_result[GTP_NOISE_TEST] = GTP_TEST_PASS;
+		}
+	}
+
+    /* obtain self_noisedata */
+	if (ts_test->test_params.test_items[GTP_SELFNOISE_TEST]) {
+		ret = goodix_cache_self_noisedata(ts_test);
+		if (ret < 0) {
+			ts_err("Failed to read self_noisedata");
+		} else {
+			ts_info("get self_noisedata finish, start analysis");
+			ret = goodix_analysis_self_noisedata(ts_test);
+			if (ret < 0)
+				ts_test->test_result[GTP_SELFNOISE_TEST] = GTP_PANEL_REASON;
+			else
+				ts_test->test_result[GTP_SELFNOISE_TEST] = GTP_TEST_PASS;
+		}
+	}
+
+exit:
+	goodix_cap_test_finish(ts_test);
+}
+
+char *goodix_strncat(char *dest, char *src, size_t dest_size)
+{
+	size_t dest_len = 0;
+
+	dest_len = strnlen(dest, dest_size);
+	return strncat(&dest[dest_len], src, dest_size - dest_len - 1);
+}
+
+char *goodix_strncatint(char *dest, int src, char *format, size_t dest_size)
+{
+	char src_str[MAX_STR_LEN] = {0};
+
+	snprintf(src_str, MAX_STR_LEN, format, src);
+	return goodix_strncat(dest, src_str, dest_size);
+}
+
+static void goodix_data_cal(s16 *data, size_t data_size, s16 *stat_result)
+{
+	int i = 0;
+	s16 avg = 0;
+	s16 min = 0;
+	s16 max = 0;
+	long long sum = 0;
+
+	min = data[0];
+	max = data[0];
+	for (i = 0; i < data_size; i++) {
+		sum += data[i];
+		if (max < data[i])
+			max = data[i];
+		if (min > data[i])
+			min = data[i];
+	}
+	avg = div_s64(sum, data_size);
+	stat_result[0] = avg;
+	stat_result[1] = max;
+	stat_result[2] = min;
+}
+
+static void goodix_data_statistics(s16 *data, size_t data_size,
+		char *result, size_t res_size)
+{
+	s16 stat_value[3];
+
+	if (!data || !result) {
+		ts_err("parameters error please check *data and *result value");
+		return;
+	}
+
+	if (data_size <= 0 || res_size <= 0) {
+		ts_err("input parameter is illegva:data_size=%ld, res_size=%ld",
+			data_size, res_size);
+		return;
+	}
+	goodix_data_cal(data, data_size, stat_value);
+
+	memset(result, 0, res_size);
+	snprintf(result, res_size, "[%d,%d,%d]",
+			stat_value[0], stat_value[1], stat_value[2]);
+	return;
+}
+
+static int fs_write(const void* buf, size_t size)
+{
+    return seq_write(g_seq, buf, size);
+}
+
+static int goodix_save_test_config(struct goodix_ts_test *ts_test)
+{
+	int ret = 0;
+	int i;
+	int bytes = 0;
+	char *data;
+	struct goodix_ic_config *cfg = &ts_test->test_config;
+
+	if (cfg->len <= 0) {
+		ts_info("Can't find vaild test config");
+		return 0;
+	}
+
+	data = vzalloc(MAX_DATA_BUFFER);
+	if (!data) {
+		ts_err("alloc memory failed");
+		return -ENOMEM;
+	}
+
+	bytes += sprintf(&data[bytes], "<OrderConfig>\n");
+	for (i = 0; i < cfg->len; i++) {
+		bytes += sprintf(&data[bytes], "0x%02x,", cfg->data[i]);
+	}
+	bytes += sprintf(&data[bytes], "\n");
+	bytes += sprintf(&data[bytes], "</OrderConfig>\n");
+	ret = fs_write(data, bytes);
+	if (ret < 0) {
+		ts_err("test config write failed");
+		goto save_end;
+	}
+
+save_end:
+	vfree(data);
+	return ret;
+}
+
+static int goodix_save_header(struct goodix_ts_test *ts_test)
+{
+	int ret;
+	int i;
+	int bytes = 0;
+	bool result = false;
+	char *data = NULL;
+	struct goodix_ts_core *ts = ts_test->ts;
+
+	data = vzalloc(MAX_DATA_BUFFER);
+	if (!data) {
+		ts_err("alloc memory failed");
+		return -ENOMEM;
+	}
+
+	bytes += sprintf(&data[bytes], "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n");
+	bytes += sprintf(&data[bytes], "<TESTLOG>\n");
+	bytes += sprintf(&data[bytes], "<Header>\n");
+	/* sava test result */
+	for (i = 0; i < MAX_TEST_ITEMS; i++) {
+		if ((ts_test->test_result[i] > 0) &&
+			(ts_test->test_result[i] != GTP_TEST_PASS)) {
+				result = true;
+				break;
+		}
+	}
+	if (result)
+		bytes += sprintf(&data[bytes], "<Result>NG</Result>\n");
+	else
+		bytes += sprintf(&data[bytes], "<Result>OK</Result>\n");
+	bytes += sprintf(&data[bytes], "<DeviceType>GT%s</DeviceType>\n",
+			ts->fw_version.patch_pid);
+	bytes += sprintf(&data[bytes], "<SensorId>%d</SensorId>\n",
+			ts_test->ts->fw_version.sensor_id);
+	ret = fs_write(data, bytes);
+	if (ret < 0) {
+		ts_err("header write failed");
+		goto save_end;
+	}
+	bytes = 0;
+	/* save test config */
+	ret = goodix_save_test_config(ts_test);
+	if (ret < 0)  {
+		ts_err("save test config failed");
+		goto save_end;
+	}
+
+	bytes += sprintf(&data[bytes], "</Header>\n");
+	ret = fs_write(data, bytes);
+	if (ret < 0) {
+		ts_err("header write failed");
+		goto save_end;
+	}
+	bytes = 0;
+
+	/* item list */
+	bytes += sprintf(&data[bytes], "<ItemList>\n");
+	if (ts_test->test_result[GTP_CAP_TEST]) {
+		if (GTP_TEST_PASS == ts_test->test_result[GTP_CAP_TEST])
+			bytes += sprintf(&data[bytes],
+					"<Item name=\"Rawdata MAX/MIN Test\" result=\"OK\"/>\n");
+		else
+			bytes += sprintf(&data[bytes],
+					"<Item name=\"Rawdata MAX/MIN Test\" result=\"NG\"/>\n");
+	}
+
+	if (ts_test->test_result[GTP_DELTA_TEST]) {
+		if (GTP_TEST_PASS == ts_test->test_result[GTP_DELTA_TEST])
+			bytes += sprintf(&data[bytes],
+					"<Item name=\"Rawdata Adjcent Deviation Test\" result=\"OK\"/>\n");
+		else
+			bytes += sprintf(&data[bytes],
+					"<Item name=\"Rawdata Adjcent Deviation Test\" result=\"NG\"/>\n");
+	}
+
+	if (ts_test->test_result[GTP_NOISE_TEST]) {
+		if (GTP_TEST_PASS == ts_test->test_result[GTP_NOISE_TEST])
+			bytes += sprintf(&data[bytes],
+					"<Item name=\"Diffdata Jitter Test\" result=\"OK\"/>\n");
+		else
+			bytes += sprintf(&data[bytes],
+					"<Item name=\"Diffdata Jitter Test\" result=\"NG\"/>\n");
+	}
+
+	if (ts_test->test_result[GTP_SELFNOISE_TEST]) {
+		if (GTP_TEST_PASS == ts_test->test_result[GTP_SELFNOISE_TEST])
+			bytes += sprintf(&data[bytes],
+					"<Item name=\"Self Diffdata Jitter Limit Test\" result=\"OK\"/>\n");
+		else
+			bytes += sprintf(&data[bytes],
+					"<Item name=\"Self Diffdata Jitter Limit Test\" result=\"NG\"/>\n");
+	}
+
+	if (ts_test->test_result[GTP_SELFCAP_TEST]) {
+		if (GTP_TEST_PASS == ts_test->test_result[GTP_SELFCAP_TEST])
+			bytes += sprintf(&data[bytes],
+					"<Item name=\"Self Rawdata Upper Limit Test\" result=\"OK\"/>\n");
+		else
+			bytes += sprintf(&data[bytes],
+					"<Item name=\"Self Rawdata Upper Limit Test\" result=\"NG\"/>\n");
+	}
+
+	if (ts_test->test_result[GTP_SHORT_TEST]) {
+		if (GTP_TEST_PASS == ts_test->test_result[GTP_SHORT_TEST])
+			bytes += sprintf(&data[bytes],
+					"<Item name=\"Short Test\" result=\"OK\"/>\n");
+		else
+			bytes += sprintf(&data[bytes],
+					"<Item name=\"Short Test\" result=\"NG\"/>\n");
+	}
+
+	bytes += sprintf(&data[bytes], "</ItemList>\n");
+	ret = fs_write(data, bytes);
+	if (ret < 0) {
+		ts_err("item list write failed");
+		goto save_end;
+	}
+
+save_end:
+	vfree(data);
+	return ret;
+}
+
+static int goodix_save_limits(struct goodix_ts_test *ts_test)
+{
+	int ret;
+	int i;
+	int bytes = 0;
+	char *data = NULL;
+	int tx = ts_test->test_params.drv_num;
+	int rx = ts_test->test_params.sen_num;
+	int chn1;
+	int chn2;
+	int r;
+
+	data = vzalloc(MAX_DATA_BUFFER);
+	if (!data)
+		return -ENOMEM;
+
+	bytes += sprintf(&data[bytes], "<TestItems>\n");
+
+	/* save short result */
+	if (ts_test->test_result[GTP_SHORT_TEST]) {
+		bytes += sprintf(&data[bytes], "<Item name=\"Short Test\">\n");
+		bytes += sprintf(&data[bytes], "<ShortNum>%d</ShortNum>\n",
+				ts_test->short_res.short_num);
+		for (i = 0; i < ts_test->short_res.short_num; i++) {
+			chn1 = ts_test->short_res.short_msg[4 * i];
+			chn2 = ts_test->short_res.short_msg[4 * i + 1];
+			r = (ts_test->short_res.short_msg[4 * i + 2] << 8) +
+				ts_test->short_res.short_msg[4 * i + 3];
+			if (chn1 == CHN_VDD)
+				bytes += sprintf(&data[bytes], "<ShortMess Chn1=\"VDD\" ");
+			else if (chn1 == CHN_GND)
+				bytes += sprintf(&data[bytes], "<ShortMess Chn1=\"GND\" ");
+			else if (chn1 & DRV_CHANNEL_FLAG)
+				bytes += sprintf(&data[bytes], "<ShortMess Chn1=\"Tx%d\" ",
+						chn1 & 0x7f);
+			else
+				bytes += sprintf(&data[bytes], "<ShortMess Chn1=\"Rx%d\" ",
+						chn1 & 0x7f);
+			if (chn2 == CHN_VDD)
+				bytes += sprintf(&data[bytes],
+						"Chn2=\"VDD\" ShortResistor= \"%dKom\"/>\n", r);
+			else if (chn2 == CHN_GND)
+				bytes += sprintf(&data[bytes],
+						"Chn2=\"GND\" ShortResistor= \"%dKom\"/>\n", r);
+			else if (chn2 & DRV_CHANNEL_FLAG)
+				bytes += sprintf(&data[bytes],
+						"Chn2=\"Tx%d\" ShortResistor= \"%dKom\"/>\n",
+						chn2 & 0x7f, r);
+			else
+				bytes += sprintf(&data[bytes],
+						"Chn2=\"Rx%d\" ShortResistor= \"%dKom\"/>\n",
+						chn2 & 0x7f, r);
+		}
+		bytes += sprintf(&data[bytes], "</Item>\n");
+		ret = fs_write(data, bytes);
+		if (ret < 0) {
+			ts_err("short res write fail.");
+			goto save_end;
+		}
+		bytes = 0;
+	}
+
+	/* rawdata max limit */
+	bytes += sprintf(&data[bytes], "<Item name=\"Rawdata Test Sets\">\n");
+	bytes += sprintf(&data[bytes], "<TotalFrameCnt>%d</TotalFrameCnt>\n",
+			TOTAL_FRAME_NUM);
+	bytes += sprintf(&data[bytes], "<MaxRawLimit>\n");
+	for (i = 0; i < tx * rx; i++) {
+		bytes += sprintf(&data[bytes], "%d,",
+			ts_test->test_params.max_limits[i]);
+		if ((i + 1) % tx == 0)
+			bytes += sprintf(&data[bytes], "\n");
+	}
+	bytes += sprintf(&data[bytes], "</MaxRawLimit>\n");
+	/* BeyondRawdataUpperLimit */
+	bytes += sprintf(&data[bytes], "<BeyondRawdataUpperLimitCnt>\n");
+	for (i = 0; i < tx * rx; i++) {
+		bytes += sprintf(&data[bytes], "%d,",
+				ts_test->open_res.beyond_max_limit_cnt[i]);
+		if ((i + 1) % tx == 0)
+			bytes += sprintf(&data[bytes], "\n");
+	}
+	bytes += sprintf(&data[bytes], "</BeyondRawdataUpperLimitCnt>\n");
+	ret = fs_write(data, bytes);
+	if (ret < 0) {
+		ts_err("rawdata limit write failed");
+		goto save_end;
+	}
+	bytes = 0;
+
+	/* rawdata min limit */
+	bytes += sprintf(&data[bytes], "<MinRawLimit>\n");
+	for (i = 0; i < tx * rx; i++) {
+		bytes += sprintf(&data[bytes], "%d,",
+			ts_test->test_params.min_limits[i]);
+		if ((i + 1) % tx == 0)
+			bytes += sprintf(&data[bytes], "\n");
+	}
+	bytes += sprintf(&data[bytes], "</MinRawLimit>\n");
+	/* BeyondRawdataLower limit */
+	bytes += sprintf(&data[bytes], "<BeyondRawdataLowerLimitCnt>\n");
+	for (i = 0; i < tx * rx; i++) {
+		bytes += sprintf(&data[bytes], "%d,",
+				ts_test->open_res.beyond_min_limit_cnt[i]);
+		if ((i + 1) % tx == 0)
+			bytes += sprintf(&data[bytes], "\n");
+	}
+	bytes += sprintf(&data[bytes], "</BeyondRawdataLowerLimitCnt>\n");
+	ret = fs_write(data, bytes);
+	if (ret < 0) {
+		ts_err("rawdata limit write failed");
+		goto save_end;
+	}
+	bytes = 0;
+
+	/* Max Accord limit */
+	bytes += sprintf(&data[bytes], "<MaxAccordLimit>\n");
+	for (i = 0; i < tx * rx; i++) {
+		bytes += sprintf(&data[bytes], "%d,",
+			ts_test->test_params.deviation_limits[i]);
+		if ((i + 1) % tx == 0)
+			bytes += sprintf(&data[bytes], "\n");
+	}
+	bytes += sprintf(&data[bytes], "</MaxAccordLimit>\n");
+	/* BeyondAccordLimitCnt */
+	bytes += sprintf(&data[bytes], "<BeyondAccordLimitCnt>\n");
+	for (i = 0; i < tx * rx; i++) {
+		bytes += sprintf(&data[bytes], "%d,",
+			ts_test->open_res.beyond_accord_limit_cnt[i]);
+		if ((i + 1) % tx == 0)
+			bytes += sprintf(&data[bytes], "\n");
+	}
+	bytes += sprintf(&data[bytes], "</BeyondAccordLimitCnt>\n");
+	bytes += sprintf(&data[bytes], "</Item>\n");
+	ret = fs_write(data, bytes);
+	if (ret < 0) {
+		ts_err("rawdata limit write failed");
+		goto save_end;
+	}
+	bytes = 0;
+
+	/* save noise limit */
+	if (ts_test->test_result[GTP_NOISE_TEST]) {
+		bytes += sprintf(&data[bytes],
+				"<Item name=\"Diffdata Test Sets\">\n");
+		bytes += sprintf(&data[bytes],
+				"<TotalFrameCnt>%d</TotalFrameCnt>\n",
+				NOISEDATA_TEST_TIMES);
+		bytes += sprintf(&data[bytes],
+				"<MaxJitterLimit>%d</MaxJitterLimit>\n",
+				ts_test->test_params.noise_threshold);
+		bytes += sprintf(&data[bytes], "</Item>\n");
+		ret = fs_write(data, bytes);
+		if (ret < 0) {
+			ts_err("noise limit write failed");
+			goto save_end;
+		}
+		bytes = 0;
+	}
+
+	/* save self rawdata limit */
+	if (ts_test->test_result[GTP_SELFCAP_TEST]) {
+		bytes += sprintf(&data[bytes],
+				"<Item name=\"Self Rawdata Test Sets\">\n");
+		bytes += sprintf(&data[bytes],
+				"<TotalFrameCnt>1</TotalFrameCnt>\n");
+		bytes += sprintf(&data[bytes],
+				"<MaxRawLimit>\n");
+		for (i = 0; i < tx + rx; i++) {
+			bytes += sprintf(&data[bytes], "%d,",
+				ts_test->test_params.self_max_limits[i]);
+			if ((i + 1) % tx == 0)
+				bytes += sprintf(&data[bytes], "\n");
+		}
+		if ((tx + rx) % tx != 0)
+			bytes += sprintf(&data[bytes], "\n");
+		bytes += sprintf(&data[bytes], "</MaxRawLimit>\n");
+		bytes += sprintf(&data[bytes], "<MinRawLimit>\n");
+		for (i = 0; i < tx + rx; i++) {
+			bytes += sprintf(&data[bytes], "%d,",
+				ts_test->test_params.self_min_limits[i]);
+			if ((i + 1) % tx == 0)
+				bytes += sprintf(&data[bytes], "\n");
+		}
+		if ((tx + rx) % tx != 0)
+			bytes += sprintf(&data[bytes], "\n");
+		bytes += sprintf(&data[bytes], "</MinRawLimit>\n");
+		bytes += sprintf(&data[bytes], "</Item>\n");
+		ret = fs_write(data, bytes);
+		if (ret < 0) {
+			ts_err("self rawdata limit write failed");
+			goto save_end;
+		}
+		bytes = 0;
+	}
+
+	/* save selfnoise limit */
+	if (ts_test->test_result[GTP_SELFNOISE_TEST]) {
+		bytes += sprintf(&data[bytes],
+				"<Item name=\"Self Diffdata Test Sets\">\n");
+		bytes += sprintf(&data[bytes],
+				"<TotalFrameCnt>1</TotalFrameCnt>\n");
+		bytes += sprintf(&data[bytes],
+				"<MaxJitterLimit>%d</MaxJitterLimit>\n",
+				ts_test->test_params.self_noise_threshold);
+		bytes += sprintf(&data[bytes], "</Item>\n");
+		ret = fs_write(data, bytes);
+		if (ret < 0) {
+			ts_err("raw limit write failed");
+			goto save_end;
+		}
+		bytes = 0;
+	}
+
+	bytes += sprintf(&data[bytes], "</TestItems>\n");
+	ret = fs_write(data, bytes);
+	if (ret < 0)
+		ts_err("limit write fail.");
+
+save_end:
+	vfree(data);
+	return ret;
+}
+
+static int goodix_save_rawdata(struct goodix_ts_test *ts_test)
+{
+	int i;
+	int j;
+	int ret;
+	int bytes = 0;
+	s16 stat_result[3];
+	char *data = NULL;
+	int tx = ts_test->test_params.drv_num;
+	int rx = ts_test->test_params.sen_num;
+	int len = tx * rx;
+
+	data = vzalloc(MAX_DATA_BUFFER);
+	if (!data)
+		return -ENOMEM;
+
+	bytes += sprintf(&data[bytes], "<RawDataRecord>\n");
+	for (i = 0; i < TOTAL_FRAME_NUM; i++) {
+		goodix_data_cal(ts_test->rawdata[i].data, len, stat_result);
+		bytes += sprintf(&data[bytes],
+			"<DataContent No.=\"%d\" DataCount=\"%d\" Maximum=\"%d\" Minimum=\"%d\" Average=\"%d\">\n",
+			i, len, stat_result[1], stat_result[2], stat_result[0]);
+		for (j = 0; j < len; j++) {
+			bytes += sprintf(&data[bytes], "%d,",
+					ts_test->rawdata[i].data[j]);
+			if ((j + 1) % tx == 0)
+				bytes += sprintf(&data[bytes], "\n");
+		}
+		bytes += sprintf(&data[bytes], "</DataContent>\n");
+		goodix_data_cal(ts_test->accord_arr[i].data, len, stat_result);
+		bytes += sprintf(&data[bytes],
+			"<RawAccord No.=\"%d\" DataCount=\"%d\" Maximum=\"%d\" Minimum=\"%d\" Average=\"%d\">\n",
+			i, len, stat_result[1], stat_result[2], stat_result[0]);
+		for (j = 0; j < len; j++) {
+			bytes += sprintf(&data[bytes], "%d,",
+					ts_test->accord_arr[i].data[j]);
+			if ((j + 1) % tx == 0)
+				bytes += sprintf(&data[bytes], "\n");
+		}
+		bytes += sprintf(&data[bytes], "</RawAccord>\n");
+		ret = fs_write(data, bytes);
+		if (ret < 0) {
+			ts_err("rawdata write fail.");
+			goto save_end;
+		}
+		bytes = 0;
+	}
+
+	bytes += sprintf(&data[bytes], "</RawDataRecord>\n");
+	ret = fs_write(data, bytes);
+	if (ret < 0)
+		ts_err("rawdata write fail.");
+
+save_end:
+	vfree(data);
+	return ret;
+}
+
+static int goodix_save_noise_data(struct goodix_ts_test *ts_test)
+{
+	int i;
+	int j;
+	int ret = 0;
+	int bytes = 0;
+	s16 stat_result[3];
+	char *data = NULL;
+	int tx = ts_test->test_params.drv_num;
+	int rx = ts_test->test_params.sen_num;
+	int len = tx * rx;
+
+	data = vzalloc(MAX_DATA_BUFFER);
+	if (!data)
+		return -ENOMEM;
+
+	bytes += sprintf(&data[bytes], "<DiffDataRecord>\n");
+	for (i = 0; i < NOISEDATA_TEST_TIMES; i++) {
+		goodix_data_cal(ts_test->noisedata[i].data, len, stat_result);
+		bytes += sprintf(&data[bytes],
+			"<DataContent No.=\"%d\" DataCount=\"%d\" Maximum=\"%d\" Minimum=\"%d\" Average=\"%d\">\n",
+			i, len, stat_result[1], stat_result[2], stat_result[0]);
+		for (j = 0; j < len; j++) {
+			bytes += sprintf(&data[bytes], "%d,",
+					ts_test->noisedata[i].data[j]);
+			if ((j + 1) % tx == 0)
+				bytes += sprintf(&data[bytes], "\n");
+		}
+		bytes += sprintf(&data[bytes], "</DataContent>\n");
+		ret = fs_write(data, bytes);
+		if (ret < 0) {
+			ts_err("noisedata write fail.");
+			goto save_end;
+		}
+		bytes = 0;
+	}
+
+	bytes += sprintf(&data[bytes], "</DiffDataRecord>\n");
+	ret = fs_write(data, bytes);
+	if (ret < 0)
+		ts_err("noisedata write fail.");
+
+save_end:
+	vfree(data);
+	return ret;
+}
+
+static int goodix_save_self_data(struct goodix_ts_test *ts_test,
+		s16 *src_data, u8 *title, int len)
+{
+	int i;
+	int ret = 0;
+	s32 bytes = 0;
+	char *data;
+	s16 stat_result[3];
+	int tx = ts_test->test_params.drv_num;
+
+	data = vzalloc(MAX_DATA_BUFFER);
+	if (!data)
+		return -ENOMEM;
+
+	bytes += sprintf(&data[bytes], "<%s>\n", title);
+	ret = fs_write(data, bytes);
+	if (ret < 0) {
+		ts_err("rawdata write fail.");
+		goto save_end;
+	}
+	bytes = 0;
+
+	goodix_data_cal(src_data, len, stat_result);
+	bytes += sprintf(&data[bytes],
+		"<DataContent No.=\"0\" DataCount=\"%d\" Maximum=\"%d\" Minimum=\"%d\" Average=\"%d\">\n",
+		len, stat_result[1], stat_result[2], stat_result[0]);
+	for (i = 0; i < len; i++) {
+		bytes += sprintf(&data[bytes], "%d,", src_data[i]);
+		if ((i + 1) % tx == 0)
+			bytes += sprintf(&data[bytes], "\n");
+	}
+	if (len % tx != 0)
+		bytes += sprintf(&data[bytes], "\n");
+	bytes += sprintf(&data[bytes], "</DataContent>\n");
+	bytes += sprintf(&data[bytes], "</%s>\n", title);
+	ret = fs_write(data, bytes);
+	if (ret < 0)
+		ts_err("rawdata write fail.");
+
+save_end:
+	vfree(data);
+	return ret;
+}
+
+static int goodix_save_data(struct goodix_ts_test *ts_test)
+{
+	int ret;
+	int bytes = 0;
+	char *data = NULL;
+
+	data = vzalloc(MAX_DATA_BUFFER);
+	if (!data)
+		return -ENOMEM;
+
+	bytes += sprintf(&data[bytes], "<DataRecord>\n");
+	ret = fs_write(data, bytes);
+	if (ret < 0) {
+		ts_err("rawdata record lable failed");
+		goto save_end;
+	}
+	bytes = 0;
+
+	ret = goodix_save_rawdata(ts_test);
+	if (ret < 0)
+		goto save_end;
+
+	if (ts_test->test_result[GTP_NOISE_TEST]) {
+		ret = goodix_save_noise_data(ts_test);
+		if (ret < 0)
+			goto save_end;
+	}
+
+	if (ts_test->test_result[GTP_SELFCAP_TEST]) {
+		ret = goodix_save_self_data(ts_test,
+				ts_test->self_rawdata.data,
+				"selfDataRecord",
+				ts_test->self_rawdata.size);
+		if (ret < 0)
+			goto save_end;
+	}
+
+	if (ts_test->test_result[GTP_SELFNOISE_TEST]) {
+		ret = goodix_save_self_data(ts_test,
+				ts_test->self_noisedata.data,
+				"selfDiffDataRecord",
+				ts_test->self_noisedata.size);
+		if (ret < 0)
+			goto save_end;
+	}
+
+	bytes += sprintf(&data[bytes], "</DataRecord>\n");
+	ret = fs_write(data, bytes);
+	if (ret < 0)
+		ts_err("rawdata data record lable fail.");
+
+save_end:
+	vfree(data);
+	return ret;
+}
+
+/* save end tag in csv file */
+static int goodix_save_tail(struct goodix_ts_test *ts_test)
+{
+	int ret = 0;
+	int bytes = 0;
+	char *data = NULL;
+
+	data = vzalloc(MAX_DATA_BUFFER);
+	if (!data)
+		return -ENOMEM;
+
+	bytes += sprintf(&data[bytes], "</TESTLOG>\n");
+	ret = fs_write(data, bytes);
+	if (ret < 0)
+		ts_err("tail write failed");
+
+	vfree(data);
+	return ret;
+}
+
+static void goodix_save_result_data(struct goodix_ts_test *ts_test)
+{
+	int ret;
+
+	/* save header */
+	ret = goodix_save_header(ts_test);
+	if (ret < 0)
+		return;
+
+	/* save limits */
+	ret = goodix_save_limits(ts_test);
+	if (ret < 0)
+		return;
+
+	/* save data */
+	ret = goodix_save_data(ts_test);
+	if (ret < 0)
+		return;
+
+	/* save tail */
+	ret = goodix_save_tail(ts_test);
+	if (ret < 0)
+		return;
+}
+
+static void goodix_put_test_result(struct goodix_ts_test *ts_test,
+		struct ts_rawdata_info *info)
+{
+	int i;
+	bool have_bus_error = false;
+	bool have_panel_error = false;
+	char statistics_data[STATISTICS_DATA_LEN] = {0};
+	struct goodix_ts_core *ts = ts_test->ts;
+
+	ts_info("put test result IN");
+
+	info->buff[0] = ts_test->test_params.sen_num;
+	info->buff[1] = ts_test->test_params.drv_num;
+	info->used_size = 2;
+	/* save rawdata to info->buff, only one frame */
+	if (ts_test->rawdata[0].size) {
+		for (i = 0; i < ts_test->rawdata[0].size; i++)
+			info->buff[info->used_size + i] =
+					ts_test->rawdata[0].data[i];
+		info->used_size += ts_test->rawdata[0].size;
+	}
+
+	/* save noisedata to info->buff */
+	if (ts_test->noisedata[0].size) {
+		for (i = 0; i < ts_test->noisedata[0].size; i++)
+			info->buff[info->used_size + i] =
+					ts_test->noisedata[0].data[i];
+		info->used_size += ts_test->noisedata[0].size;
+	}
+
+	/* save self_noisedata to info->buff */
+	if (ts_test->self_noisedata.size) {
+		for (i = 0; i < ts_test->self_noisedata.size; i++)
+			info->buff[info->used_size + i] =
+					ts_test->self_noisedata.data[i];
+		info->used_size += ts_test->self_noisedata.size;
+	}
+
+	/* save self_rawdata to info->buff */
+	if (ts_test->self_rawdata.size) {
+		for (i = 0; i < ts_test->self_rawdata.size; i++)
+			info->buff[info->used_size + i] =
+					ts_test->self_rawdata.data[i];
+		info->used_size += ts_test->self_rawdata.size;
+	}
+
+	/* check if there have bus error */
+	for (i = 0; i < MAX_TEST_ITEMS; i++) {
+		if (ts_test->test_result[i] == SYS_SOFTWARE_REASON)
+			have_bus_error = true;
+		else if (ts_test->test_result[i] == GTP_PANEL_REASON)
+			have_panel_error = true;
+	}
+	ts_info("Have bus error:%d", have_bus_error);
+	if (have_bus_error || have_panel_error)
+		goodix_strncat(ts_test->test_info, "[FAIL]-",
+				TS_RAWDATA_RESULT_MAX);
+	else
+		goodix_strncat(ts_test->test_info, "[PASS]-",
+				TS_RAWDATA_RESULT_MAX);
+
+	if (have_bus_error)
+		goodix_strncat(ts_test->test_info, "0F-",
+				TS_RAWDATA_RESULT_MAX);
+	else
+		goodix_strncat(ts_test->test_info, "0P-",
+				TS_RAWDATA_RESULT_MAX);
+
+	for (i = 0; i < MAX_TEST_ITEMS; i++) {
+		/* if have tested, show result */
+		if (ts_test->test_result[i]) {
+			if (ts_test->test_result[i] == GTP_TEST_PASS)
+				goodix_strncatint(ts_test->test_info, i, "%dP-",
+					TS_RAWDATA_RESULT_MAX);
+			else
+				goodix_strncatint(ts_test->test_info, i, "%dF-",
+					TS_RAWDATA_RESULT_MAX);
+		}
+	}
+
+	/* calculate rawdata min avg max value*/
+	if (ts_test->rawdata[0].size) {
+		goodix_data_statistics(
+				ts_test->rawdata[0].data,
+				ts_test->rawdata[0].size,
+				statistics_data,
+				STATISTICS_DATA_LEN);
+		goodix_strncat(ts_test->test_info, statistics_data,
+			TS_RAWDATA_RESULT_MAX);
+	} else {
+		ts_err("NO valiable rawdata");
+		goodix_strncat(ts_test->test_info, "[0,0,0]",
+			TS_RAWDATA_RESULT_MAX);
+	}
+
+	/* calculate noisedata min avg max value*/
+	if (ts_test->test_params.test_items[GTP_NOISE_TEST]) {
+		if (ts_test->noisedata[0].size) {
+			goodix_data_statistics(
+					ts_test->noisedata[0].data,
+					ts_test->noisedata[0].size,
+					statistics_data,
+					STATISTICS_DATA_LEN);
+			goodix_strncat(ts_test->test_info, statistics_data,
+				TS_RAWDATA_RESULT_MAX);
+		} else {
+			ts_err("NO valiable noisedata");
+			goodix_strncat(ts_test->test_info, "[0,0,0]",
+				TS_RAWDATA_RESULT_MAX);
+		}
+	}
+
+	/* calculate self_rawdata min avg max value*/
+	if (ts_test->test_params.test_items[GTP_SELFCAP_TEST]) {
+		if (ts_test->self_rawdata.size) {
+			goodix_data_statistics(
+					ts_test->self_rawdata.data,
+					ts_test->self_rawdata.size,
+					statistics_data,
+					STATISTICS_DATA_LEN);
+			goodix_strncat(ts_test->test_info, statistics_data,
+				TS_RAWDATA_RESULT_MAX);
+		} else {
+			ts_err("NO valiable self_rawdata");
+			goodix_strncat(ts_test->test_info, "[0,0,0]",
+				TS_RAWDATA_RESULT_MAX);
+		}
+	}
+
+	/* calculate self_noisedata min avg max value*/
+	if (ts_test->test_params.test_items[GTP_SELFNOISE_TEST]) {
+		if (ts_test->self_noisedata.size) {
+			goodix_data_statistics(
+					ts_test->self_noisedata.data,
+					ts_test->self_noisedata.size,
+					statistics_data,
+					STATISTICS_DATA_LEN);
+			goodix_strncat(ts_test->test_info, statistics_data,
+				TS_RAWDATA_RESULT_MAX);
+		} else {
+			ts_err("NO valiable self_noisedata");
+			goodix_strncat(ts_test->test_info, "[0,0,0]",
+				TS_RAWDATA_RESULT_MAX);
+		}
+	}
+
+	goodix_strncat(ts_test->test_info, "-GT",
+		TS_RAWDATA_RESULT_MAX);
+	goodix_strncat(ts_test->test_info, ts->fw_version.patch_pid,
+		TS_RAWDATA_RESULT_MAX);
+	strncpy(info->result, ts_test->test_info, TS_RAWDATA_RESULT_MAX - 1);
+
+	/* save result */
+	goodix_save_result_data(ts_test);
+}
+
+static int goodix_do_inspect(struct goodix_ts_core *cd,
+		struct ts_rawdata_info *info)
+{
+	int ret;
+	struct goodix_ts_test *ts_test = NULL;
+
+	if (!cd || !info) {
+		ts_err("core_data or info is NULL");
+		return -ENODEV;
+	}
+
+	ts_test = vzalloc(sizeof(*ts_test));
+	if (!ts_test)
+		return -ENOMEM;
+
+	ts_test->ts = cd;
+	ret = goodix_tptest_prepare(ts_test);
+	if (ret < 0) {
+		ts_err("Failed to prepare TP test, exit");
+		strncpy(info->result, "[FAIL]-0F-software reason\n",
+				TS_RAWDATA_RESULT_MAX - 1);
+		goto exit_finish;
+	}
+	ts_info("TP test prepare OK");
+
+	goodix_capacitance_test(ts_test); /* 1F 3F 6F 7F test */
+	if (ts_test->test_params.test_items[GTP_SHORT_TEST])
+		goodix_shortcircut_test(ts_test); /* 5F test */
+	goodix_put_test_result(ts_test, info);
+	goodix_tptest_finish(ts_test);
+
+exit_finish:
+	vfree(ts_test);
+	return ret;
+}
+
+static int auto_test_result_show(struct seq_file *m, void *v)
+{
+	struct goodix_ts_core *cd = m->private;
+	struct ts_rawdata_info *info = NULL;
+
+	ts_info("--- IN");
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return -ENOMEM;
+
+	g_seq = m;
+	goodix_do_inspect(cd, info);
+	ts_info("test_result:%s", info->result);
+	kfree(info);
+	ts_info("--- OUT");
+
+	return 0;
+}
+
+static int auto_test_open(struct inode *inode, struct file *file)
+{
+	return single_open_size(file, auto_test_result_show,
+			PDE_DATA(inode), DEFAULT_SEQ_FILE_SIZE);
+}
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0))
+static const struct proc_ops auto_test_ops = {
+	.proc_open = auto_test_open,
+	.proc_read = seq_read,
+	.proc_lseek = seq_lseek,
+	.proc_release = single_release,
+};
+#else
+static const struct file_operations auto_test_ops = {
+	.open = auto_test_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+#endif
+
+int inspect_module_init(struct goodix_ts_core *core_data)
+{
+	struct proc_dir_entry *proc_entry;
+
+	if (module_initialized) {
+		ts_info("inspect module has already init");
+		return 0;
+	}
+
+	proc_entry = proc_create_data("goodix_ts/auto_test",
+			0660, NULL, &auto_test_ops, core_data);
+	if (!proc_entry) {
+        ts_err("failed to create proc entry");
+        return -ENOMEM;
+    }
+
+	module_initialized = true;
+	ts_info("inspect module init success");
+	return 0;
+}
+
+void inspect_module_exit(void)
+{
+	ts_info("inspect module exit");
+	if (!module_initialized)
+		return;
+
+	remove_proc_entry("goodix_ts/auto_test", NULL);
+	module_initialized = false;
+}
diff --git a/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_tools.c b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_tools.c
new file mode 100644
index 00000000000000..9a69feeb6d5d2a
--- /dev/null
+++ b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_tools.c
@@ -0,0 +1,503 @@
+ /*
+  * Goodix Touchscreen Driver
+  * Copyright (C) 2020 - 2021 Goodix, Inc.
+  *
+  * This program is free software; you can redistribute it and/or modify
+  * it under the terms of the GNU General Public License as published by
+  * the Free Software Foundation; either version 2 of the License, or
+  * (at your option) any later version.
+  *
+  * This program is distributed in the hope that it will be a reference
+  * to you, when you are integrating the GOODiX's CTP IC into your system,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * General Public License for more details.
+  *
+  */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/atomic.h>
+#include <linux/miscdevice.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/ioctl.h>
+#include <linux/wait.h>
+#include "goodix_ts_core.h"
+
+#define GOODIX_TOOLS_NAME		"gtp_tools"
+#define GOODIX_TOOLS_VER_MAJOR		1
+#define GOODIX_TOOLS_VER_MINOR		0
+static const u16 goodix_tools_ver = ((GOODIX_TOOLS_VER_MAJOR << 8) +
+			(GOODIX_TOOLS_VER_MINOR));
+
+#define GOODIX_TS_IOC_MAGIC		'G'
+#define NEGLECT_SIZE_MASK		(~(_IOC_SIZEMASK << _IOC_SIZESHIFT))
+
+#define GTP_IRQ_ENABLE	_IO(GOODIX_TS_IOC_MAGIC, 0)
+#define GTP_DEV_RESET	_IO(GOODIX_TS_IOC_MAGIC, 1)
+#define GTP_SEND_COMMAND (_IOW(GOODIX_TS_IOC_MAGIC, 2, u8) & NEGLECT_SIZE_MASK)
+#define GTP_SEND_CONFIG	(_IOW(GOODIX_TS_IOC_MAGIC, 3, u8) & NEGLECT_SIZE_MASK)
+#define GTP_ASYNC_READ	(_IOR(GOODIX_TS_IOC_MAGIC, 4, u8) & NEGLECT_SIZE_MASK)
+#define GTP_SYNC_READ	(_IOR(GOODIX_TS_IOC_MAGIC, 5, u8) & NEGLECT_SIZE_MASK)
+#define GTP_ASYNC_WRITE	(_IOW(GOODIX_TS_IOC_MAGIC, 6, u8) & NEGLECT_SIZE_MASK)
+#define GTP_READ_CONFIG	(_IOW(GOODIX_TS_IOC_MAGIC, 7, u8) & NEGLECT_SIZE_MASK)
+#define GTP_ESD_ENABLE	_IO(GOODIX_TS_IOC_MAGIC, 8)
+#define GTP_TOOLS_VER   (_IOR(GOODIX_TS_IOC_MAGIC, 9, u8) & NEGLECT_SIZE_MASK)
+#define GTP_TOOLS_CTRL_SYNC (_IOW(GOODIX_TS_IOC_MAGIC, 10, u8) & NEGLECT_SIZE_MASK)
+
+#define MAX_BUF_LENGTH		(16*1024)
+#define IRQ_FALG		(0x01 << 2)
+
+#define I2C_MSG_HEAD_LEN	20
+
+/*
+ * struct goodix_tools_dev - goodix tools device struct
+ * @ts_core: The core data struct of ts driver
+ * @ops_mode: represent device work mode
+ * @rawdiffcmd: Set slave device into rawdata mode
+ * @normalcmd: Set slave device into normal mode
+ * @wq: Wait queue struct use in synchronous data read
+ * @mutex: Protect goodix_tools_dev
+ * @in_use: device in use
+ */
+struct goodix_tools_dev {
+	struct goodix_ts_core *ts_core;
+	struct list_head head;
+	unsigned int ops_mode;
+	struct goodix_ts_cmd rawdiffcmd, normalcmd;
+	wait_queue_head_t wq;
+	struct mutex mutex;
+	atomic_t in_use;
+	struct goodix_ext_module module;
+} *goodix_tools_dev;
+
+
+/* read data asynchronous,
+ * success return data length, otherwise return < 0
+ */
+static int async_read(struct goodix_tools_dev *dev, void __user *arg)
+{
+	u8 *databuf = NULL;
+	int ret = 0;
+	u32 reg_addr, length;
+	u8 i2c_msg_head[I2C_MSG_HEAD_LEN];
+	const struct goodix_ts_hw_ops *hw_ops = dev->ts_core->hw_ops;
+
+	ret = copy_from_user(&i2c_msg_head, arg, I2C_MSG_HEAD_LEN);
+	if (ret)
+		return -EFAULT;
+
+	reg_addr = i2c_msg_head[0] + (i2c_msg_head[1] << 8)
+			+ (i2c_msg_head[2] << 16) + (i2c_msg_head[3] << 24);
+	length = i2c_msg_head[4] + (i2c_msg_head[5] << 8)
+			+ (i2c_msg_head[6] << 16) + (i2c_msg_head[7] << 24);
+	if (length > MAX_BUF_LENGTH) {
+		ts_err("buffer too long:%d > %d", length, MAX_BUF_LENGTH);
+		return -EINVAL;
+	}
+	databuf = kzalloc(length, GFP_KERNEL);
+	if (!databuf) {
+		ts_err("Alloc memory failed");
+		return -ENOMEM;
+	}
+
+	if (hw_ops->read(dev->ts_core, reg_addr, databuf, length)) {
+		ret = -EBUSY;
+		ts_err("Read i2c failed");
+		goto err_out;
+	}
+	ret = copy_to_user((u8 *)arg + I2C_MSG_HEAD_LEN, databuf, length);
+	if (ret) {
+		ret = -EFAULT;
+		ts_err("Copy_to_user failed");
+		goto err_out;
+	}
+	ret = length;
+err_out:
+	kfree(databuf);
+	return ret;
+}
+
+/* if success return config data length */
+static int read_config_data(struct goodix_ts_core *ts_core, void __user *arg)
+{
+	int ret = 0;
+	u32 reg_addr, length;
+	u8 i2c_msg_head[I2C_MSG_HEAD_LEN];
+	u8 *tmp_buf;
+
+	ret = copy_from_user(&i2c_msg_head, arg, I2C_MSG_HEAD_LEN);
+	if (ret) {
+		ts_err("Copy data from user failed");
+		return -EFAULT;
+	}
+	reg_addr = i2c_msg_head[0] + (i2c_msg_head[1] << 8)
+		   + (i2c_msg_head[2] << 16) + (i2c_msg_head[3] << 24);
+	length = i2c_msg_head[4] + (i2c_msg_head[5] << 8)
+		 + (i2c_msg_head[6] << 16) + (i2c_msg_head[7] << 24);
+	ts_info("read config,reg_addr=0x%x, length=%d", reg_addr, length);
+	if (length > MAX_BUF_LENGTH) {
+		ts_err("buffer too long:%d > %d", length, MAX_BUF_LENGTH);
+		return -EINVAL;
+	}
+	tmp_buf = kzalloc(length, GFP_KERNEL);
+	if (!tmp_buf) {
+		ts_err("failed alloc memory");
+		return -ENOMEM;
+	}
+	/* if reg_addr == 0, read config data with specific flow */
+	if (!reg_addr) {
+		if (ts_core->hw_ops->read_config)
+			ret = ts_core->hw_ops->read_config(ts_core, tmp_buf, length);
+		else
+			ret = -EINVAL;
+	} else {
+		ret = ts_core->hw_ops->read(ts_core, reg_addr, tmp_buf, length);
+		if (!ret)
+			ret = length;
+	}
+	if (ret <= 0)
+		goto err_out;
+
+	if (copy_to_user((u8 *)arg + I2C_MSG_HEAD_LEN, tmp_buf, ret)) {
+		ret = -EFAULT;
+		ts_err("Copy_to_user failed");
+	}
+
+err_out:
+	kfree(tmp_buf);
+	return ret;
+}
+
+/* write data to i2c asynchronous,
+ * success return bytes write, else return <= 0
+ */
+static int async_write(struct goodix_tools_dev *dev, void __user *arg)
+{
+	u8 *databuf;
+	int ret = 0;
+	u32 reg_addr, length;
+	u8 i2c_msg_head[I2C_MSG_HEAD_LEN];
+	struct goodix_ts_core *ts_core = dev->ts_core;
+	const struct goodix_ts_hw_ops *hw_ops = ts_core->hw_ops;
+
+	ret = copy_from_user(&i2c_msg_head, arg, I2C_MSG_HEAD_LEN);
+	if (ret) {
+		ts_err("Copy data from user failed");
+		return -EFAULT;
+	}
+	reg_addr = i2c_msg_head[0] + (i2c_msg_head[1] << 8)
+			+ (i2c_msg_head[2] << 16) + (i2c_msg_head[3] << 24);
+	length = i2c_msg_head[4] + (i2c_msg_head[5] << 8)
+			+ (i2c_msg_head[6] << 16) + (i2c_msg_head[7] << 24);
+	if (length > MAX_BUF_LENGTH) {
+		ts_err("buffer too long:%d > %d", length, MAX_BUF_LENGTH);
+		return -EINVAL;
+	}
+
+	databuf = kzalloc(length, GFP_KERNEL);
+	if (!databuf) {
+		ts_err("Alloc memory failed");
+		return -ENOMEM;
+	}
+	ret = copy_from_user(databuf, (u8 *)arg + I2C_MSG_HEAD_LEN, length);
+	if (ret) {
+		ret = -EFAULT;
+		ts_err("Copy data from user failed");
+		goto err_out;
+	}
+
+	if (hw_ops->write(ts_core, reg_addr, databuf, length)) {
+		ret = -EBUSY;
+		ts_err("Write data to device failed");
+	} else {
+		ret = length;
+	}
+
+err_out:
+	kfree(databuf);
+	return ret;
+}
+
+static int init_cfg_data(struct goodix_ic_config *cfg, void __user *arg)
+{
+	int ret = 0;
+	u32 length;
+	u8 i2c_msg_head[I2C_MSG_HEAD_LEN] = {0};
+
+	ret = copy_from_user(&i2c_msg_head, arg, I2C_MSG_HEAD_LEN);
+	if (ret) {
+		ts_err("Copy data from user failed");
+		return -EFAULT;
+	}
+
+	length = i2c_msg_head[4] + (i2c_msg_head[5] << 8)
+			+ (i2c_msg_head[6] << 16) + (i2c_msg_head[7] << 24);
+	if (length > GOODIX_CFG_MAX_SIZE) {
+		ts_err("buffer too long:%d > %d", length, MAX_BUF_LENGTH);
+		return -EINVAL;
+	}
+	ret = copy_from_user(cfg->data, (u8 *)arg + I2C_MSG_HEAD_LEN, length);
+	if (ret) {
+		ts_err("Copy data from user failed");
+		return -EFAULT;
+	}
+	cfg->len = length;
+	return 0;
+}
+
+/**
+ * goodix_tools_ioctl - ioctl implementation
+ *
+ * @filp: Pointer to file opened
+ * @cmd: Ioctl opertion command
+ * @arg: Command data
+ * Returns >=0 - succeed, else failed
+ */
+static long goodix_tools_ioctl(struct file *filp, unsigned int cmd,
+					unsigned long arg)
+{
+	int ret = 0;
+	struct goodix_tools_dev *dev = filp->private_data;
+	struct goodix_ts_core *ts_core;
+	const struct goodix_ts_hw_ops *hw_ops;
+	struct goodix_ic_config *temp_cfg = NULL;
+
+	if (dev->ts_core == NULL) {
+		ts_err("Tools module not register");
+		return -EINVAL;
+	}
+	ts_core = dev->ts_core;
+	hw_ops = ts_core->hw_ops;
+
+	if (_IOC_TYPE(cmd) != GOODIX_TS_IOC_MAGIC) {
+		ts_err("Bad magic num:%c", _IOC_TYPE(cmd));
+		return -ENOTTY;
+	}
+
+	switch (cmd & NEGLECT_SIZE_MASK) {
+	case GTP_IRQ_ENABLE:
+		if (arg == 1) {
+			hw_ops->irq_enable(ts_core, true);
+			mutex_lock(&dev->mutex);
+			dev->ops_mode |= IRQ_FALG;
+			mutex_unlock(&dev->mutex);
+			ts_info("IRQ enabled");
+		} else if (arg == 0) {
+			hw_ops->irq_enable(ts_core, false);
+			mutex_lock(&dev->mutex);
+			dev->ops_mode &= ~IRQ_FALG;
+			mutex_unlock(&dev->mutex);
+			ts_info("IRQ disabled");
+		} else {
+			ts_info("Irq aready set with, arg = %ld", arg);
+		}
+		ret = 0;
+		break;
+	case GTP_ESD_ENABLE:
+		if (arg == 0)
+			goodix_ts_blocking_notify(NOTIFY_ESD_OFF, NULL);
+		else
+			goodix_ts_blocking_notify(NOTIFY_ESD_ON, NULL);
+		break;
+	case GTP_DEV_RESET:
+		hw_ops->reset(ts_core, GOODIX_NORMAL_RESET_DELAY_MS);
+		break;
+	case GTP_SEND_COMMAND:
+		/* deprecated command */
+		ts_err("the GTP_SEND_COMMAND function has been removed");
+		ret = -EINVAL;
+		break;
+	case GTP_SEND_CONFIG:
+		temp_cfg = kzalloc(sizeof(struct goodix_ic_config), GFP_KERNEL);
+		if (temp_cfg == NULL) {
+			ts_err("Memory allco err");
+			ret = -ENOMEM;
+			goto err_out;
+		}
+
+		ret = init_cfg_data(temp_cfg, (void __user *)arg);
+		if (!ret && hw_ops->send_config) {
+			ret = hw_ops->send_config(ts_core, temp_cfg->data, temp_cfg->len);
+			if (ret) {
+				ts_err("Failed send config");
+				ret = -EAGAIN;
+			} else {
+				ts_info("Send config success");
+				ret = 0;
+			}
+		}
+		kfree(temp_cfg);
+		temp_cfg = NULL;
+		break;
+	case GTP_READ_CONFIG:
+		ret = read_config_data(ts_core, (void __user *)arg);
+		if (ret > 0)
+			ts_info("success read config:len=%d", ret);
+		else
+			ts_err("failed read config:ret=0x%x", ret);
+		break;
+	case GTP_ASYNC_READ:
+		ret = async_read(dev, (void __user *)arg);
+		if (ret < 0)
+			ts_err("Async data read failed");
+		break;
+	case GTP_SYNC_READ:
+		ts_info("unsupport sync read");
+		break;
+	case GTP_ASYNC_WRITE:
+		ret = async_write(dev, (void __user *)arg);
+		if (ret < 0)
+			ts_err("Async data write failed");
+		break;
+	case GTP_TOOLS_VER:
+		ret = copy_to_user((u8 *)arg, &goodix_tools_ver,
+					sizeof(u16));
+		if (ret)
+			ts_err("failed copy driver version info to user");
+		break;
+	case GTP_TOOLS_CTRL_SYNC:
+		ts_core->tools_ctrl_sync = !!arg;
+		ts_info("set tools ctrl sync %d", ts_core->tools_ctrl_sync);
+		break;
+	default:
+		ts_info("Invalid cmd");
+		ret = -ENOTTY;
+		break;
+	}
+
+err_out:
+	return ret;
+}
+
+#ifdef CONFIG_COMPAT
+static long goodix_tools_compat_ioctl(struct file *file, unsigned int cmd,
+				unsigned long arg)
+{
+	void __user *arg32 = compat_ptr(arg);
+
+	if (!file->f_op || !file->f_op->unlocked_ioctl)
+		return -ENOTTY;
+	return file->f_op->unlocked_ioctl(file, cmd, (unsigned long)arg32);
+}
+#endif
+
+static int goodix_tools_open(struct inode *inode, struct file *filp)
+{
+	int ret = 0;
+
+	ts_info("try open tool");
+	/* Only the first time open device need to register module */
+	ret = goodix_register_ext_module_no_wait(&goodix_tools_dev->module);
+	if (ret) {
+		ts_info("failed register to core module");
+		return -EFAULT;
+	}
+	ts_info("success open tools");
+	goodix_ts_blocking_notify(NOTIFY_ESD_OFF, NULL);
+	filp->private_data = goodix_tools_dev;
+	atomic_set(&goodix_tools_dev->in_use, 1);
+	return 0;
+}
+
+static int goodix_tools_release(struct inode *inode, struct file *filp)
+{
+	int ret = 0;
+	/* when the last close this dev node unregister the module */
+	goodix_tools_dev->ts_core->tools_ctrl_sync = false;
+	atomic_set(&goodix_tools_dev->in_use, 0);
+	goodix_ts_blocking_notify(NOTIFY_ESD_ON, NULL);
+	ret = goodix_unregister_ext_module(&goodix_tools_dev->module);
+	return ret;
+}
+
+static int goodix_tools_module_init(struct goodix_ts_core *core_data,
+			struct goodix_ext_module *module)
+{
+	struct goodix_tools_dev *tools_dev = module->priv_data;
+
+	if (core_data)
+		tools_dev->ts_core = core_data;
+	else
+		return -ENODEV;
+
+	return 0;
+}
+
+static int goodix_tools_module_exit(struct goodix_ts_core *core_data,
+		struct goodix_ext_module *module)
+{
+	struct goodix_tools_dev *tools_dev = module->priv_data;
+	ts_debug("tools module unregister");
+	if (atomic_read(&tools_dev->in_use)) {
+		ts_err("tools module busy, please close it then retry");
+		return -EBUSY;
+	}
+	return 0;
+}
+
+static const struct file_operations goodix_tools_fops = {
+	.owner		= THIS_MODULE,
+	.open		= goodix_tools_open,
+	.release	= goodix_tools_release,
+	.unlocked_ioctl	= goodix_tools_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = goodix_tools_compat_ioctl,
+#endif
+};
+
+static struct miscdevice goodix_tools_miscdev = {
+	.minor	= MISC_DYNAMIC_MINOR,
+	.name	= GOODIX_TOOLS_NAME,
+	.fops	= &goodix_tools_fops,
+};
+
+static struct goodix_ext_module_funcs goodix_tools_module_funcs = {
+	.init = goodix_tools_module_init,
+	.exit = goodix_tools_module_exit,
+};
+
+/**
+ * goodix_tools_init - init goodix tools device and register a miscdevice
+ *
+ * return: 0 success, else failed
+ */
+int goodix_tools_init(void)
+{
+	int ret;
+
+	goodix_tools_dev = kzalloc(sizeof(struct goodix_tools_dev), GFP_KERNEL);
+	if (goodix_tools_dev == NULL) {
+		ts_err("Memory allco err");
+		return -ENOMEM;
+	}
+
+	INIT_LIST_HEAD(&goodix_tools_dev->head);
+	goodix_tools_dev->ops_mode = 0;
+	goodix_tools_dev->ops_mode |= IRQ_FALG;
+	init_waitqueue_head(&goodix_tools_dev->wq);
+	mutex_init(&goodix_tools_dev->mutex);
+	atomic_set(&goodix_tools_dev->in_use, 0);
+
+	goodix_tools_dev->module.funcs = &goodix_tools_module_funcs;
+	goodix_tools_dev->module.name = GOODIX_TOOLS_NAME;
+	goodix_tools_dev->module.priv_data = goodix_tools_dev;
+	goodix_tools_dev->module.priority = EXTMOD_PRIO_DBGTOOL;
+
+	ret = misc_register(&goodix_tools_miscdev);
+	if (ret)
+		ts_err("Debug tools miscdev register failed");
+	else
+		ts_info("Debug tools miscdev register success");
+
+	return ret;
+}
+
+void goodix_tools_exit(void)
+{
+	misc_deregister(&goodix_tools_miscdev);
+	kfree(goodix_tools_dev);
+	ts_info("Debug tools miscdev exit");
+}
diff --git a/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_utils.c b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_utils.c
new file mode 100644
index 00000000000000..d083c5760aabf2
--- /dev/null
+++ b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_utils.c
@@ -0,0 +1,278 @@
+ /*
+  * Goodix Touchscreen Driver
+  * Copyright (C) 2020 - 2021 Goodix, Inc.
+  *
+  * This program is free software; you can redistribute it and/or modify
+  * it under the terms of the GNU General Public License as published by
+  * the Free Software Foundation; either version 2 of the License, or
+  * (at your option) any later version.
+  *
+  * This program is distributed in the hope that it will be a reference
+  * to you, when you are integrating the GOODiX's CTP IC into your system,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * General Public License for more details.
+  *
+  */
+#include "goodix_ts_core.h"
+
+bool debug_log_flag = false;
+
+/*****************************************************************************
+* goodix_append_checksum
+* @summary
+*    Calcualte data checksum with the specified mode.
+*
+* @param data
+*   data need to be calculate
+* @param len
+*   data length
+* @param mode
+*   calculate for u8 or u16 checksum
+* @return
+*   return the data checksum value.
+*
+*****************************************************************************/
+u32 goodix_append_checksum(u8 *data, int len, int mode)
+{
+	u32 checksum = 0;
+	int i;
+
+	checksum = 0;
+	if (mode == CHECKSUM_MODE_U8_LE) {
+		for (i = 0; i < len; i++)
+			checksum += data[i];
+	} else {
+		for (i = 0; i < len; i+=2)
+			checksum += (data[i] + (data[i+1] << 8));
+	}
+
+	if (mode == CHECKSUM_MODE_U8_LE) {
+		data[len] = checksum & 0xff;
+		data[len + 1] = (checksum >> 8) & 0xff;
+		return 0xFFFF & checksum;
+	}
+	data[len] = checksum & 0xff;
+	data[len + 1] = (checksum >> 8) & 0xff;
+	data[len + 2] = (checksum >> 16) & 0xff;
+	data[len + 3] = (checksum >> 24) & 0xff;
+	return checksum;
+}
+
+/* checksum_cmp: check data valid or not
+ * @data: data need to be check
+ * @size: data length need to be check(include the checksum bytes)
+ * @mode: compare with U8 or U16 mode
+ * */
+int checksum_cmp(const u8 *data, int size, int mode)
+{
+	u32 cal_checksum = 0;
+	u32 r_checksum = 0;
+	u32 i;
+
+	if (mode == CHECKSUM_MODE_U8_LE) {
+		if (size < 2)
+			return 1;
+		for (i = 0; i < size - 2; i++)
+			cal_checksum += data[i];
+		r_checksum = data[size - 2] + (data[size - 1] << 8);
+		return (cal_checksum & 0xFFFF) == r_checksum ? 0 : 1;
+	}
+
+	if (size < 4)
+		return 1;
+	for (i = 0; i < size - 4; i += 2)
+		cal_checksum += data[i] + (data[i + 1] << 8);
+	r_checksum = data[size - 4] + (data[size - 3] << 8) +
+		(data[size - 2] << 16) + (data[size - 1] << 24);
+	return cal_checksum == r_checksum ? 0 : 1;
+}
+
+/* return 1 if all data is zero or ff
+ * else return 0
+ */
+int is_risk_data(const u8 *data, int size)
+{
+	int i;
+	int zero_count =  0;
+	int ff_count = 0;
+
+	for (i = 0; i < size; i++) {
+		if (data[i] == 0)
+			zero_count++;
+		else if (data[i] == 0xFF)
+			ff_count++;
+	}
+	if (zero_count == size || ff_count == size) {
+		ts_info("warning data is all %s\n",
+			zero_count == size ? "0x00" : "0xFF");
+		return 1;
+	}
+
+	return 0;
+}
+
+/* get config id form config file */
+#define CONFIG_ID_OFFSET 		30
+u32 goodix_get_file_config_id(u8 *ic_config)
+{
+	if (!ic_config)
+		return 0;
+	return le32_to_cpup((__le32 *)&ic_config[CONFIG_ID_OFFSET]);
+}
+
+void print_ic_info(struct goodix_ic_info *ic_info)
+{
+	struct goodix_ic_info_version *version = &ic_info->version;
+	struct goodix_ic_info_feature *feature = &ic_info->feature;
+	struct goodix_ic_info_param *parm = &ic_info->parm;
+	struct goodix_ic_info_misc *misc = &ic_info->misc;
+	struct goodix_ic_info_other *other = &ic_info->other;
+
+	ts_info("ic_info_length:                %d",
+		ic_info->length);
+	ts_info("info_customer_id:              0x%01X",
+		version->info_customer_id);
+	ts_info("info_version_id:               0x%01X",
+		version->info_version_id);
+	ts_info("ic_die_id:                     0x%01X",
+		version->ic_die_id);
+	ts_info("ic_version_id:                 0x%01X",
+		version->ic_version_id);
+	ts_info("config_id:                     0x%4X",
+		version->config_id);
+	ts_info("config_version:                0x%01X",
+		version->config_version);
+	ts_info("frame_data_customer_id:        0x%01X",
+		version->frame_data_customer_id);
+	ts_info("frame_data_version_id:         0x%01X",
+		version->frame_data_version_id);
+	ts_info("touch_data_customer_id:        0x%01X",
+		version->touch_data_customer_id);
+	ts_info("touch_data_version_id:         0x%01X",
+		version->touch_data_version_id);
+
+	ts_info("freqhop_feature:               0x%04X",
+		feature->freqhop_feature);
+	ts_info("calibration_feature:           0x%04X",
+		feature->calibration_feature);
+	ts_info("gesture_feature:               0x%04X",
+		feature->gesture_feature);
+	ts_info("side_touch_feature:            0x%04X",
+		feature->side_touch_feature);
+	ts_info("stylus_feature:                0x%04X",
+		feature->stylus_feature);
+
+	ts_info("Drv*Sen,Button,Force num:      %d * %d, %d, %d",
+		parm->drv_num, parm->sen_num,
+		parm->button_num, parm->force_num);
+
+	ts_info("screen_max_x * screen_max_y:   %d * %d",
+		other->screen_max_x, other->screen_max_y);
+
+	ts_info("Cmd:                           0x%04X, %d",
+		misc->cmd_addr, misc->cmd_max_len);
+	ts_info("Cmd-Reply:                     0x%04X, %d",
+		misc->cmd_reply_addr, misc->cmd_reply_len);
+	ts_info("FW-State:                      0x%04X, %d",
+		misc->fw_state_addr, misc->fw_state_len);
+	ts_info("FW-Buffer:                     0x%04X, %d",
+		misc->fw_buffer_addr, misc->fw_buffer_max_len);
+	ts_info("Touch-Data:                    0x%04X, %d",
+		misc->touch_data_addr, misc->touch_data_head_len);
+	ts_info("point_struct_len:              %d",
+		misc->point_struct_len);
+	ts_info("mutual_rawdata_addr:           0x%04X",
+		misc->mutual_rawdata_addr);
+	ts_info("mutual_diffdata_addr:          0x%04X",
+		misc->mutual_diffdata_addr);
+	ts_info("self_rawdata_addr:             0x%04X",
+		misc->self_rawdata_addr);
+	ts_info("self_diffdata_addr:            0x%04X",
+		misc->self_diffdata_addr);
+	ts_info("stylus_rawdata_addr:           0x%04X, %d",
+		misc->stylus_rawdata_addr, misc->stylus_rawdata_len);
+	ts_info("esd_addr:                      0x%04X",
+		misc->esd_addr);
+}
+
+/* matrix transpose */
+void goodix_rotate_abcd2cbad(int tx, int rx, s16 *data)
+{
+	s16 *temp_buf = NULL;
+	int size = tx * rx;
+	int i;
+	int j;
+	int col;
+
+	temp_buf = kcalloc(size, sizeof(s16), GFP_KERNEL);
+	if (!temp_buf) {
+		ts_err("malloc failed");
+		return;
+	}
+
+	for (i = 0, j = 0, col = 0; i < size; i++) {
+		temp_buf[i] = data[j++ * rx + col];
+		if (j == tx) {
+			j = 0;
+			col++;
+		}
+	}
+
+	memcpy(data, temp_buf, size * sizeof(s16));
+	kfree(temp_buf);
+}
+
+/* get ic type */
+int goodix_get_ic_type(struct device_node *node,
+		struct goodix_bus_interface *bus_inf)
+{
+	const struct property *prop;
+	char ic_name[128] = {0};
+	int i;
+
+	prop = of_find_property(node, "compatible", NULL);
+	if (!prop || !prop->value || prop->length > sizeof(ic_name)) {
+		ts_err("invalid compatible property");
+		return -EINVAL;
+	}
+
+	memcpy(ic_name, prop->value, prop->length);
+
+	/* replace string end flag with ';' */
+	for (i = 0; i < prop->length - 1; i++)
+		if (ic_name[i] == 0)
+			ic_name[i] = ';';
+
+	ts_info("ic_name %s", ic_name);
+
+	if (strstr(ic_name, "brl-a")) {
+		ts_info("ic type is brl-a");
+		bus_inf->ic_type = IC_TYPE_BERLIN_A;
+		return 0;
+	}
+
+	if (strstr(ic_name, "brl-b")) {
+		ts_info("ic type is brl-b");
+		bus_inf->ic_type = IC_TYPE_BERLIN_B;
+		if (strstr(ic_name, "ga687x")) {
+			bus_inf->sub_ic_type = IC_TYPE_SUB_B2;
+			ts_info("sub ic type is brl-b2");
+		}
+		return 0;
+	}
+	if (strstr(ic_name, "brl-d")) {
+		ts_info("ic type is brl-d");
+		bus_inf->ic_type = IC_TYPE_BERLIN_D;
+		return 0;
+	}
+	if (strstr(ic_name, "nottingham")) {
+		ts_info("ic type is nottingham");
+		bus_inf->ic_type = IC_TYPE_NOTTINGHAM;
+		return 0;
+	}
+	
+	ts_err("unsupported ic type %s", ic_name);
+	return -EINVAL;
+}
+

From 0e8c213aee45cef4733f623ae6c8565de7a9fd8b Mon Sep 17 00:00:00 2001
From: Luca Weiss <luca.weiss@fairphone.com>
Date: Wed, 21 Jun 2023 10:50:59 +0200
Subject: [PATCH 690/707] Input - goodix_berlin_a_driver: import new version

---
 .../goodix_berlin_a_driver/Kconfig            |  12 +-
 .../goodix_berlin_a_driver/goodix_brl_hw.c    |  50 ++--
 .../goodix_berlin_a_driver/goodix_ts_core.c   | 276 +++++++++++++++++-
 .../goodix_berlin_a_driver/goodix_ts_core.h   |  34 +++
 4 files changed, 324 insertions(+), 48 deletions(-)

diff --git a/drivers/input/touchscreen/goodix_berlin_a_driver/Kconfig b/drivers/input/touchscreen/goodix_berlin_a_driver/Kconfig
index 396268b3af4231..36f4ac51f1e159 100644
--- a/drivers/input/touchscreen/goodix_berlin_a_driver/Kconfig
+++ b/drivers/input/touchscreen/goodix_berlin_a_driver/Kconfig
@@ -6,15 +6,17 @@ menuconfig TOUCHSCREEN_GOODIX_BRL
 	help
 	  Say Y here if you have a Goodix berlin series touch controller
 	  to your system.
-
+	  
 	  If build module, say M.
 	  If unsure, say N.
 
-if TOUCHSCREEN_GOODIX_BRL
+#if TOUCHSCREEN_GOODIX_BRL
 
 config TOUCHSCREEN_GOODIX_BRL_SPI
-	bool "support SPI bus connection"
+	depends on TOUCHSCREEN_GOODIX_BRL
+	bool "support SPI bus connection" 
+	default n 
 	help
-	  Say Y here if the touchscreen is connected via SPI bus.
+		Say Y here if the touchscreen is connected via SPI bus.
 
-endif
+#endif
diff --git a/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_brl_hw.c b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_brl_hw.c
index cbc8f82cc92319..178b4912ce325d 100644
--- a/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_brl_hw.c
+++ b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_brl_hw.c
@@ -87,7 +87,7 @@ static int brl_dev_confirm(struct goodix_ts_core *cd)
 	int retry = GOODIX_RETRY_3;
 	u8 tx_buf[8] = {0};
 	u8 rx_buf[8] = {0};
-	u8 i =0;
+//	u8 i =0;
 
 /*Add by T2M-mingwu.zhang for FP5-195 remarks: Double click on driver update.[Begin]*/	
 return ret;
@@ -99,8 +99,8 @@ return ret;
 
 	memset(tx_buf, DEV_CONFIRM_VAL, sizeof(tx_buf));
 
-	for(i=0;i<8;i++)
-		ts_err("zmw---tx_buf[%d]",tx_buf[i]);	
+/* 	for(i=0;i<8;i++)
+		ts_err("zmw---tx_buf[%d]",tx_buf[i]); */	
 
 	while (retry--) {
 		ret = hw_ops->write(cd, BOOTOPTION_ADDR,
@@ -235,26 +235,26 @@ static int brl_power_on(struct goodix_ts_core *cd, bool on)
 
 ts_err("zmw---brl_power_on---op");
 	if (on) {
-		if (avdd_gpio > 0) {
-			gpio_direction_output(avdd_gpio, 1);
-		} else if (cd->avdd) {
-			ret = regulator_enable(cd->avdd);
-ts_err("zmw---brl_power_on---222");			
+		if (iovdd_gpio > 0) {
+			gpio_direction_output(iovdd_gpio, 1);
+		} else if (cd->iovdd) {			
+			ret = regulator_enable(cd->iovdd);
+ts_err("zmw:name=[%s] line=[%d] iovdd \n",__func__,__LINE__);			
 			if (ret < 0) {
-				ts_err("Failed to enable avdd:%d", ret);
+				ts_err("Failed to enable iovdd:%d", ret);
 				goto power_off;
 			}
 		}
 
 		usleep_range(3000, 3100);
-		
-		if (iovdd_gpio > 0) {
-			gpio_direction_output(iovdd_gpio, 1);
-		} else if (cd->iovdd) {
-ts_err("zmw---brl_power_on---111");			
-			ret = regulator_enable(cd->iovdd);
+
+		if (avdd_gpio > 0) {
+			gpio_direction_output(avdd_gpio, 1);
+		} else if (cd->avdd) {
+			ret = regulator_enable(cd->avdd);	
+ts_err("zmw:name=[%s] line=[%d] avdd \n",__func__,__LINE__);					
 			if (ret < 0) {
-				ts_err("Failed to enable iovdd:%d", ret);
+				ts_err("Failed to enable avdd:%d", ret);
 				goto power_off;
 			}
 		}
@@ -263,12 +263,10 @@ ts_err("zmw---brl_power_on---111");
 		usleep_range(4000, 4100);
 		msleep(GOODIX_NORMAL_RESET_DELAY_MS);	
 			
-		ret = brl_dev_confirm(cd);
-ts_err("zmw---brl_power_on---333");		
+		ret = brl_dev_confirm(cd);		
 		if (ret < 0)
 			goto power_off;
-		ret = brl_reset_after(cd);
-ts_err("zmw---brl_power_on---444");		
+		ret = brl_reset_after(cd);		
 		if (ret < 0)
 			goto power_off;
 
@@ -397,9 +395,6 @@ static int brl_send_cmd(struct goodix_ts_core *cd,
 	struct goodix_ic_info_misc *misc = &cd->ic_info.misc;
 	struct goodix_ts_hw_ops *hw_ops = cd->hw_ops;
 
-ts_err("zmw---brl_send_cmd---cmd_addr=[%d] fw_buffer_addr=[%d] touch_data_addr=[%d]op",
-misc->cmd_addr,misc->fw_buffer_addr,misc->touch_data_addr);
-
 	mutex_lock(&cmd_mutex);
 
 	cmd->state = 0;
@@ -409,8 +404,7 @@ misc->cmd_addr,misc->fw_buffer_addr,misc->touch_data_addr);
 	ts_debug("cmd data %*ph", cmd->len, &(cmd->buf[2]));
 
 	retry = 0;
-	while (retry++ < GOODIX_CMD_RETRY) {
-ts_err("zmw---brl_send_cmd---111");		
+	while (retry++ < GOODIX_CMD_RETRY) {	
 		ret = hw_ops->write(cd, misc->cmd_addr,
 				    cmd->buf, sizeof(*cmd));
 		if (ret < 0) {
@@ -426,7 +420,7 @@ ts_err("zmw---brl_send_cmd---111");
 				ts_err("failed read command ack, %d", ret);
 				goto exit;
 			}
-			ts_err("cmd ack data %*ph",
+			ts_debug("cmd ack data %*ph",
 				 (int)sizeof(cmd_ack), cmd_ack.buf);
 			if (cmd_ack.ack == CMD_ACK_OK) {
 				msleep(40);		// wait for cmd response
@@ -438,7 +432,7 @@ ts_err("zmw---brl_send_cmd---111");
 				usleep_range(1000, 1100);
 				continue;
 			}
-ts_err("zmw---brl_send_cmd---222");	
+
 			if (cmd_ack.ack == CMD_ACK_BUFFER_OVERFLOW)
 				usleep_range(10000, 11000);
 			usleep_range(1000, 1100);
@@ -718,8 +712,6 @@ static int brl_read_config(struct goodix_ts_core *cd, u8 *cfg, int size)
 	if (!cfg)
 		return -EINVAL;
 
-ts_err("zmw---brl_read_config---op");
-
 	cfg_cmd.len = CONFIG_CND_LEN;
 	cfg_cmd.cmd = CONFIG_CMD_READ_START;
 	ret = send_cfg_cmd(cd, &cfg_cmd);
diff --git a/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_core.c b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_core.c
index 0a1fafcd77f58a..4c79b6e86077b0 100644
--- a/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_core.c
+++ b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_core.c
@@ -34,6 +34,219 @@
 struct goodix_module goodix_modules;
 int core_module_prob_sate = CORE_MODULE_UNPROBED;
 
+/*Add by T2M-mingwu.zhang for FP5-187 remarks: Touch parameter scene differentiation.[Begin]*/
+#ifdef CONFIG_PROJECT_FP5
+static int usb_if_err=0;
+static bool tp_if_exist=false;
+static bool tp_if_suspend=false;
+struct goodix_ts_core *global_core_data;
+const u32 CUSTOM_ADDR= 0x10180;
+const static unsigned char CUSTOM_USB_ONLINE_BUF[2][6]=
+{
+	{0x00, 0x00, 0x04, 0x10, 0x14, 0x00},
+    {0x00, 0x00, 0x04, 0x11, 0x15, 0x00},
+};
+const static unsigned char CUSTOM_SCREEN_BUF[3][8]=
+{
+	{0x00, 0x00, 0x06, 0x17, 0x30, 0x00, 0x4D, 0x00},
+	{0x00, 0x00, 0x06, 0x17, 0x70, 0x01, 0x8E, 0x00},
+	{0x00, 0x00, 0x06, 0x17, 0xB0, 0x01, 0xCE, 0x00},
+};
+
+static DEFINE_MUTEX(config_type_mutex);
+static int goodix_ts_switch_config(struct goodix_ts_core *cd, enum GOODIX_IC_CONFIG_TYPE type)
+{
+	struct goodix_ts_hw_ops *hw_ops = cd->hw_ops;
+	struct goodix_ic_config *cfg = NULL;
+	int ret = -EFAULT;
+
+	cfg = cd->ic_configs[type];
+	if (!cfg || cfg->len <= 0) {
+		ts_info("no valid config found type %d", type);
+		return -EINVAL;
+	}
+
+//	mutex_lock(&config_type_mutex);
+	hw_ops->irq_enable(cd, false);
+
+	if (hw_ops->send_config) {
+		ret = hw_ops->send_config(cd, cfg->data, cfg->len);
+		if (!ret)
+			cd->config_type = type;
+	}
+
+	if (type == CFG_TYPE_CHARGE) {
+		ts_debug("ready for sending charge cmd ......");
+		ret = cd->hw_ops->write(cd,
+								CUSTOM_ADDR,
+								(unsigned char *)&CUSTOM_USB_ONLINE_BUF[GOODIX_CUSTOM_CHARGE_CMD],
+								sizeof(CUSTOM_USB_ONLINE_BUF[GOODIX_CUSTOM_CHARGE_CMD]));
+		if(ret)
+			ts_debug("goodix write charge command error,ret=[%d]!",ret);
+	} else {
+		ts_debug("ready for sending nochange cmd ......");
+		ret = cd->hw_ops->write(cd,
+								CUSTOM_ADDR,
+								(unsigned char *)&CUSTOM_USB_ONLINE_BUF[GOODIX_CUSTOM_NONCHARGE_CMD],
+								sizeof(CUSTOM_USB_ONLINE_BUF[GOODIX_CUSTOM_NONCHARGE_CMD]));
+		if(ret)
+			ts_debug("goodix write noncharge command error,ret=[%d]!",ret);		
+	}
+
+	hw_ops->irq_enable(cd, true);
+//	mutex_unlock(&config_type_mutex);
+
+	return ret;
+}
+
+/* set work mode */
+static ssize_t goodix_ts_config_type_store(struct device *dev,
+						struct device_attribute *attr,
+						const char *buf, size_t count)
+{
+	struct goodix_ts_core *core_data = dev_get_drvdata(dev);
+	enum GOODIX_IC_CONFIG_TYPE type;
+	long result;
+	int ret;
+
+	if (kstrtol(buf, 0, &result)) {
+		ts_err("faield get config type");
+		return -EFAULT;
+	}
+	if (result >= GOODIX_MAX_CONFIG_GROUP || result < 0) {
+		ts_err("unsupported config type %ld", result);
+		return -EINVAL;
+	}
+	type = (enum GOODIX_IC_CONFIG_TYPE)result;
+
+	ret = goodix_ts_switch_config(core_data, type);
+	if (ret) {
+		ts_err("failed switch to config type %d", type);
+		ret = -EINVAL;
+	} else {
+		ts_info("success switch to config type %d", type);
+	}
+
+	return ret ? -EINVAL : count;
+}
+
+static void goodix_tpusb_online(struct work_struct *work)
+{
+	struct goodix_ts_core *cd =
+			container_of(work, struct goodix_ts_core, tpusb_online_work);
+	int ret = 0;
+
+	mutex_lock(&config_type_mutex);
+
+	if(tp_if_suspend != false)
+		goto Nothing_happened;
+
+	ts_err("usb_online=[%d]!\n",atomic_read(&cd->usb_online));
+
+	if(atomic_read(&cd->usb_online)){
+		ret = goodix_ts_switch_config(cd, (enum GOODIX_IC_CONFIG_TYPE)CFG_TYPE_CHARGE);			//charging mode
+		if(ret)
+			ts_debug("goodix switch charge config error,ret=[%d]!",ret);									
+	}else{
+		ret = goodix_ts_switch_config(cd, (enum GOODIX_IC_CONFIG_TYPE)CFG_TYPE_NON_CHARGE);		//Non charging mode
+		if(ret)
+			ts_debug("goodix switch noncharge config error,ret=[%d]!",ret);										
+	}
+
+	if(ret){
+		if(usb_if_err){
+			usb_if_err = -EAGAIN; //recover error flage
+			ts_err("resume touch config fail, keep error flage=[%d]!\n",usb_if_err);
+		}else{
+			usb_if_err = -EPERM;
+			ts_err("usb touch config fail, error flage=[%d]!\n",usb_if_err);
+		}
+	}else{
+		usb_if_err = 0;
+		ts_info("Regardless of the previous state, as long as the switch is successful, it will be cleared...");
+	}
+
+Nothing_happened:
+	mutex_unlock(&config_type_mutex);	
+}
+
+static DEFINE_MUTEX(usb_online_mutex);
+void tp_get_usb_online(int online)
+{
+	mutex_lock(&usb_online_mutex);
+	if(tp_if_exist != true){
+		ts_err("The specified touch panel does not exist!\n");
+		goto non_exist;
+	} 
+
+	if (atomic_read(&global_core_data->usb_online) != online){
+		atomic_set(&global_core_data->usb_online,online);
+		schedule_work(&global_core_data->tpusb_online_work);		
+	}else
+		ts_info("tp get usb online ,state is same not changed! \n");
+
+non_exist:
+	ts_debug("USB online finish,tp_if_exist=[%d]\n",tp_if_exist);
+	mutex_unlock(&usb_online_mutex);	
+}
+EXPORT_SYMBOL_GPL(tp_get_usb_online);
+
+static u8 screen_mode=0;
+/* screen mode show */
+static ssize_t goodix_ts_screen_mode_show(struct device *dev,
+				       struct device_attribute *attr,
+				       char *buf)
+{
+	int cnt = 0;
+
+	cnt = snprintf(buf, PAGE_SIZE, "screen mode:%s\n",
+	screen_mode==0 ? "VERTICAL" : screen_mode==1 ? "HORIZONTAL_90" : "HORIZONTAL_270");
+
+	return cnt;
+}
+
+/* screen mode store */
+static ssize_t goodix_ts_screen_mode_store(struct device *dev,
+					struct device_attribute *attr,
+					const char *buf, size_t count)
+{
+	struct goodix_ts_core *core_data = dev_get_drvdata(dev);
+	struct goodix_ts_hw_ops *hw_ops = core_data->hw_ops;
+	int ret = 0;
+
+	if (!buf || count <= 0)
+		return -EINVAL;
+
+	if (buf[0] == '0'){
+		screen_mode = GOODIX_CUSTOM_VERTICAL_SCREEN_CMD;
+		ret = hw_ops->write(core_data,
+							CUSTOM_ADDR,
+							(unsigned char *)&CUSTOM_SCREEN_BUF[GOODIX_CUSTOM_VERTICAL_SCREEN_CMD],
+							sizeof(CUSTOM_SCREEN_BUF[GOODIX_CUSTOM_VERTICAL_SCREEN_CMD]));
+		ts_info("The current mode is VERTICAL!\n");							
+	}else if(buf[0] == '1'){
+		screen_mode = GOODIX_CUSTOM_HORIZONTAL_90_SCREEN_CMD;
+		ret = hw_ops->write(core_data,
+							CUSTOM_ADDR,
+							(unsigned char *)&CUSTOM_SCREEN_BUF[GOODIX_CUSTOM_HORIZONTAL_90_SCREEN_CMD],
+							sizeof(CUSTOM_SCREEN_BUF[GOODIX_CUSTOM_HORIZONTAL_90_SCREEN_CMD]));
+		ts_info("The current mode is HORIZONTAL_90!\n");
+	}else if(buf[0] == '2'){
+		screen_mode = GOODIX_CUSTOM_HORIZONTAL_270_SCREEN_CMD;
+		ret = hw_ops->write(core_data,
+							CUSTOM_ADDR,
+							(unsigned char *)&CUSTOM_SCREEN_BUF[GOODIX_CUSTOM_HORIZONTAL_270_SCREEN_CMD],
+							sizeof(CUSTOM_SCREEN_BUF[GOODIX_CUSTOM_HORIZONTAL_270_SCREEN_CMD]));
+		ts_info("The current mode is HORIZONTAL_270!\n");					
+	}else{
+		ts_err("Invalid parameter value!\n");
+	}
+
+	return count;
+}
+#endif
+/*Add by T2M-mingwu.zhang [End]*/
+
 #if IS_ENABLED(CONFIG_DRM)
 #include <drm/drm_panel.h>
 struct drm_panel *gdix_active_panel;
@@ -847,6 +1060,10 @@ static DEVICE_ATTR(debug_log, 0664,
 		goodix_ts_debug_log_show, goodix_ts_debug_log_store);
 static DEVICE_ATTR(die_info, 0440,
 		die_info_show, NULL);
+static DEVICE_ATTR(config_type, 0220,
+		NULL, goodix_ts_config_type_store);
+static DEVICE_ATTR(screen_mode, 0664,
+		goodix_ts_screen_mode_show, goodix_ts_screen_mode_store);		
 
 static struct attribute *sysfs_attrs[] = {
 	&dev_attr_driver_info.attr,
@@ -859,6 +1076,8 @@ static struct attribute *sysfs_attrs[] = {
 	&dev_attr_esd_info.attr,
 	&dev_attr_debug_log.attr,
 	&dev_attr_die_info.attr,
+	&dev_attr_config_type.attr,
+	&dev_attr_screen_mode.attr,
 	NULL,
 };
 
@@ -1750,20 +1969,12 @@ static int goodix_esd_notifier_callback(struct notifier_block *nb,
 	switch (action) {
 	case NOTIFY_FWUPDATE_START:
 	case NOTIFY_SUSPEND:
-ts_err("zmw---SUSPEND");	
-/*Add by T2M-mingwu.zhang for FP5-195 remarks: Double click on driver update.[Begin]*/	
-//		goodix_ts_power_off(ts_esd->ts_core);
-/*Add by T2M-mingwu.zhang [End]*/
-		break;		
 	case NOTIFY_ESD_OFF:
 		goodix_ts_esd_off(ts_esd->ts_core);
 		break;
 	case NOTIFY_FWUPDATE_FAILED:
 	case NOTIFY_FWUPDATE_SUCCESS:
 	case NOTIFY_RESUME:
-ts_err("zmw---RESUME");	
-		goodix_ts_power_on(ts_esd->ts_core);
-		break;	
 	case NOTIFY_ESD_ON:
 		goodix_ts_esd_on(ts_esd->ts_core);
 		break;
@@ -1874,12 +2085,10 @@ static int goodix_ts_suspend(struct goodix_ts_core *core_data)
 
 	/* enter sleep mode or power off */
 	if (core_data->board_data.sleep_enable)
-		hw_ops->suspend(core_data);
-/*Add by T2M-mingwu.zhang for FP5-195 remarks: Double click on driver update.[Begin]*/	
-/* 	else
-		goodix_ts_power_off(core_data); */
-/*Add by T2M-mingwu.zhang [End]*/		
-
+		hw_ops->suspend(core_data);	
+	else
+		goodix_ts_power_off(core_data);
+		
 	/* inform exteranl modules */
 	mutex_lock(&goodix_modules.mutex);
 	if (!list_empty(&goodix_modules.head)) {
@@ -1902,6 +2111,13 @@ static int goodix_ts_suspend(struct goodix_ts_core *core_data)
 
 out:
 	goodix_ts_release_connects(core_data);
+/*Add by T2M-mingwu.zhang for FP5-187 remarks: Touch parameter scene differentiation.[Begin]*/
+#ifdef CONFIG_PROJECT_FP5
+	usb_if_err = -EPERM;
+	tp_if_suspend = true;
+	ts_err("usb_if_err=[%d],tp_if_suspend=[%d]",usb_if_err,tp_if_suspend);
+#endif
+/*Add by T2M-mingwu.zhang [End]*/
 	ts_info("Suspend end");
 	return 0;
 }
@@ -1949,6 +2165,10 @@ static int goodix_ts_resume(struct goodix_ts_core *core_data)
 	else
 		goodix_ts_power_on(core_data);
 
+	/* recover config */
+/* 	if (core_data->config_type != CONFIG_TYPE_NORMAL)
+		goodix_ts_switch_config(core_data, core_data->config_type); */
+
 	mutex_lock(&goodix_modules.mutex);
 	if (!list_empty(&goodix_modules.head)) {
 		list_for_each_entry_safe(ext_module, next,
@@ -1969,6 +2189,12 @@ static int goodix_ts_resume(struct goodix_ts_core *core_data)
 	mutex_unlock(&goodix_modules.mutex);
 
 out:
+/*Add by T2M-mingwu.zhang for FP5-187 remarks: Touch parameter scene differentiation.[Begin]*/
+	if(tp_if_suspend && (tp_if_exist != false)){	
+		tp_if_suspend = false;	
+		schedule_work(&global_core_data->tpusb_online_work);		
+	}
+/*Add by T2M-mingwu.zhang [End]*/
 	/* enable irq */
 	hw_ops->irq_enable(core_data, true);
 	/* open esd */
@@ -2197,6 +2423,16 @@ int goodix_ts_stage2_init(struct goodix_ts_core *cd)
 	INIT_WORK(&cd->self_check_work, goodix_self_check);
 	schedule_work(&cd->self_check_work);
 
+/*Add by T2M-mingwu.zhang for FP5-187 remarks: Touch parameter scene differentiation.[Begin]*/
+#ifdef CONFIG_PROJECT_FP5
+	INIT_WORK(&cd->tpusb_online_work, goodix_tpusb_online);
+	global_core_data = cd;
+	atomic_set(&cd->usb_online,CFG_TYPE_CHARGE);
+	screen_mode = GOODIX_CUSTOM_VERTICAL_SCREEN_CMD;
+	tp_if_exist=true;
+#endif
+/*Add by T2M-mingwu.zhang [End]*/
+
 	return 0;
 exit:
 	goodix_ts_pen_dev_remove(cd);
@@ -2287,6 +2523,18 @@ static int goodix_later_init_thread(void *data)
 	 * if not we will send config with interactive mode
 	 */
 	goodix_send_ic_config(cd, CONFIG_TYPE_NORMAL);
+	cd->config_type = CONFIG_TYPE_NORMAL;
+
+/*Add by T2M-mingwu.zhang for FP5-187 remarks: Touch parameter scene differentiation.[Begin]*/
+#ifdef CONFIG_PROJECT_FP5
+	ret = cd->hw_ops->write(cd,
+							CUSTOM_ADDR,
+							(unsigned char *)&CUSTOM_USB_ONLINE_BUF[GOODIX_CUSTOM_CHARGE_CMD],
+							sizeof(CUSTOM_USB_ONLINE_BUF[GOODIX_CUSTOM_CHARGE_CMD]));
+	if(ret)
+		ts_err("goodix write charge command error,ret=[%d]!",ret);
+#endif
+/*Add by T2M-mingwu.zhang [End]*/		
 
 	/* init other resources */
 	ret = goodix_ts_stage2_init(cd);
diff --git a/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_core.h b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_core.h
index 7af4bd9848171b..fa4aeeaeea0893 100644
--- a/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_core.h
+++ b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_core.h
@@ -39,6 +39,11 @@
 #include <linux/notifier.h>
 #include <linux/fb.h>
 #endif
+/*Add by T2M-mingwu.zhang for FP5-187 remarks: Touch parameter scene differentiation.[Begin]*/
+#ifdef CONFIG_PROJECT_FP5
+#include <asm/atomic.h>
+#endif
+/*Add by T2M-mingwu.zhang [End]*/
 
 /*Add by T2M-mingwu.zhang for FP5-538 remarks: TP/LCD Device Information Development.[Begin]*/	
 #ifdef CONFIG_EMKIT_INFO
@@ -649,6 +654,25 @@ struct goodix_ic_config {
 	u8 data[GOODIX_CFG_MAX_SIZE];
 };
 
+/*Add by T2M-mingwu.zhang for FP5-187 remarks: Touch parameter scene differentiation.[Begin]*/
+#ifdef CONFIG_PROJECT_FP5
+extern struct goodix_ts_core *global_core_data;
+enum GOODIX_CFG_CHARGE_TYPE {
+	CFG_TYPE_NULL = 0,
+	CFG_TYPE_CHARGE = CONFIG_TYPE_NORMAL,
+	CFG_TYPE_NON_CHARGE = CONFIG_TYPE_HOLSTER,
+};
+enum GOODIX_CUSTOM_TYPE {
+	GOODIX_CUSTOM_CHARGE_CMD=0,
+	GOODIX_CUSTOM_NONCHARGE_CMD,
+	GOODIX_CUSTOM_VERTICAL_SCREEN_CMD=0,
+	GOODIX_CUSTOM_HORIZONTAL_90_SCREEN_CMD,
+	GOODIX_CUSTOM_HORIZONTAL_270_SCREEN_CMD,
+	GOODIX_CUSTOM_MAX_CMD,
+};
+#endif
+/*Add by T2M-mingwu.zhang [End]*/
+
 struct goodix_ts_core {
 	int init_stage;
 	struct platform_device *pdev;
@@ -677,6 +701,9 @@ struct goodix_ts_core {
 
 	atomic_t irq_enabled;
 	atomic_t suspended;
+
+	/* target config type enum GOODIX_IC_CONFIG_TYPE */
+	enum GOODIX_IC_CONFIG_TYPE config_type;
 	/* when this flag is true, driver should not clean the sync flag */
 	bool tools_ctrl_sync;
 
@@ -686,6 +713,13 @@ struct goodix_ts_core {
 #if (IS_ENABLED(CONFIG_FB) || IS_ENABLED(CONFIG_DRM))
 	struct notifier_block fb_notifier;
 #endif
+
+/*Add by T2M-mingwu.zhang for FP5-187 remarks: Touch parameter scene differentiation.[Begin]*/
+#ifdef CONFIG_PROJECT_FP5
+	struct work_struct tpusb_online_work;
+	atomic_t usb_online;
+#endif
+/*Add by T2M-mingwu.zhang [End]*/
 };
 
 /* external module structures */

From a6aa8bf32c849ad8eb033d26a1e6e3c37d364a80 Mon Sep 17 00:00:00 2001
From: Luca Weiss <luca.weiss@fairphone.com>
Date: Mon, 3 Apr 2023 23:26:11 +0200
Subject: [PATCH 691/707] Input - goodix_berlin_a_driver: fix compile

---
 .../touchscreen/goodix_berlin_a_driver/goodix_brl_i2c.c     | 6 ++----
 .../touchscreen/goodix_berlin_a_driver/goodix_brl_spi.c     | 3 +--
 .../touchscreen/goodix_berlin_a_driver/goodix_ts_core.c     | 6 +++---
 .../touchscreen/goodix_berlin_a_driver/goodix_ts_inspect.c  | 3 ++-
 4 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_brl_i2c.c b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_brl_i2c.c
index c88afc212efc84..117fc995a7f6d9 100644
--- a/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_brl_i2c.c
+++ b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_brl_i2c.c
@@ -163,8 +163,7 @@ static void goodix_pdev_release(struct device *dev)
 	kfree(goodix_pdev);
 }
 
-static int goodix_i2c_probe(struct i2c_client *client,
-	const struct i2c_device_id *dev_id)
+static int goodix_i2c_probe(struct i2c_client *client)
 {
 	int ret = 0;
 
@@ -217,10 +216,9 @@ static int goodix_i2c_probe(struct i2c_client *client,
 	return ret;
 }
 
-static int goodix_i2c_remove(struct i2c_client *client)
+static void goodix_i2c_remove(struct i2c_client *client)
 {
 	platform_device_unregister(goodix_pdev);
-	return 0;
 }
 
 #ifdef CONFIG_OF
diff --git a/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_brl_spi.c b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_brl_spi.c
index f1a365c22d0642..e7a887298242aa 100644
--- a/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_brl_spi.c
+++ b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_brl_spi.c
@@ -253,10 +253,9 @@ static int goodix_spi_probe(struct spi_device *spi)
 	return ret;
 }
 
-static int goodix_spi_remove(struct spi_device *spi)
+static void goodix_spi_remove(struct spi_device *spi)
 {
 	platform_device_unregister(goodix_pdev);
-	return 0;
 }
 
 #ifdef CONFIG_OF
diff --git a/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_core.c b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_core.c
index 4c79b6e86077b0..7ee0592c9bb6ee 100644
--- a/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_core.c
+++ b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_core.c
@@ -1155,7 +1155,7 @@ static int rawdata_proc_show(struct seq_file *m, void *v)
 static int rawdata_proc_open(struct inode *inode, struct file *file)
 {
 	return single_open_size(file, rawdata_proc_show,
-			PDE_DATA(inode), PAGE_SIZE * 10);
+			pde_data(inode), PAGE_SIZE * 10);
 }
 
 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0))
@@ -2230,7 +2230,7 @@ int goodix_ts_fb_notifier_callback(struct notifier_block *self,
 }
 #endif
 
-#if IS_ENABLED(CONFIG_DRM)
+#if 0 // IS_ENABLED(CONFIG_DRM)
 int goodix_ts_drm_notifier_callback(struct notifier_block *self,
         unsigned long event, void *data)
 {
@@ -2392,7 +2392,7 @@ int goodix_ts_stage2_init(struct goodix_ts_core *cd)
 	cd->fb_notifier.notifier_call = goodix_ts_fb_notifier_callback;
 	if (fb_register_client(&cd->fb_notifier))
 		ts_err("Failed to register fb notifier client:%d", ret);
-#elif IS_ENABLED(CONFIG_DRM)	
+#elif 0 // IS_ENABLED(CONFIG_DRM)	
 	cd->fb_notifier.notifier_call = goodix_ts_drm_notifier_callback;
 	if (gdix_active_panel) {	
 		ret = drm_panel_notifier_register(gdix_active_panel,
diff --git a/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_inspect.c b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_inspect.c
index 67eff2ac4caf72..7b9c9a152f671c 100644
--- a/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_inspect.c
+++ b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_ts_inspect.c
@@ -21,6 +21,7 @@
 #include <linux/version.h>
 #include <linux/proc_fs.h>
 #include <linux/fs.h>
+#include <linux/proc_fs.h>
 #include <asm/uaccess.h>
 
 
@@ -2903,7 +2904,7 @@ static int auto_test_result_show(struct seq_file *m, void *v)
 static int auto_test_open(struct inode *inode, struct file *file)
 {
 	return single_open_size(file, auto_test_result_show,
-			PDE_DATA(inode), DEFAULT_SEQ_FILE_SIZE);
+			pde_data(inode), DEFAULT_SEQ_FILE_SIZE);
 }
 
 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0))

From 0b1c42e99aa5e75a84fbdc1370b62b9aed24914b Mon Sep 17 00:00:00 2001
From: Luca Weiss <luca.weiss@fairphone.com>
Date: Mon, 21 Aug 2023 14:39:20 +0200
Subject: [PATCH 692/707] Input - goodix_berlin_a_driver: remove extra newlines

The ts_err macro already adds a newline after each print.
---
 .../input/touchscreen/goodix_berlin_a_driver/goodix_brl_hw.c  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_brl_hw.c b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_brl_hw.c
index 178b4912ce325d..56347560d261d5 100644
--- a/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_brl_hw.c
+++ b/drivers/input/touchscreen/goodix_berlin_a_driver/goodix_brl_hw.c
@@ -239,7 +239,7 @@ ts_err("zmw---brl_power_on---op");
 			gpio_direction_output(iovdd_gpio, 1);
 		} else if (cd->iovdd) {			
 			ret = regulator_enable(cd->iovdd);
-ts_err("zmw:name=[%s] line=[%d] iovdd \n",__func__,__LINE__);			
+ts_err("zmw:name=[%s] line=[%d] iovdd",__func__,__LINE__);
 			if (ret < 0) {
 				ts_err("Failed to enable iovdd:%d", ret);
 				goto power_off;
@@ -252,7 +252,7 @@ ts_err("zmw:name=[%s] line=[%d] iovdd \n",__func__,__LINE__);
 			gpio_direction_output(avdd_gpio, 1);
 		} else if (cd->avdd) {
 			ret = regulator_enable(cd->avdd);	
-ts_err("zmw:name=[%s] line=[%d] avdd \n",__func__,__LINE__);					
+ts_err("zmw:name=[%s] line=[%d] avdd",__func__,__LINE__);
 			if (ret < 0) {
 				ts_err("Failed to enable avdd:%d", ret);
 				goto power_off;

From 1cce82f4050ab46a084d090f6aa3f85e9f8b3551 Mon Sep 17 00:00:00 2001
From: Danila Tikhonov <danila@jiaxyga.com>
Date: Sun, 28 Jan 2024 00:48:53 +0300
Subject: [PATCH 693/707] arm64: dts: qcom: Add SM8475 device tree

The Qualcomm Snapdragon 8+ Gen 1 (SM8475) is software-wise similar to
the Qualcomm Snapdragon 8 Gen 1 (SM8450) with minor differences.

Signed-off-by: Danila Tikhonov <danila@jiaxyga.com>
---
 arch/arm64/boot/dts/qcom/sm8475.dtsi | 76 ++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 arch/arm64/boot/dts/qcom/sm8475.dtsi

diff --git a/arch/arm64/boot/dts/qcom/sm8475.dtsi b/arch/arm64/boot/dts/qcom/sm8475.dtsi
new file mode 100644
index 00000000000000..5d41f6ef2ae052
--- /dev/null
+++ b/arch/arm64/boot/dts/qcom/sm8475.dtsi
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2024, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2024, Danila Tikhonov <danila@jiaxyga.com>
+ */
+
+#include "sm8450.dtsi"
+
+/delete-node/ &usb_1_hsphy;
+/delete-node/ &ufs_mem_phy;
+
+&soc {
+	usb_1_hsphy: phy@88e3000 {
+		compatible = "qcom,sm8475-snps-eusb2-phy",
+			     "qcom,sm8550-snps-eusb2-phy";
+		reg = <0 0x088e3000 0 0x154>;
+		#phy-cells = <0>;
+
+		clocks = <&rpmhcc RPMH_CXO_CLK>;
+		clock-names = "ref";
+
+		resets = <&gcc GCC_QUSB2PHY_PRIM_BCR>;
+
+		status = "disabled";
+	};
+
+	ufs_mem_phy: phy@1d80000 {
+		compatible = "qcom,sm8475-qmp-ufs-phy";
+		reg = <0 0x1d80000 0 0x2000>;
+
+		clock-names = "ref", "ref_aux", "qref";
+		clocks = <&rpmhcc RPMH_CXO_CLK>,
+			 <&gcc GCC_UFS_PHY_PHY_AUX_CLK>,
+			 <&gcc GCC_UFS_0_CLKREF_EN>;
+
+		resets = <&ufs_mem_hc 0>;
+		reset-names = "ufsphy";
+
+		#clock-cells = <1>;
+		#phy-cells = <0>;
+
+		status = "disabled";
+	};
+};
+
+&gcc {
+	compatible = "qcom,gcc-sm8475";
+};
+
+&gpucc {
+	compatible = "qcom,sm8475-gpucc";
+};
+
+&videocc {
+	compatible = "qcom,sm8475-videocc";
+};
+
+&camcc {
+	compatible = "qcom,sm8475-camcc";
+};
+
+&dispcc {
+	compatible = "qcom,sm8475-dispcc";
+};
+
+&ufs_mem_hc {
+	freq-table-hz =
+		<75000000 850000000>,
+		<0 0>,
+		<0 0>,
+		<75000000 850000000>,
+		<75000000 850000000>,
+		<0 0>,
+		<0 0>,
+		<0 0>;
+};

From 629c304d56fbd096d99b6e8308cbd25e990c0f34 Mon Sep 17 00:00:00 2001
From: Danila Tikhonov <danila@jiaxyga.com>
Date: Sun, 28 Jan 2024 00:50:59 +0300
Subject: [PATCH 694/707] arm64: dts: qcom: Add device-tree for Nothing Phone 2

Add device tree for the Nothing Phone 2 (Pong) smartphone.
This device is based on Qualcomm Snapdragon 8+ Gen 1 (SM8475) SoC.

Signed-off-by: Danila Tikhonov <danila@jiaxyga.com>
---
 arch/arm64/boot/dts/qcom/Makefile             |   1 +
 .../boot/dts/qcom/sm8475-nothing-pong.dts     | 560 ++++++++++++++++++
 2 files changed, 561 insertions(+)
 create mode 100644 arch/arm64/boot/dts/qcom/sm8475-nothing-pong.dts

diff --git a/arch/arm64/boot/dts/qcom/Makefile b/arch/arm64/boot/dts/qcom/Makefile
index 4e2fe85a4bb66b..42c17125c5b858 100644
--- a/arch/arm64/boot/dts/qcom/Makefile
+++ b/arch/arm64/boot/dts/qcom/Makefile
@@ -234,6 +234,7 @@ dtb-$(CONFIG_ARCH_QCOM)	+= sm8450-qrd.dtb
 dtb-$(CONFIG_ARCH_QCOM)	+= sm8450-sony-xperia-nagara-pdx223.dtb
 dtb-$(CONFIG_ARCH_QCOM)	+= sm8450-sony-xperia-nagara-pdx224.dtb
 dtb-$(CONFIG_ARCH_QCOM)	+= sm8450-xiaomi-cupid.dtb
+dtb-$(CONFIG_ARCH_QCOM)	+= sm8475-nothing-pong.dtb
 dtb-$(CONFIG_ARCH_QCOM)	+= sm8550-hdk.dtb
 dtb-$(CONFIG_ARCH_QCOM)	+= sm8550-mtp.dtb
 dtb-$(CONFIG_ARCH_QCOM)	+= sm8550-qrd.dtb
diff --git a/arch/arm64/boot/dts/qcom/sm8475-nothing-pong.dts b/arch/arm64/boot/dts/qcom/sm8475-nothing-pong.dts
new file mode 100644
index 00000000000000..4be3995041fcdc
--- /dev/null
+++ b/arch/arm64/boot/dts/qcom/sm8475-nothing-pong.dts
@@ -0,0 +1,560 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2024, Danila Tikhonov <danila@jiaxyga.com>
+ */
+/dts-v1/;
+
+#include <dt-bindings/arm/qcom,ids.h>
+#include <dt-bindings/regulator/qcom,rpmh-regulator.h>
+#include <dt-bindings/leds/common.h>
+
+#include "sm8475.dtsi"
+#include "pm8350.dtsi"
+#include "pm8350b.dtsi"
+#include "pm8350c.dtsi"
+#include "pm8450.dtsi"
+#include "pmk8350.dtsi"
+#include "pmr735a.dtsi"
+
+/delete-node/ &rmtfs_mem;
+/delete-node/ &video_mem;
+/delete-node/ &adsp_mem;
+
+/ {
+	model = "Nothing Phone 2";
+	compatible = "nothing,Pong", "qcom,sm8475";
+	chassis-type = "handset";
+
+	reserved-memory {
+		adsp_mem: memory@85700000 {
+			reg = <0x0 0x85700000 0x0 0x2800000>;
+			no-map;
+		};
+
+		ramoops@85200000 {
+			compatible = "ramoops";
+			reg = <0x0 0x85200000 0x0 0x400000>;
+
+			record-size = <0x200000>;
+			pmsg-size = <0x200000>;
+			console-size = <0x200000>;
+			no-map;
+		};
+
+		video_mem: memory@9fd00000 {
+			reg = <0x0 0x9fd00000 0x0 0x700000>;
+			no-map;
+		};
+
+		/*
+		 * bootloader_log_region: reg = <0x0 0xa7605000 0x0 0x8000>;
+		 * splash_region: reg = <0x0 0xb8000000 0x0 0x2b00000>;
+		 */
+
+		rmtfs_mem: memory@fc700000 {
+			compatible = "qcom,rmtfs-mem";
+			reg = <0x0 0xf3300000 0x0 0x280000>;
+			no-map;
+
+			qcom,client-id = <1>;
+			qcom,vmid = <QCOM_SCM_VMID_MSS_MSA>;
+		};
+	};
+
+	chosen {
+		#address-cells = <2>;
+		#size-cells = <2>;
+		ranges;
+
+		bootargs = "PMOS_NOSPLASH console=tty0";
+	};
+};
+
+&apps_rsc {
+	regulators-0 {
+		compatible = "qcom,pm8350-rpmh-regulators";
+		qcom,pmic-id = "b";
+
+		/*
+		vdd-s1-supply = <&vph_pwr>;
+		vdd-s2-supply = <&vph_pwr>;
+		vdd-s3-supply = <&vph_pwr>;
+		vdd-s4-supply = <&vph_pwr>;
+		vdd-s5-supply = <&vph_pwr>;
+		vdd-s6-supply = <&vph_pwr>;
+		vdd-s7-supply = <&vph_pwr>;
+		vdd-s8-supply = <&vph_pwr>;
+		vdd-s9-supply = <&vph_pwr>;
+		vdd-s10-supply = <&vph_pwr>;
+		vdd-s11-supply = <&vph_pwr>;
+		vdd-s12-supply = <&vph_pwr>;
+
+		vdd-l1-l4-supply = <&pm8350_s11>;
+		vdd-l2-l7-supply = <&vreg_bob>;
+		vdd-l3-l5-supply = <&pm8350_s11>;
+		vdd-l6-l9-l10-supply = <&pm8350_s12>;
+		*/
+
+		/*
+		 * ARC regulators:
+		 * s5 - gfx.lvl
+		 * l8 - lcx.lvl
+		 */
+
+		pm8350_s10: smps10 {
+			regulator-name = "pm8350_s10";
+			regulator-min-microvolt = <1800000>;
+			regulator-max-microvolt = <1800000>;
+		};
+
+		pm8350_s11: smps11 {
+			regulator-name = "pm8350_s11";
+			regulator-min-microvolt = <382000>;
+			regulator-max-microvolt = <1170000>;
+		};
+
+		pm8350_s12: smps12 {
+			regulator-name = "pm8350_s12";
+			regulator-min-microvolt = <1224000>;
+			regulator-max-microvolt = <2040000>;
+		};
+
+		pm8350_l1: ldo1 {
+			regulator-name = "pm8350_l1";
+			regulator-min-microvolt = <830000>;
+			regulator-max-microvolt = <920000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350_l2: ldo2 {
+			regulator-name = "pm8350_l2";
+			regulator-min-microvolt = <2700000>;
+			regulator-max-microvolt = <3300000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350_l3: ldo3 {
+			regulator-name = "pm8350_l3";
+			regulator-min-microvolt = <870000>;
+			regulator-max-microvolt = <970000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350_l5: ldo5 {
+			regulator-name = "pm8350_l5";
+			regulator-min-microvolt = <720000>;
+			regulator-max-microvolt = <958000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350_l6: ldo6 {
+			regulator-name = "pm8350_l6";
+			regulator-min-microvolt = <1200000>;
+			regulator-max-microvolt = <1216000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350_l7: ldo7 {
+			regulator-name = "pm8350_l7";
+			regulator-min-microvolt = <2400000>;
+			regulator-max-microvolt = <3544000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350_l9: ldo9 {
+			regulator-name = "pm8350_l9";
+			regulator-min-microvolt = <1200000>;
+			regulator-max-microvolt = <1304000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+	};
+
+	regulators-1 {
+		compatible = "qcom,pm8350c-rpmh-regulators";
+		qcom,pmic-id = "c";
+
+		/*
+		vdd-s1-supply = <&vph_pwr>;
+		vdd-s2-supply = <&vph_pwr>;
+		vdd-s3-supply = <&vph_pwr>;
+		vdd-s4-supply = <&vph_pwr>;
+		vdd-s5-supply = <&vph_pwr>;
+		vdd-s6-supply = <&vph_pwr>;
+		vdd-s7-supply = <&vph_pwr>;
+		vdd-s8-supply = <&vph_pwr>;
+		vdd-s9-supply = <&vph_pwr>;
+		vdd-s10-supply = <&vph_pwr>;
+
+		vdd-l1-l12-supply = <&pm8350c_s1>;
+		vdd-l2-l8-supply = <&pm8350c_s1>;
+		vdd-l3-l4-l5-l7-l13-supply = <&vreg_bob>;
+		vdd-l6-l9-l11-supply = <&vreg_bob>;
+		vdd-l10-supply = <&pm8350_s12>;
+
+		vdd-bob-supply = <&vph_pwr>;
+		*/
+
+		/*
+		 * ARC regulators:
+		 * s2 - mxc.lvl
+		 * s4 - mss.lvl
+		 * s6 - cx.lvl
+		 */
+
+		pm8350c_s1: smps1 {
+			regulator-name = "pm8350c_s1";
+			regulator-min-microvolt = <1900000>;
+			regulator-max-microvolt = <2024000>;
+		};
+
+		pm8350c_s10: smps10 {
+			regulator-name = "pm8350c_s10";
+			regulator-min-microvolt = <1052000>;
+			regulator-max-microvolt = <1170000>;
+		};
+
+		vreg_bob: bob {
+			regulator-name = "vreg_bob";
+			regulator-min-microvolt = <3008000>;
+			regulator-max-microvolt = <3960000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_AUTO>;
+		};
+
+		pm8350c_l2: ldo2 {
+			regulator-name = "pm8350c_l2";
+			regulator-min-microvolt = <1800000>;
+			regulator-max-microvolt = <1980000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l3: ldo3 {
+			regulator-name = "pm8350c_l3";
+			regulator-min-microvolt = <2700000>;
+			regulator-max-microvolt = <3544000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l4: ldo4 {
+			regulator-name = "pm8350c_l4";
+			regulator-min-microvolt = <1620000>;
+			regulator-max-microvolt = <3300000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l5: ldo5 {
+			regulator-name = "pm8350c_l5";
+			regulator-min-microvolt = <1620000>;
+			regulator-max-microvolt = <3300000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l6: ldo6 {
+			regulator-name = "pm8350c_l6";
+			regulator-min-microvolt = <1800000>;
+			regulator-max-microvolt = <3544000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l7: ldo7 {
+			regulator-name = "pm8350c_l7";
+			regulator-min-microvolt = <3000000>;
+			regulator-max-microvolt = <3544000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l8: ldo8 {
+			regulator-name = "pm8350c_l8";
+			regulator-min-microvolt = <1800000>;
+			regulator-max-microvolt = <2000000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l9: ldo9 {
+			regulator-name = "pm8350c_l9";
+			regulator-min-microvolt = <2700000>;
+			regulator-max-microvolt = <3544000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l10: ldo10 {
+			regulator-name = "pm8350c_l10";
+			regulator-min-microvolt = <1200000>;
+			regulator-max-microvolt = <1304000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l12: ldo12 {
+			regulator-name = "pm8350c_l12";
+			regulator-min-microvolt = <1800000>;
+			regulator-max-microvolt = <1980000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l13: ldo13 {
+			regulator-name = "pm8350c_l13";
+			regulator-min-microvolt = <2700000>;
+			regulator-max-microvolt = <3544000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+	};
+
+	regulators-2 {
+		compatible = "qcom,pm8450-rpmh-regulators";
+		qcom,pmic-id = "h";
+
+		/*
+		vdd-s1-supply = <&vph_pwr>;
+		vdd-s2-supply = <&vph_pwr>;
+		vdd-s3-supply = <&vph_pwr>;
+		vdd-s4-supply = <&vph_pwr>;
+		vdd-s5-supply = <&vph_pwr>;
+		vdd-s6-supply = <&vph_pwr>;
+
+		vdd-l2-supply = <&vreg_bob>;
+		vdd-l3-supply = <&vreg_bob>;
+		vdd-l4-supply = <&vreg_bob>;
+		*/
+
+		/*
+		 * ARC regulators:
+		 * S2 - ebi.lvl
+		 * S4 - mmcx.lvl
+		 * S6 - mx.lvl
+		 * L1 - lmx.lvl
+		 */
+
+		pm8450_s3: smps3 {
+			regulator-name = "pm8450_s3";
+			regulator-min-microvolt = <470000>;
+			regulator-max-microvolt = <570000>;
+		};
+
+		pm8450_l2: ldo2 {
+			regulator-name = "pm8450_l2";
+			regulator-min-microvolt = <820000>;
+			regulator-max-microvolt = <958000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8450_l3: ldo3 {
+			regulator-name = "pm8450_l3";
+			regulator-min-microvolt = <866000>;
+			regulator-max-microvolt = <958000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+	};
+
+	regulators-3 {
+		compatible = "qcom,pmr735a-rpmh-regulators";
+		qcom,pmic-id = "e";
+
+		/*
+		vdd-s1-supply = <&vph_pwr>;
+		vdd-s2-supply = <&vph_pwr>;
+		vdd-s3-supply = <&vph_pwr>;
+
+		vdd-l1-l2-supply = <&pmr735a_s2>;
+		vdd-l3-supply = <&pmr735a_s1>;
+		vdd-l4-supply = <&pm8350c_s1>;
+		vdd-l5-l6-supply = <&pm8350c_s1>;
+		vdd-l7-bob-supply = <&vreg_bob>;
+		*/
+
+		pmr735a_s2: smps2 {
+			regulator-name = "pmr735a_s2";
+			regulator-min-microvolt = <500000>;
+			regulator-max-microvolt = <1040000>;
+		};
+
+		pmr735a_s3: smps3 {
+			regulator-name = "pmr735a_s3";
+			regulator-min-microvolt = <300000>;
+			regulator-max-microvolt = <2352000>;
+		};
+
+		pmr735a_l1: ldo1 {
+			regulator-name = "pmr735a_l1";
+			regulator-min-microvolt = <800000>;
+			regulator-max-microvolt = <880000>;
+		};
+
+		pmr735a_l2: ldo2 {
+			regulator-name = "pmr735a_l2";
+			regulator-min-microvolt = <480000>;
+			regulator-max-microvolt = <1200000>;
+		};
+
+		pmr735a_l3: ldo3 {
+			regulator-name = "pmr735a_l3";
+			regulator-min-microvolt = <1200000>;
+			regulator-max-microvolt = <1200000>;
+		};
+
+		pmr735a_l4: ldo4 {
+			regulator-name = "pmr735a_l4";
+			regulator-min-microvolt = <1776000>;
+			regulator-max-microvolt = <1776000>;
+		};
+
+		pmr735a_l5: ldo5 {
+			regulator-name = "pmr735a_l5";
+			regulator-min-microvolt = <880000>;
+			regulator-max-microvolt = <920000>;
+		};
+
+		pmr735a_l6: ldo6 {
+			regulator-name = "pmr735a_l6";
+			regulator-min-microvolt = <1200000>;
+			regulator-max-microvolt = <1200000>;
+		};
+
+		pmr735a_l7: ldo7 {
+			regulator-name = "pmr735a_l7";
+			regulator-min-microvolt = <2800000>;
+			regulator-max-microvolt = <2800000>;
+		};
+	};
+};
+
+&gpi_dma0 {
+	status = "okay";
+};
+
+&gpi_dma1 {
+	status = "okay";
+};
+
+&gpi_dma2 {
+	status = "okay";
+};
+
+&i2c5 {
+	clock-frequency = <100000>;
+	status = "okay";
+
+	/* nq@64 rtc6226 */
+	/* fsa4480@42 qcom,fsa4480-i2c */
+	/* redriver@1c onnn,redriver */
+	/* aw20036_led@3a awinic,aw20036_led */
+	/* eusb2_repeater@4f nxp,eusb2-repeater */
+
+	nxp_eusb2_repeater: eusb2_repeater@4f {
+		compatible = "nxp,eusb2-repeater";
+		reg = <0x4f>;
+		vdd18-supply = <&pm8350_s10>;
+		vdd3-supply = <&pm8350_l2>;
+		reset-gpio = <&pm8350c_gpios 7 GPIO_ACTIVE_HIGH>;
+		pinctrl-names = "default";
+		pinctrl-0 = <&eusb2_reset_ctrl_default>;
+		#phy-cells = <0>;
+		qcom,param-override-seq =
+				/* Rx squelch detection threshold to 110mV; default is 125mV */
+				<0x40 0x06
+				/*
+				 * Tx Deemphasis to 2dB, Tx Deemphasis bit duration to 0.8UI;
+				 * default is 0 for both
+				 */
+				0x22 0x07
+				/* Output Voltage Swing to 550mV; default is 450mV */
+				0x64 0x08>;
+	};
+};
+
+&i2c9 {
+	clock-frequency = <1000000>;
+	status = "okay";
+
+	/* nq@28(ts) qcom,sn-nci */
+};
+
+&i2c13 {
+	clock-frequency = <1000000>;
+	status = "okay";
+
+	/* tfa98xx@34 */
+	/* tfa98xx@35 */
+};
+
+&i2c18 {
+	clock-frequency = <1000000>;
+	status = "okay";
+
+	/* haptic_hv@5a awinic,aw8692x */
+};
+
+&pm8350c_gpios {
+	eusb2_reset_ctrl_default: eusb2_reset_ctrl_default {
+		pins = "gpio7";
+		function = "normal";
+		input-enable;
+		output-enable;
+		bias-disable;
+		power-source = <1>;	/* 1.8V */
+		qcom,drive-strength = <2>;
+	};
+};
+
+&qupv3_id_0 {
+	status = "okay";
+};
+
+&qupv3_id_1 {
+	status = "okay";
+};
+
+&qupv3_id_2 {
+	status = "okay";
+};
+
+&spi4 {
+	clock-frequency = <20000000>;
+	status = "okay";
+
+	/* goodix-berlin@0 goodix,brl-d */
+};
+
+&tlmm {
+	gpio-reserved-ranges = <28 4>;
+};
+
+&ufs_mem_hc {
+	status = "okay";
+
+	reset-gpios = <&tlmm 210 GPIO_ACTIVE_LOW>;
+
+	vcc-supply = <&pm8350_l7>;
+	vcc-max-microamp = <1100000>;
+	vccq-supply = <&pm8350_l9>;
+	vccq-max-microamp = <1200000>;
+};
+
+&ufs_mem_phy {
+	status = "okay";
+
+	vdda-phy-supply = <&pm8350_l5>;
+	vdda-pll-supply = <&pm8350c_l10>;
+};
+
+&usb_1 {
+	/* USB 2.0 only */
+	qcom,select-utmi-as-pipe-clk;
+	status = "okay";
+};
+
+&usb_1_dwc3 {
+	dr_mode = "peripheral";
+	maximum-speed = "high-speed";
+	/* Remove USB3 phy */
+	phys = <&usb_1_hsphy>;
+	phy-names = "usb2-phy";
+};
+
+&usb_1_hsphy {
+	vdd-supply = <&pm8350_l5>;
+	vdda12-supply = <&pm8350c_l10>;
+
+	phys = <&nxp_eusb2_repeater>;
+
+	status = "okay";
+};

From 54b59571b83995e115e518495cf8c18e81bc0e47 Mon Sep 17 00:00:00 2001
From: Danila Tikhonov <danila@jiaxyga.com>
Date: Mon, 29 Jan 2024 14:58:17 +0300
Subject: [PATCH 695/707] drm: panel: Generate
 panel-visionox-nt37705-amoled-120hz driver

Signed-off-by: Danila Tikhonov <danila@jiaxyga.com>
---
 drivers/gpu/drm/panel/Kconfig                 |  10 +
 drivers/gpu/drm/panel/Makefile                |   1 +
 .../panel-visionox-nt37705-amoled-120hz.c     | 672 ++++++++++++++++++
 3 files changed, 683 insertions(+)
 create mode 100644 drivers/gpu/drm/panel/panel-visionox-nt37705-amoled-120hz.c

diff --git a/drivers/gpu/drm/panel/Kconfig b/drivers/gpu/drm/panel/Kconfig
index 250841246ae881..77870be939afcd 100644
--- a/drivers/gpu/drm/panel/Kconfig
+++ b/drivers/gpu/drm/panel/Kconfig
@@ -837,6 +837,16 @@ config DRM_PANEL_TRULY_NT35597_WQXGA
 	  Say Y here if you want to enable support for Truly NT35597 WQXGA Dual DSI
 	  Video Mode panel
 
+config DRM_PANEL_VISIONOX_NT37705
+	tristate "Visionox nt37705"
+	depends on OF
+	depends on DRM_MIPI_DSI
+	depends on BACKLIGHT_CLASS_DEVICE
+	select VIDEOMODE_HELPERS
+	help
+	  Say Y here if you want to enable support for Visionox
+	  nt37705 (2412x1080@120Hz) DSI DSC CMD Mode panel.
+
 config DRM_PANEL_VISIONOX_RM69299
 	tristate "Visionox RM69299"
 	depends on OF
diff --git a/drivers/gpu/drm/panel/Makefile b/drivers/gpu/drm/panel/Makefile
index 471bfcdf590fc7..9113a8622ace6f 100644
--- a/drivers/gpu/drm/panel/Makefile
+++ b/drivers/gpu/drm/panel/Makefile
@@ -85,6 +85,7 @@ obj-$(CONFIG_DRM_PANEL_TPO_TD028TTEC1) += panel-tpo-td028ttec1.o
 obj-$(CONFIG_DRM_PANEL_TPO_TD043MTEA1) += panel-tpo-td043mtea1.o
 obj-$(CONFIG_DRM_PANEL_TPO_TPG110) += panel-tpo-tpg110.o
 obj-$(CONFIG_DRM_PANEL_TRULY_NT35597_WQXGA) += panel-truly-nt35597.o
+obj-$(CONFIG_DRM_PANEL_VISIONOX_NT37705) += panel-visionox-nt37705-amoled-120hz.o
 obj-$(CONFIG_DRM_PANEL_VISIONOX_RM69299) += panel-visionox-rm69299.o
 obj-$(CONFIG_DRM_PANEL_VISIONOX_VTDR6130) += panel-visionox-vtdr6130.o
 obj-$(CONFIG_DRM_PANEL_VISIONOX_R66451) += panel-visionox-r66451.o
diff --git a/drivers/gpu/drm/panel/panel-visionox-nt37705-amoled-120hz.c b/drivers/gpu/drm/panel/panel-visionox-nt37705-amoled-120hz.c
new file mode 100644
index 00000000000000..704e325abedb66
--- /dev/null
+++ b/drivers/gpu/drm/panel/panel-visionox-nt37705-amoled-120hz.c
@@ -0,0 +1,672 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (c) 2024 FIXME
+// Generated with linux-mdss-dsi-panel-driver-generator from vendor device tree:
+//   Copyright (c) 2013, The Linux Foundation. All rights reserved. (FIXME)
+
+#include <linux/backlight.h>
+#include <linux/delay.h>
+#include <linux/gpio/consumer.h>
+#include <linux/module.h>
+#include <linux/of.h>
+
+#include <video/mipi_display.h>
+
+#include <drm/display/drm_dsc.h>
+#include <drm/display/drm_dsc_helper.h>
+#include <drm/drm_mipi_dsi.h>
+#include <drm/drm_modes.h>
+#include <drm/drm_panel.h>
+
+struct nt37705_visionox_amoled_120hz {
+	struct drm_panel panel;
+	struct mipi_dsi_device *dsi;
+	struct drm_dsc_config dsc;
+	struct gpio_desc *reset_gpio;
+	bool prepared;
+};
+
+static inline
+struct nt37705_visionox_amoled_120hz *to_nt37705_visionox_amoled_120hz(struct drm_panel *panel)
+{
+	return container_of(panel, struct nt37705_visionox_amoled_120hz, panel);
+}
+
+static void nt37705_visionox_amoled_120hz_reset(struct nt37705_visionox_amoled_120hz *ctx)
+{
+	gpiod_set_value_cansleep(ctx->reset_gpio, 0);
+	usleep_range(10000, 11000);
+	gpiod_set_value_cansleep(ctx->reset_gpio, 1);
+	usleep_range(1000, 2000);
+	gpiod_set_value_cansleep(ctx->reset_gpio, 0);
+	msleep(20);
+}
+
+static int nt37705_visionox_amoled_120hz_on(struct nt37705_visionox_amoled_120hz *ctx)
+{
+	struct mipi_dsi_device *dsi = ctx->dsi;
+	struct device *dev = &dsi->dev;
+	int ret;
+
+	dsi->mode_flags |= MIPI_DSI_MODE_LPM;
+
+	mipi_dsi_dcs_write_seq(dsi, 0xf0, 0x55, 0xaa, 0x52, 0x08, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xb5, 0x80, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xf0, 0x55, 0xaa, 0x52, 0x08, 0x07);
+	mipi_dsi_dcs_write_seq(dsi, 0xc0, 0x87, 0x01, 0x08);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x03);
+	mipi_dsi_dcs_write_seq(dsi, 0xc0,
+			       0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x00, 0x00,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xc1,
+			       0x30, 0x0f, 0x36, 0x64, 0x3b, 0x19, 0x8e, 0x9c,
+			       0x3f, 0xc6, 0x26, 0x08, 0x3f, 0xc3, 0xfa, 0x9c,
+			       0x00, 0x0f, 0x3d, 0x5e, 0x64);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x15);
+	mipi_dsi_dcs_write_seq(dsi, 0xc1,
+			       0x21, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x82, 0x33,
+			       0xf0, 0x7e, 0x85, 0x3e, 0x76, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x25);
+	mipi_dsi_dcs_write_seq(dsi, 0xc1,
+			       0x22, 0x19, 0x1f, 0x00, 0x21, 0x2d, 0x00, 0x00,
+			       0x00, 0x21, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xc2,
+			       0x36, 0x00, 0x36, 0x64, 0x3b, 0x19, 0x71, 0x64,
+			       0x00, 0x05, 0x95, 0xe0, 0x3f, 0x8d, 0x7f, 0x90,
+			       0x00, 0x00, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x15);
+	mipi_dsi_dcs_write_seq(dsi, 0xc2,
+			       0x21, 0x34, 0xbc, 0x37, 0x00, 0x00, 0x82, 0x33,
+			       0x30, 0x82, 0x85, 0x00, 0x00, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x25);
+	mipi_dsi_dcs_write_seq(dsi, 0xc2,
+			       0x22, 0x19, 0x1f, 0x00, 0x4e, 0x5a, 0x00, 0x00,
+			       0x00, 0x21, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xc3,
+			       0x2a, 0x0f, 0x36, 0x64, 0x3b, 0x19, 0x8e, 0x9c,
+			       0x00, 0x3f, 0x2a, 0xa8, 0x00, 0x36, 0x7b, 0x0c,
+			       0x7f, 0xd5, 0x18, 0x25, 0x84);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x15);
+	mipi_dsi_dcs_write_seq(dsi, 0xc3,
+			       0x60, 0x34, 0xbc, 0x37, 0x89, 0xe9, 0x6b, 0x33,
+			       0x0f, 0x82, 0x7b, 0xc1, 0x8a, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x25);
+	mipi_dsi_dcs_write_seq(dsi, 0xc3,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			       0x00, 0x60, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xc4,
+			       0x2c, 0x00, 0x36, 0x64, 0x3b, 0x19, 0x71, 0x64,
+			       0x3f, 0x8c, 0x91, 0x40, 0x00, 0x00, 0x00, 0x00,
+			       0x00, 0x00, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x15);
+	mipi_dsi_dcs_write_seq(dsi, 0xc4,
+			       0x60, 0x00, 0x00, 0x7b, 0x89, 0xe9, 0x6b, 0x33,
+			       0xc0, 0x7e, 0x7b, 0x00, 0x00, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x25);
+	mipi_dsi_dcs_write_seq(dsi, 0xc4,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			       0x00, 0x60, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xc5,
+			       0x21, 0x00, 0x03, 0x49, 0x03, 0x49, 0x00, 0x00,
+			       0x3f, 0xff, 0x41, 0x76, 0x3f, 0xff, 0x41, 0x76,
+			       0x00, 0x00, 0x0a, 0xca, 0xd1);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x15);
+	mipi_dsi_dcs_write_seq(dsi, 0xc5,
+			       0x08, 0x12, 0xfe, 0x1b, 0x00, 0x20, 0x3d, 0x33,
+			       0xf0, 0xe3, 0xe3, 0x03, 0x49, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x25);
+	mipi_dsi_dcs_write_seq(dsi, 0xc5,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			       0x00, 0x08, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xc6,
+			       0x27, 0x00, 0x03, 0x49, 0x03, 0x49, 0x00, 0x00,
+			       0x00, 0x00, 0x00, 0x00, 0x3f, 0xff, 0x41, 0x76,
+			       0x00, 0x00, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x15);
+	mipi_dsi_dcs_write_seq(dsi, 0xc6,
+			       0x08, 0x22, 0x1c, 0x39, 0x00, 0x20, 0x3d, 0x33,
+			       0x30, 0x1d, 0xe3, 0x00, 0x00, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x25);
+	mipi_dsi_dcs_write_seq(dsi, 0xc6,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			       0x00, 0x08, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xc7,
+			       0x2b, 0x00, 0x03, 0x49, 0x03, 0x49, 0x00, 0x00,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			       0x7f, 0xff, 0xf5, 0x35, 0x2f);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x15);
+	mipi_dsi_dcs_write_seq(dsi, 0xc7,
+			       0x14, 0x22, 0x1c, 0x39, 0x00, 0x3e, 0x5b, 0x33,
+			       0x0f, 0x1d, 0x1d, 0xfc, 0xb7, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x25);
+	mipi_dsi_dcs_write_seq(dsi, 0xc7,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			       0x00, 0x14, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xc8,
+			       0x2d, 0x00, 0x03, 0x49, 0x03, 0x49, 0x00, 0x00,
+			       0x3f, 0xff, 0x41, 0x76, 0x00, 0x00, 0x00, 0x00,
+			       0x00, 0x00, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x15);
+	mipi_dsi_dcs_write_seq(dsi, 0xc8,
+			       0x14, 0x12, 0xfe, 0x1b, 0x00, 0x3e, 0x5b, 0x33,
+			       0xc0, 0xe3, 0x1d, 0x00, 0x00, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x25);
+	mipi_dsi_dcs_write_seq(dsi, 0xc8,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			       0x00, 0x14, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xc9,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			       0x00, 0x00, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x15);
+	mipi_dsi_dcs_write_seq(dsi, 0xc9,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x25);
+	mipi_dsi_dcs_write_seq(dsi, 0xc9,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xca,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			       0x00, 0x00, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x15);
+	mipi_dsi_dcs_write_seq(dsi, 0xca,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x25);
+	mipi_dsi_dcs_write_seq(dsi, 0xca,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xcb,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			       0x00, 0x00, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x15);
+	mipi_dsi_dcs_write_seq(dsi, 0xcb,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x25);
+	mipi_dsi_dcs_write_seq(dsi, 0xcb,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xcc,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			       0x00, 0x00, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x15);
+	mipi_dsi_dcs_write_seq(dsi, 0xcc,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x25);
+	mipi_dsi_dcs_write_seq(dsi, 0xcc,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xf0, 0x55, 0xaa, 0x52, 0x08, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x1b);
+	mipi_dsi_dcs_write_seq(dsi, 0xba, 0x18);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x1c);
+	mipi_dsi_dcs_write_seq(dsi, 0xba,
+			       0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x2c);
+	mipi_dsi_dcs_write_seq(dsi, 0xba,
+			       0x00, 0x01, 0x01, 0x01, 0x00, 0x01, 0x01, 0x01,
+			       0x00, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x3c);
+	mipi_dsi_dcs_write_seq(dsi, 0xba,
+			       0x00, 0x01, 0x01, 0x00, 0x00, 0x03, 0x03, 0x00,
+			       0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x4c);
+	mipi_dsi_dcs_write_seq(dsi, 0xba,
+			       0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00,
+			       0x00, 0x0b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x5c);
+	mipi_dsi_dcs_write_seq(dsi, 0xba,
+			       0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x6c);
+	mipi_dsi_dcs_write_seq(dsi, 0xba,
+			       0x01, 0x02, 0x00, 0x04, 0x0b, 0x77, 0x03, 0x0b,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x7c);
+	mipi_dsi_dcs_write_seq(dsi, 0xba,
+			       0x01, 0x02, 0x00, 0x04, 0x0b, 0x77, 0x03, 0x0b,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x8c);
+	mipi_dsi_dcs_write_seq(dsi, 0xba,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x9c);
+	mipi_dsi_dcs_write_seq(dsi, 0xba,
+			       0x11, 0x11, 0x70, 0x13, 0x70, 0x03, 0x70, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0xa4);
+	mipi_dsi_dcs_write_seq(dsi, 0xba, 0x00, 0xf6, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0xa8);
+	mipi_dsi_dcs_write_seq(dsi, 0xba,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0xb0);
+	mipi_dsi_dcs_write_seq(dsi, 0xba,
+			       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xf0, 0x55, 0xaa, 0x52, 0x08, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x08);
+	mipi_dsi_dcs_write_seq(dsi, 0xbb, 0x01, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x0c);
+	mipi_dsi_dcs_write_seq(dsi, 0xbb, 0x00, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x10);
+	mipi_dsi_dcs_write_seq(dsi, 0xbb, 0x00, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x14);
+	mipi_dsi_dcs_write_seq(dsi, 0xbb, 0x00, 0x00, 0x00, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x18);
+	mipi_dsi_dcs_write_seq(dsi, 0xbb, 0x01, 0x1d, 0x1d, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x1c);
+	mipi_dsi_dcs_write_seq(dsi, 0xbb, 0x01, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xf0, 0x55, 0xaa, 0x52, 0x08, 0x01);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x05);
+	mipi_dsi_dcs_write_seq(dsi, 0xc5, 0x15, 0x15, 0x15, 0xdd);
+	mipi_dsi_dcs_write_seq(dsi, 0xf0, 0x55, 0xaa, 0x52, 0x08, 0x03);
+	mipi_dsi_dcs_write_seq(dsi, 0xc3, 0x02, 0x32, 0x22, 0x22);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x06);
+	mipi_dsi_dcs_write_seq(dsi, 0xc3, 0x02, 0x32, 0x22, 0x22);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x0c);
+	mipi_dsi_dcs_write_seq(dsi, 0xc3, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xf0, 0x55, 0xaa, 0x52, 0x08, 0x01);
+	mipi_dsi_dcs_write_seq(dsi, 0xc3,
+			       0xdd, 0x06, 0x22, 0x0e, 0xf4, 0x00, 0x06, 0x20,
+			       0x0e, 0xff, 0x00, 0x03, 0xd0, 0x0e, 0x05, 0x2c,
+			       0x13, 0x03, 0xd0, 0x0e, 0x05, 0x2c, 0x13, 0x03,
+			       0xd0, 0x0e, 0x05, 0x2c, 0x13, 0x03, 0xd0, 0x0e,
+			       0x05, 0x2c, 0x13, 0x03, 0xd0, 0x0e, 0x05, 0x2c,
+			       0x13);
+	mipi_dsi_dcs_write_seq(dsi, 0xff, 0xaa, 0x55, 0xa5, 0x80);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x1a);
+	mipi_dsi_dcs_write_seq(dsi, 0xf4, 0x55);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x1d);
+	mipi_dsi_dcs_write_seq(dsi, 0xf2, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x23);
+	mipi_dsi_dcs_write_seq(dsi, 0xf2, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x19);
+	mipi_dsi_dcs_write_seq(dsi, 0xf2, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x28);
+	mipi_dsi_dcs_write_seq(dsi, 0xf2, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x2c);
+	mipi_dsi_dcs_write_seq(dsi, 0xf2, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0xff, 0xaa, 0x55, 0xa5, 0x81);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x05);
+	mipi_dsi_dcs_write_seq(dsi, 0xfe, 0x34);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x02);
+	mipi_dsi_dcs_write_seq(dsi, 0xf9, 0x04);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x24);
+	mipi_dsi_dcs_write_seq(dsi, 0xfb,
+			       0x00, 0x03, 0x04, 0x55, 0x77, 0x77, 0x77, 0x99,
+			       0x9b, 0x10, 0x00, 0x1e, 0x48, 0x9a, 0xbb, 0xbc,
+			       0xde, 0xf0, 0x11, 0x30);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x19);
+	mipi_dsi_dcs_write_seq(dsi, 0xfb, 0x30);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x1e);
+	mipi_dsi_dcs_write_seq(dsi, 0xfb, 0x0f);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x0d);
+	mipi_dsi_dcs_write_seq(dsi, 0xfb, 0x80);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x0a);
+	mipi_dsi_dcs_write_seq(dsi, 0xfd, 0x08);
+	mipi_dsi_dcs_write_seq(dsi, 0xff, 0xaa, 0x55, 0xa5, 0x83);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x12);
+	mipi_dsi_dcs_write_seq(dsi, 0xfe, 0x41);
+	mipi_dsi_dcs_write_seq(dsi, 0xff, 0xaa, 0x55, 0xa5, 0x80);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x31);
+	mipi_dsi_dcs_write_seq(dsi, 0xf8, 0x00, 0xfd);
+	mipi_dsi_dcs_write_seq(dsi, 0xff, 0xaa, 0x55, 0xa5, 0x80);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x15);
+	mipi_dsi_dcs_write_seq(dsi, 0xf8, 0x01, 0x4f);
+	mipi_dsi_dcs_write_seq(dsi, 0xf0, 0x55, 0xaa, 0x52, 0x08, 0x01);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x03);
+	mipi_dsi_dcs_write_seq(dsi, 0xc7, 0x07);
+	mipi_dsi_dcs_write_seq(dsi, 0x17, 0x10);
+
+	ret = mipi_dsi_dcs_set_column_address(dsi, 0x0000, 0x0437);
+	if (ret < 0) {
+		dev_err(dev, "Failed to set column address: %d\n", ret);
+		return ret;
+	}
+
+	ret = mipi_dsi_dcs_set_page_address(dsi, 0x0000, 0x096b);
+	if (ret < 0) {
+		dev_err(dev, "Failed to set page address: %d\n", ret);
+		return ret;
+	}
+
+	mipi_dsi_dcs_write_seq(dsi, MIPI_DCS_SET_GAMMA_CURVE, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x2f, 0x00);
+
+	ret = mipi_dsi_dcs_set_tear_on(dsi, MIPI_DSI_DCS_TEAR_MODE_VBLANK);
+	if (ret < 0) {
+		dev_err(dev, "Failed to set tear on: %d\n", ret);
+		return ret;
+	}
+
+	ret = mipi_dsi_dcs_set_display_brightness(dsi, 0xbb0d);
+	if (ret < 0) {
+		dev_err(dev, "Failed to set display brightness: %d\n", ret);
+		return ret;
+	}
+
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0x04);
+
+	ret = mipi_dsi_dcs_set_display_brightness(dsi, 0xff0f);
+	if (ret < 0) {
+		dev_err(dev, "Failed to set display brightness: %d\n", ret);
+		return ret;
+	}
+
+	mipi_dsi_dcs_write_seq(dsi, MIPI_DCS_WRITE_CONTROL_DISPLAY, 0x20);
+	mipi_dsi_dcs_write_seq(dsi, 0xf0, 0x55, 0xaa, 0x52, 0x08, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x6f, 0xc5);
+	mipi_dsi_dcs_write_seq(dsi, 0xba, 0x10);
+	mipi_dsi_dcs_write_seq(dsi, 0x6d, 0x00);
+	mipi_dsi_dcs_write_seq(dsi, 0x90, 0x03, 0x03);
+	mipi_dsi_dcs_write_seq(dsi, 0x91,
+			       0xab, 0x28, 0x00, 0x0c, 0xc2, 0x00, 0x02, 0x0e,
+			       0x01, 0x1f, 0x00, 0x07, 0x08, 0xbb, 0x08, 0x7a,
+			       0x10, 0xf0);
+
+	ret = mipi_dsi_dcs_exit_sleep_mode(dsi);
+	if (ret < 0) {
+		dev_err(dev, "Failed to exit sleep mode: %d\n", ret);
+		return ret;
+	}
+	msleep(100);
+
+	ret = mipi_dsi_dcs_set_display_on(dsi);
+	if (ret < 0) {
+		dev_err(dev, "Failed to set display on: %d\n", ret);
+		return ret;
+	}
+
+	mipi_dsi_dcs_write_seq(dsi, 0xf0, 0x55, 0xaa, 0x52, 0x08, 0x08);
+	mipi_dsi_dcs_write_seq(dsi, 0xe0, 0x01);
+
+	return 0;
+}
+
+static int nt37705_visionox_amoled_120hz_off(struct nt37705_visionox_amoled_120hz *ctx)
+{
+	struct mipi_dsi_device *dsi = ctx->dsi;
+	struct device *dev = &dsi->dev;
+	int ret;
+
+	dsi->mode_flags &= ~MIPI_DSI_MODE_LPM;
+
+	ret = mipi_dsi_dcs_set_display_off(dsi);
+	if (ret < 0) {
+		dev_err(dev, "Failed to set display off: %d\n", ret);
+		return ret;
+	}
+	usleep_range(1000, 2000);
+
+	ret = mipi_dsi_dcs_enter_sleep_mode(dsi);
+	if (ret < 0) {
+		dev_err(dev, "Failed to enter sleep mode: %d\n", ret);
+		return ret;
+	}
+	msleep(100);
+
+	return 0;
+}
+
+static int nt37705_visionox_amoled_120hz_prepare(struct drm_panel *panel)
+{
+	struct nt37705_visionox_amoled_120hz *ctx = to_nt37705_visionox_amoled_120hz(panel);
+	struct device *dev = &ctx->dsi->dev;
+	struct drm_dsc_picture_parameter_set pps;
+	int ret;
+
+	if (ctx->prepared)
+		return 0;
+
+	nt37705_visionox_amoled_120hz_reset(ctx);
+
+	ret = nt37705_visionox_amoled_120hz_on(ctx);
+	if (ret < 0) {
+		dev_err(dev, "Failed to initialize panel: %d\n", ret);
+		gpiod_set_value_cansleep(ctx->reset_gpio, 1);
+		return ret;
+	}
+
+	drm_dsc_pps_payload_pack(&pps, &ctx->dsc);
+
+	ret = mipi_dsi_picture_parameter_set(ctx->dsi, &pps);
+	if (ret < 0) {
+		dev_err(panel->dev, "failed to transmit PPS: %d\n", ret);
+		return ret;
+	}
+
+	ret = mipi_dsi_compression_mode(ctx->dsi, true);
+	if (ret < 0) {
+		dev_err(dev, "failed to enable compression mode: %d\n", ret);
+		return ret;
+	}
+
+	msleep(28); /* TODO: Is this panel-dependent? */
+
+	ctx->prepared = true;
+	return 0;
+}
+
+static int nt37705_visionox_amoled_120hz_unprepare(struct drm_panel *panel)
+{
+	struct nt37705_visionox_amoled_120hz *ctx = to_nt37705_visionox_amoled_120hz(panel);
+	struct device *dev = &ctx->dsi->dev;
+	int ret;
+
+	if (!ctx->prepared)
+		return 0;
+
+	ret = nt37705_visionox_amoled_120hz_off(ctx);
+	if (ret < 0)
+		dev_err(dev, "Failed to un-initialize panel: %d\n", ret);
+
+	gpiod_set_value_cansleep(ctx->reset_gpio, 1);
+
+	ctx->prepared = false;
+	return 0;
+}
+
+static const struct drm_display_mode nt37705_visionox_amoled_120hz_mode = {
+	.clock = (1080 + 100 + 8 + 92) * (2412 + 14 + 2 + 22) * 120 / 1000,
+	.hdisplay = 1080,
+	.hsync_start = 1080 + 100,
+	.hsync_end = 1080 + 100 + 8,
+	.htotal = 1080 + 100 + 8 + 92,
+	.vdisplay = 2412,
+	.vsync_start = 2412 + 14,
+	.vsync_end = 2412 + 14 + 2,
+	.vtotal = 2412 + 14 + 2 + 22,
+	.width_mm = 69,
+	.height_mm = 156,
+};
+
+static int nt37705_visionox_amoled_120hz_get_modes(struct drm_panel *panel,
+						   struct drm_connector *connector)
+{
+	struct drm_display_mode *mode;
+
+	mode = drm_mode_duplicate(connector->dev, &nt37705_visionox_amoled_120hz_mode);
+	if (!mode)
+		return -ENOMEM;
+
+	drm_mode_set_name(mode);
+
+	mode->type = DRM_MODE_TYPE_DRIVER | DRM_MODE_TYPE_PREFERRED;
+	connector->display_info.width_mm = mode->width_mm;
+	connector->display_info.height_mm = mode->height_mm;
+	drm_mode_probed_add(connector, mode);
+
+	return 1;
+}
+
+static const struct drm_panel_funcs nt37705_visionox_amoled_120hz_panel_funcs = {
+	.prepare = nt37705_visionox_amoled_120hz_prepare,
+	.unprepare = nt37705_visionox_amoled_120hz_unprepare,
+	.get_modes = nt37705_visionox_amoled_120hz_get_modes,
+};
+
+static int nt37705_visionox_amoled_120hz_bl_update_status(struct backlight_device *bl)
+{
+	struct mipi_dsi_device *dsi = bl_get_data(bl);
+	u16 brightness = backlight_get_brightness(bl);
+	int ret;
+
+	dsi->mode_flags &= ~MIPI_DSI_MODE_LPM;
+
+	ret = mipi_dsi_dcs_set_display_brightness_large(dsi, brightness);
+	if (ret < 0)
+		return ret;
+
+	dsi->mode_flags |= MIPI_DSI_MODE_LPM;
+
+	return 0;
+}
+
+// TODO: Check if /sys/class/backlight/.../actual_brightness actually returns
+// correct values. If not, remove this function.
+static int nt37705_visionox_amoled_120hz_bl_get_brightness(struct backlight_device *bl)
+{
+	struct mipi_dsi_device *dsi = bl_get_data(bl);
+	u16 brightness;
+	int ret;
+
+	dsi->mode_flags &= ~MIPI_DSI_MODE_LPM;
+
+	ret = mipi_dsi_dcs_get_display_brightness_large(dsi, &brightness);
+	if (ret < 0)
+		return ret;
+
+	dsi->mode_flags |= MIPI_DSI_MODE_LPM;
+
+	return brightness;
+}
+
+static const struct backlight_ops nt37705_visionox_amoled_120hz_bl_ops = {
+	.update_status = nt37705_visionox_amoled_120hz_bl_update_status,
+	.get_brightness = nt37705_visionox_amoled_120hz_bl_get_brightness,
+};
+
+static struct backlight_device *
+nt37705_visionox_amoled_120hz_create_backlight(struct mipi_dsi_device *dsi)
+{
+	struct device *dev = &dsi->dev;
+	const struct backlight_properties props = {
+		.type = BACKLIGHT_RAW,
+		.brightness = 4095,
+		.max_brightness = 4095,
+	};
+
+	return devm_backlight_device_register(dev, dev_name(dev), dev, dsi,
+					      &nt37705_visionox_amoled_120hz_bl_ops, &props);
+}
+
+static int nt37705_visionox_amoled_120hz_probe(struct mipi_dsi_device *dsi)
+{
+	struct device *dev = &dsi->dev;
+	struct nt37705_visionox_amoled_120hz *ctx;
+	int ret;
+
+	ctx = devm_kzalloc(dev, sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->reset_gpio = devm_gpiod_get(dev, "reset", GPIOD_OUT_HIGH);
+	if (IS_ERR(ctx->reset_gpio))
+		return dev_err_probe(dev, PTR_ERR(ctx->reset_gpio),
+				     "Failed to get reset-gpios\n");
+
+	ctx->dsi = dsi;
+	mipi_dsi_set_drvdata(dsi, ctx);
+
+	dsi->lanes = 4;
+	//dsi->format = MIPI_DSI_FMT_RGB101010
+	dsi->format = MIPI_DSI_FMT_RGB888;
+	dsi->mode_flags = MIPI_DSI_CLOCK_NON_CONTINUOUS;
+
+	drm_panel_init(&ctx->panel, dev, &nt37705_visionox_amoled_120hz_panel_funcs,
+		       DRM_MODE_CONNECTOR_DSI);
+	ctx->panel.prepare_prev_first = true;
+
+	ctx->panel.backlight = nt37705_visionox_amoled_120hz_create_backlight(dsi);
+	if (IS_ERR(ctx->panel.backlight))
+		return dev_err_probe(dev, PTR_ERR(ctx->panel.backlight),
+				     "Failed to create backlight\n");
+
+	drm_panel_add(&ctx->panel);
+
+	/* This panel only supports DSC; unconditionally enable it */
+	dsi->dsc = &ctx->dsc;
+
+	ctx->dsc.dsc_version_major = 1;
+	ctx->dsc.dsc_version_minor = 1;
+
+	/* TODO: Pass slice_per_pkt = 2 */
+	ctx->dsc.slice_height = 12;
+	ctx->dsc.slice_width = 540;
+	/*
+	 * TODO: hdisplay should be read from the selected mode once
+	 * it is passed back to drm_panel (in prepare?)
+	 */
+	WARN_ON(1080 % ctx->dsc.slice_width);
+	ctx->dsc.slice_count = 1080 / ctx->dsc.slice_width;
+	ctx->dsc.bits_per_component = 10;
+	ctx->dsc.bits_per_pixel = 8 << 4; /* 4 fractional bits */
+	ctx->dsc.block_pred_enable = true;
+
+	ret = mipi_dsi_attach(dsi);
+	if (ret < 0) {
+		dev_err(dev, "Failed to attach to DSI host: %d\n", ret);
+		drm_panel_remove(&ctx->panel);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void nt37705_visionox_amoled_120hz_remove(struct mipi_dsi_device *dsi)
+{
+	struct nt37705_visionox_amoled_120hz *ctx = mipi_dsi_get_drvdata(dsi);
+	int ret;
+
+	ret = mipi_dsi_detach(dsi);
+	if (ret < 0)
+		dev_err(&dsi->dev, "Failed to detach from DSI host: %d\n", ret);
+
+	drm_panel_remove(&ctx->panel);
+}
+
+static const struct of_device_id nt37705_visionox_amoled_120hz_of_match[] = {
+	{ .compatible = "mdss,nt37705-visionox-amoled-120hz" }, // FIXME
+	{ /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, nt37705_visionox_amoled_120hz_of_match);
+
+static struct mipi_dsi_driver nt37705_visionox_amoled_120hz_driver = {
+	.probe = nt37705_visionox_amoled_120hz_probe,
+	.remove = nt37705_visionox_amoled_120hz_remove,
+	.driver = {
+		.name = "panel-nt37705-visionox-amoled-120hz",
+		.of_match_table = nt37705_visionox_amoled_120hz_of_match,
+	},
+};
+module_mipi_dsi_driver(nt37705_visionox_amoled_120hz_driver);
+
+MODULE_AUTHOR("linux-mdss-dsi-panel-driver-generator <fix@me>"); // FIXME
+MODULE_DESCRIPTION("DRM driver for nt37705 amoled fhd+ 120hz cmd mode dsi visionox panel");
+MODULE_LICENSE("GPL");

From 6d386370634ee5b7b58d09cc2bad4faf145c7fb3 Mon Sep 17 00:00:00 2001
From: Jens Reidel <adrian@travitia.xyz>
Date: Thu, 1 Feb 2024 11:34:06 +0100
Subject: [PATCH 696/707] drivers: phy: qualcomm: Add i2c eusb2 repeater driver

Signed-off-by: Jens Reidel <adrian@travitia.xyz>
---
 arch/arm64/configs/sm8450.config              |   2 +-
 drivers/phy/qualcomm/Kconfig                  |   8 +
 drivers/phy/qualcomm/Makefile                 |   1 +
 .../qualcomm/phy-qcom-i2c-eusb2-repeater.c    | 444 ++++++++++++++++++
 4 files changed, 454 insertions(+), 1 deletion(-)
 create mode 100644 drivers/phy/qualcomm/phy-qcom-i2c-eusb2-repeater.c

diff --git a/arch/arm64/configs/sm8450.config b/arch/arm64/configs/sm8450.config
index 1f336f1f3b46f9..09deeb5ae7cb29 100644
--- a/arch/arm64/configs/sm8450.config
+++ b/arch/arm64/configs/sm8450.config
@@ -3,7 +3,7 @@ CONFIG_LOCALVERSION="-sm8450"
 # CONFIG_LOCALVERSION_AUTO is not set
 
 # Common for SM8450 devices
-# nothing yet
+CONFIG_PHY_QCOM_I2C_EUSB2_REPEATER=y
 
 # Xiaomi 12 (Cupid)
 CONFIG_DRM_PANEL_XIAOMI_42_02_0A=y
diff --git a/drivers/phy/qualcomm/Kconfig b/drivers/phy/qualcomm/Kconfig
index 846f8c99547fd5..ec807ea2f9f3a0 100644
--- a/drivers/phy/qualcomm/Kconfig
+++ b/drivers/phy/qualcomm/Kconfig
@@ -143,6 +143,14 @@ config PHY_QCOM_EUSB2_REPEATER
 	  PMICs. The repeater is paired with a Synopsys eUSB2 Phy
 	  on Qualcomm SOCs.
 
+config PHY_QCOM_I2C_EUSB2_REPEATER
+	tristate "Qualcomm SNPS I2C eUSB2 Repeater Driver"
+	depends on OF && I2C && (ARCH_QCOM || COMPILE_TEST)
+	select GENERIC_PHY
+	help
+	  Enable support for high-speed SNPS eUSB2 repeaters on Qualcomm
+	  platforms connected via I2C.
+
 config PHY_QCOM_M31_USB
 	tristate "Qualcomm M31 HS PHY driver support"
 	depends on USB && (ARCH_QCOM || COMPILE_TEST)
diff --git a/drivers/phy/qualcomm/Makefile b/drivers/phy/qualcomm/Makefile
index eb60e950ad5333..73a2ac366e0dc7 100644
--- a/drivers/phy/qualcomm/Makefile
+++ b/drivers/phy/qualcomm/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_PHY_QCOM_QMP_USB_LEGACY)	+= phy-qcom-qmp-usb-legacy.o
 obj-$(CONFIG_PHY_QCOM_QUSB2)		+= phy-qcom-qusb2.o
 obj-$(CONFIG_PHY_QCOM_SNPS_EUSB2)	+= phy-qcom-snps-eusb2.o
 obj-$(CONFIG_PHY_QCOM_EUSB2_REPEATER)	+= phy-qcom-eusb2-repeater.o
+obj-$(CONFIG_PHY_QCOM_I2C_EUSB2_REPEATER) += phy-qcom-i2c-eusb2-repeater.o
 obj-$(CONFIG_PHY_QCOM_USB_HS) 		+= phy-qcom-usb-hs.o
 obj-$(CONFIG_PHY_QCOM_USB_HSIC) 	+= phy-qcom-usb-hsic.o
 obj-$(CONFIG_PHY_QCOM_USB_HS_28NM)	+= phy-qcom-usb-hs-28nm.o
diff --git a/drivers/phy/qualcomm/phy-qcom-i2c-eusb2-repeater.c b/drivers/phy/qualcomm/phy-qcom-i2c-eusb2-repeater.c
new file mode 100644
index 00000000000000..e386336e583292
--- /dev/null
+++ b/drivers/phy/qualcomm/phy-qcom-i2c-eusb2-repeater.c
@@ -0,0 +1,444 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023, Linaro Limited
+ */
+
+#include <linux/err.h>
+#include <linux/i2c.h>
+#include <linux/gpio/consumer.h>
+#include <linux/module.h>
+#include <linux/regulator/consumer.h>
+#include <linux/regmap.h>
+#include <linux/of.h>
+#include <linux/phy/phy.h>
+
+#define EUSB2_3P0_VOL_MIN			3075000 /* uV */
+#define EUSB2_3P0_VOL_MAX			3300000 /* uV */
+#define EUSB2_3P0_HPM_LOAD			3500	/* uA */
+
+#define EUSB2_1P8_VOL_MIN			1800000 /* uV */
+#define EUSB2_1P8_VOL_MAX			1800000 /* uV */
+#define EUSB2_1P8_HPM_LOAD			32000	/* uA */
+
+/* NXP eUSB2 repeater registers */
+#define RESET_CONTROL			0x01
+#define LINK_CONTROL1			0x02
+#define LINK_CONTROL2			0x03
+#define eUSB2_RX_CONTROL		0x04
+#define eUSB2_TX_CONTROL		0x05
+#define USB2_RX_CONTROL			0x06
+#define USB2_TX_CONTROL1		0x07
+#define USB2_TX_CONTROL2		0x08
+#define USB2_HS_TERMINATION		0x09
+#define RAP_SIGNATURE			0x0D
+#define VDX_CONTROL			0x0E
+#define DEVICE_STATUS			0x0F
+#define LINK_STATUS			0x10
+#define REVISION_ID			0x13
+#define CHIP_ID_0			0x14
+#define CHIP_ID_1			0x15
+#define CHIP_ID_2			0x16
+
+/* TI eUSB2 repeater registers */
+#define GPIO0_CONFIG			0x00
+#define GPIO1_CONFIG			0x40
+#define UART_PORT1			0x50
+#define EXTRA_PORT1			0x51
+#define U_TX_ADJUST_PORT1		0x70
+#define U_HS_TX_PRE_EMPHASIS_P1		0x71
+#define U_RX_ADJUST_PORT1		0x72
+#define U_DISCONNECT_SQUELCH_PORT1	0x73
+#define E_HS_TX_PRE_EMPHASIS_P1		0x77
+#define E_TX_ADJUST_PORT1		0x78
+#define E_RX_ADJUST_PORT1		0x79
+#define REV_ID				0xB0
+#define GLOBAL_CONFIG			0xB2
+#define INT_ENABLE_1			0xB3
+#define INT_ENABLE_2			0xB4
+#define BC_CONTROL			0xB6
+#define BC_STATUS_1			0xB7
+#define INT_STATUS_1			0xA3
+#define INT_STATUS_2			0xA4
+
+enum eusb2_repeater_type {
+	TI_REPEATER,
+	NXP_REPEATER,
+};
+
+struct i2c_repeater_chip {
+	enum eusb2_repeater_type repeater_type;
+};
+
+struct i2c_eusb2_repeater {
+	struct device			*dev;
+	struct phy			*phy;
+	struct regmap			*regmap;
+	const struct i2c_repeater_chip	*chip;
+	u16				reg_base;
+	struct regulator		*vdd18;
+	struct regulator		*vdd3;
+	bool				power_enabled;
+
+	struct gpio_desc		*reset_gpiod;
+	u32				*param_override_seq;
+	u8				param_override_seq_cnt;
+	u32				*param_override_seq_host;
+	u8				param_override_seq_cnt_host;
+};
+
+static int eusb2_i2c_read_reg(struct i2c_eusb2_repeater *rptr, u8 reg, u8 *val)
+{
+	int ret;
+	unsigned int reg_val;
+
+	ret = regmap_read(rptr->regmap, reg, &reg_val);
+	if (ret < 0) {
+		dev_err(rptr->dev, "Failed to read reg: 0x%02x ret=%d\n", reg, ret);
+		return ret;
+	}
+
+	*val = reg_val;
+	dev_dbg(rptr->dev, "read reg: 0x%02x val:0x%02x\n", reg, *val);
+
+	return 0;
+}
+
+static int eusb2_i2c_write_reg(struct i2c_eusb2_repeater *rptr, u8 reg, u8 val)
+{
+	int ret;
+
+	ret = regmap_write(rptr->regmap, reg, val);
+	if (ret < 0) {
+		dev_err(rptr->dev, "failed to write 0x%02x to reg: 0x%02x ret=%d\n", val, reg, ret);
+		return ret;
+	}
+
+	dev_dbg(rptr->dev, "write reg: 0x%02x val:0x%02x\n", reg, val);
+
+	return 0;
+}
+
+static void eusb2_repeater_update_seq(struct i2c_eusb2_repeater *rptr, u32 *seq, u8 cnt)
+{
+	int i;
+
+	dev_dbg(rptr->dev, "param override seq count: %d\n", cnt);
+	for (i = 0; i < cnt; i = i+2) {
+		dev_dbg(rptr->dev, "write 0x%02x to 0x%02x\n", seq[i], seq[i+1]);
+		eusb2_i2c_write_reg(rptr, seq[i+1], seq[i]);
+	}
+}
+
+static int i2c_eusb2_repeater_init(struct phy *phy)
+{
+	struct i2c_eusb2_repeater *rptr = phy_get_drvdata(phy);
+	const struct i2c_repeater_chip *chip = rptr->chip;
+	int ret;
+	u8 reg_val;
+
+	if (rptr->power_enabled) {
+		dev_info(rptr->dev, "regulators are already on\n");
+		return 0;
+	}
+
+	ret = regulator_set_load(rptr->vdd18, EUSB2_1P8_HPM_LOAD);
+	if (ret < 0) {
+		dev_err(rptr->dev, "Unable to set HPM of vdd12: %d\n", ret);
+		return ret;
+	}
+
+	ret = regulator_set_voltage(rptr->vdd18, EUSB2_1P8_VOL_MIN,
+						EUSB2_1P8_VOL_MAX);
+	if (ret) {
+		dev_err(rptr->dev,
+				"Unable to set voltage for vdd18: %d\n", ret);
+		return ret;
+	}
+
+	ret = regulator_enable(rptr->vdd18);
+	if (ret) {
+		dev_err(rptr->dev, "Unable to enable vdd18: %d\n", ret);
+		return ret;
+	}
+
+	ret = regulator_set_load(rptr->vdd3, EUSB2_3P0_HPM_LOAD);
+	if (ret < 0) {
+		dev_err(rptr->dev, "Unable to set HPM of vdd3: %d\n", ret);
+		return ret;
+	}
+
+	ret = regulator_set_voltage(rptr->vdd3, EUSB2_3P0_VOL_MIN,
+						EUSB2_3P0_VOL_MAX);
+	if (ret) {
+		dev_err(rptr->dev,
+				"Unable to set voltage for vdd3: %d\n", ret);
+		return ret;
+	}
+
+	ret = regulator_enable(rptr->vdd3);
+	if (ret) {
+		dev_err(rptr->dev, "Unable to enable vdd3: %d\n", ret);
+		return ret;
+	}
+
+	rptr->power_enabled = true;
+	pr_debug("I2C eUSB2 repeater regulators are turned on\n");
+
+	switch (chip->repeater_type) {
+	case TI_REPEATER:
+		eusb2_i2c_read_reg(rptr, REV_ID, &reg_val);
+		break;
+	case NXP_REPEATER:
+		eusb2_i2c_read_reg(rptr, REVISION_ID, &reg_val);
+		break;
+	default:
+		dev_err(rptr->dev, "Invalid repeater\n");
+	}
+
+	dev_info(rptr->dev, "eUSB2 repeater version = 0x%x\n", reg_val);
+	dev_info(rptr->dev, "eUSB2 repeater init\n");
+
+	return ret;
+}
+
+static int i2c_eusb2_repeater_set_mode(struct phy *phy,
+				   enum phy_mode mode, int submode)
+{
+	struct i2c_eusb2_repeater *rptr = phy_get_drvdata(phy);
+
+	switch (mode) {
+	case PHY_MODE_USB_HOST:
+		eusb2_repeater_update_seq(rptr, rptr->param_override_seq_host,
+					rptr->param_override_seq_cnt_host);
+		dev_info(rptr->dev, "Set phy mode to usb host\n");
+		break;
+	case PHY_MODE_USB_DEVICE:
+		eusb2_repeater_update_seq(rptr, rptr->param_override_seq,
+					rptr->param_override_seq_cnt);
+		dev_info(rptr->dev, "Set phy mode to usb device\n");
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int i2c_eusb2_repeater_exit(struct phy *phy)
+{
+	struct i2c_eusb2_repeater *rptr = phy_get_drvdata(phy);
+	int ret;
+
+	ret = regulator_disable(rptr->vdd3);
+	if (ret)
+		dev_err(rptr->dev, "Unable to disable vdd3: %d\n", ret);
+
+	ret = regulator_set_voltage(rptr->vdd3, 0, EUSB2_3P0_VOL_MAX);
+	if (ret)
+		dev_err(rptr->dev,
+			"Unable to set (0) voltage for vdd3: %d\n", ret);
+
+	ret = regulator_set_load(rptr->vdd3, 0);
+	if (ret < 0)
+		dev_err(rptr->dev, "Unable to set (0) HPM of vdd3\n");
+
+	ret = regulator_disable(rptr->vdd18);
+	if (ret)
+		dev_err(rptr->dev, "Unable to disable vdd18:%d\n", ret);
+
+	ret = regulator_set_voltage(rptr->vdd18, 0, EUSB2_1P8_VOL_MAX);
+	if (ret)
+		dev_err(rptr->dev,
+			"Unable to set (0) voltage for vdd18: %d\n", ret);
+
+	ret = regulator_set_load(rptr->vdd18, 0);
+	if (ret < 0)
+		dev_err(rptr->dev, "Unable to set LPM of vdd18\n");
+
+	rptr->power_enabled = false;
+	dev_dbg(rptr->dev, "I2C eUSB2 repeater's regulators are turned off.\n");
+
+	return ret;
+}
+
+static const struct phy_ops i2c_eusb2_repeater_ops = {
+	.init		= i2c_eusb2_repeater_init,
+	.exit		= i2c_eusb2_repeater_exit,
+	.set_mode	= i2c_eusb2_repeater_set_mode,
+	.owner		= THIS_MODULE,
+};
+
+static const struct regmap_config i2c_eusb2_regmap = {
+	.reg_bits = 8,
+	.val_bits = 8,
+	.max_register = 0xff,
+};
+
+static struct i2c_repeater_chip repeater_chip[] = {
+	[NXP_REPEATER] = {
+		.repeater_type = NXP_REPEATER,
+	},
+	[TI_REPEATER] = {
+		.repeater_type = TI_REPEATER,
+	}
+};
+
+static const struct of_device_id i2c_eusb2_repeater_of_match_table[] = {
+	{
+		.compatible = "nxp,eusb2-repeater",
+		.data = &repeater_chip[NXP_REPEATER],
+	},
+	{
+		.compatible = "ti,eusb2-repeater",
+		.data = &repeater_chip[TI_REPEATER],
+	},
+	{ },
+};
+MODULE_DEVICE_TABLE(of, i2c_eusb2_repeater_of_match_table);
+
+static int i2c_eusb2_repeater_probe(struct i2c_client *client)
+{
+	struct i2c_eusb2_repeater *rptr;
+	struct device *dev = &client->dev;
+	struct phy_provider *phy_provider;
+	struct device_node *np = dev->of_node;
+	const struct of_device_id *match;
+	int ret, num_elem;
+	u32 res;
+
+	rptr = devm_kzalloc(dev, sizeof(*rptr), GFP_KERNEL);
+	if (!rptr) {
+		dev_err(dev, "unable to allocate i2c_eusb2_repeater\n");
+		return -ENOMEM;
+	}
+
+	rptr->dev = dev;
+	dev_set_drvdata(dev, rptr);
+
+	match = of_match_node(i2c_eusb2_repeater_of_match_table, dev->of_node);
+	rptr->chip = match->data;
+
+	rptr->regmap = devm_regmap_init_i2c(client, &i2c_eusb2_regmap);
+	if (!rptr->regmap) {
+		dev_err(dev, "devm_regmap_init_i2c failed\n");
+		return -ENODEV;
+	}
+
+	ret = of_property_read_u32(np, "reg", &res);
+	if (ret < 0) {
+		dev_err(dev, "failed to read reg\n");
+		return ret;
+	}
+
+	rptr->vdd3 = devm_regulator_get(dev, "vdd3");
+	if (IS_ERR(rptr->vdd3)) {
+		dev_err(dev, "unable to get vdd3 supply\n");
+		return PTR_ERR(rptr->vdd3);
+	}
+
+	rptr->vdd18 = devm_regulator_get(dev, "vdd18");
+	if (IS_ERR(rptr->vdd18)) {
+		dev_err(dev, "unable to get vdd18 supply\n");
+		return PTR_ERR(rptr->vdd18);
+	}
+
+	rptr->reset_gpiod = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_HIGH);
+	if (IS_ERR(rptr->reset_gpiod)) {
+		return PTR_ERR(rptr->reset_gpiod);
+	}
+
+	num_elem = of_property_count_elems_of_size(dev->of_node, "qcom,param-override-seq",
+				sizeof(*rptr->param_override_seq));
+	if (num_elem > 0) {
+		if (num_elem % 2) {
+			dev_err(dev, "invalid param_override_seq_len\n");
+			return -EINVAL;
+		}
+
+		rptr->param_override_seq_cnt = num_elem;
+		rptr->param_override_seq = devm_kcalloc(dev,
+				rptr->param_override_seq_cnt,
+				sizeof(*rptr->param_override_seq), GFP_KERNEL);
+		if (!rptr->param_override_seq) {
+			return -ENOMEM;
+		}
+
+		ret = of_property_read_u32_array(dev->of_node,
+				"qcom,param-override-seq",
+				rptr->param_override_seq,
+				rptr->param_override_seq_cnt);
+		if (ret) {
+			dev_err(dev, "qcom,param-override-seq read failed %d\n",
+									ret);
+			return ret;
+		}
+	}
+
+	num_elem = of_property_count_elems_of_size(dev->of_node, "qcom,param-override-seq-host",
+				sizeof(*rptr->param_override_seq_host));
+	if (num_elem > 0) {
+		if (num_elem % 2) {
+			dev_err(dev, "invalid param_override_seq_host_len\n");
+			return -EINVAL;
+		}
+
+		rptr->param_override_seq_cnt_host = num_elem;
+		rptr->param_override_seq_host = devm_kcalloc(dev,
+				rptr->param_override_seq_cnt_host,
+				sizeof(*rptr->param_override_seq_host), GFP_KERNEL);
+		if (!rptr->param_override_seq_host) {
+			return -ENOMEM;
+		}
+
+		ret = of_property_read_u32_array(dev->of_node,
+				"qcom,param-override-seq-host",
+				rptr->param_override_seq_host,
+				rptr->param_override_seq_cnt_host);
+		if (ret) {
+			dev_err(dev, "qcom,param-override-seq-host read failed %d\n",
+									ret);
+			return ret;
+		}
+	}
+
+
+	rptr->phy = devm_phy_create(dev, np, &i2c_eusb2_repeater_ops);
+	if (IS_ERR(rptr->phy)) {
+		dev_err(dev, "failed to create PHY: %d\n", ret);
+		return PTR_ERR(rptr->phy);
+	}
+
+	phy_set_drvdata(rptr->phy, rptr);
+
+	phy_provider = devm_of_phy_provider_register(dev, of_phy_simple_xlate);
+	if (IS_ERR(phy_provider))
+		return PTR_ERR(phy_provider);
+
+	dev_info(dev, "Registered Qcom-I2C-eUSB2 repeater\n");
+
+	return 0;
+}
+
+static void i2c_eusb2_repeater_remove(struct i2c_client *client)
+{
+	struct i2c_eusb2_repeater *rptr = i2c_get_clientdata(client);
+
+	if (!rptr)
+		return;
+
+	i2c_eusb2_repeater_exit(rptr->phy);
+}
+
+static struct i2c_driver i2c_eusb2_repeater_driver = {
+	.probe		= i2c_eusb2_repeater_probe,
+	.remove		= i2c_eusb2_repeater_remove,
+	.driver = {
+		.name	= "qcom-i2c-eusb2-repeater",
+		.of_match_table = i2c_eusb2_repeater_of_match_table,
+	},
+};
+
+module_i2c_driver(i2c_eusb2_repeater_driver);
+
+MODULE_DESCRIPTION("Qualcomm I2C eUSB2 Repeater driver");
+MODULE_LICENSE("GPL");

From 0ed392c9399bde31f0695f02510dd7a34bbd1471 Mon Sep 17 00:00:00 2001
From: Danila Tikhonov <danila@jiaxyga.com>
Date: Thu, 1 Feb 2024 20:42:22 +0300
Subject: [PATCH 697/707] ufs: host: ufs-qcom: Add 850Mhz clk freq support for
 unipro core

Signed-off-by: Danila Tikhonov <danila@jiaxyga.com>
---
 drivers/ufs/host/ufs-qcom.c | 3 +++
 drivers/ufs/host/ufs-qcom.h | 1 +
 2 files changed, 4 insertions(+)

diff --git a/drivers/ufs/host/ufs-qcom.c b/drivers/ufs/host/ufs-qcom.c
index 0aeaee1c564c52..ded9fec3e4590d 100644
--- a/drivers/ufs/host/ufs-qcom.c
+++ b/drivers/ufs/host/ufs-qcom.c
@@ -1162,6 +1162,9 @@ static int ufs_qcom_set_clk_40ns_cycles(struct ufs_hba *hba,
 	 * generic formulae.
 	 */
 	switch (cycles_in_1us) {
+	case UNIPRO_CORE_CLK_FREQ_850_MHZ:
+		cycles_in_40ns = 34;
+		break;
 	case UNIPRO_CORE_CLK_FREQ_403_MHZ:
 		cycles_in_40ns = 16;
 		break;
diff --git a/drivers/ufs/host/ufs-qcom.h b/drivers/ufs/host/ufs-qcom.h
index 9dd9a391ebb762..cf44198771f738 100644
--- a/drivers/ufs/host/ufs-qcom.h
+++ b/drivers/ufs/host/ufs-qcom.h
@@ -134,6 +134,7 @@ enum {
 #define UNIPRO_CORE_CLK_FREQ_300_MHZ           300
 #define UNIPRO_CORE_CLK_FREQ_201_5_MHZ         202
 #define UNIPRO_CORE_CLK_FREQ_403_MHZ           403
+#define UNIPRO_CORE_CLK_FREQ_850_MHZ           850
 
 static inline void
 ufs_qcom_get_controller_revision(struct ufs_hba *hba,

From 062a100f2f9c7525c288a5e7d3d41134719866c5 Mon Sep 17 00:00:00 2001
From: Jens Reidel <adrian@travitia.xyz>
Date: Fri, 2 Feb 2024 04:20:15 +0100
Subject: [PATCH 698/707] arm64: dts: qcom: Add device-tree for Xiaomi 12T Pro

Add device tree for the Xiaomi 12T Pro / Redmi K50 Ultra (diting)
smartphone. This device is based on Qualcomm Snapdragon 8+ Gen 1
(SM8475) SoC.

Signed-off-by: Jens Reidel <adrian@travitia.xyz>
---
 arch/arm64/boot/dts/qcom/Makefile             |   1 +
 .../boot/dts/qcom/sm8475-xiaomi-diting.dts    | 573 ++++++++++++++++++
 2 files changed, 574 insertions(+)
 create mode 100644 arch/arm64/boot/dts/qcom/sm8475-xiaomi-diting.dts

diff --git a/arch/arm64/boot/dts/qcom/Makefile b/arch/arm64/boot/dts/qcom/Makefile
index 42c17125c5b858..ffc6bd5faa7e30 100644
--- a/arch/arm64/boot/dts/qcom/Makefile
+++ b/arch/arm64/boot/dts/qcom/Makefile
@@ -235,6 +235,7 @@ dtb-$(CONFIG_ARCH_QCOM)	+= sm8450-sony-xperia-nagara-pdx223.dtb
 dtb-$(CONFIG_ARCH_QCOM)	+= sm8450-sony-xperia-nagara-pdx224.dtb
 dtb-$(CONFIG_ARCH_QCOM)	+= sm8450-xiaomi-cupid.dtb
 dtb-$(CONFIG_ARCH_QCOM)	+= sm8475-nothing-pong.dtb
+dtb-$(CONFIG_ARCH_QCOM)	+= sm8475-xiaomi-diting.dtb
 dtb-$(CONFIG_ARCH_QCOM)	+= sm8550-hdk.dtb
 dtb-$(CONFIG_ARCH_QCOM)	+= sm8550-mtp.dtb
 dtb-$(CONFIG_ARCH_QCOM)	+= sm8550-qrd.dtb
diff --git a/arch/arm64/boot/dts/qcom/sm8475-xiaomi-diting.dts b/arch/arm64/boot/dts/qcom/sm8475-xiaomi-diting.dts
new file mode 100644
index 00000000000000..89feb452a48146
--- /dev/null
+++ b/arch/arm64/boot/dts/qcom/sm8475-xiaomi-diting.dts
@@ -0,0 +1,573 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2024, Jens Reidel <adrian@travitia.xyz>
+ */
+/dts-v1/;
+
+#include <dt-bindings/arm/qcom,ids.h>
+#include <dt-bindings/regulator/qcom,rpmh-regulator.h>
+#include <dt-bindings/leds/common.h>
+
+#include "sm8475.dtsi"
+#include "pm8350.dtsi"
+#include "pm8350b.dtsi"
+#include "pm8350c.dtsi"
+#include "pm8450.dtsi"
+#include "pmk8350.dtsi"
+#include "pmr735a.dtsi"
+
+/delete-node/ &adsp_mem;
+/delete-node/ &rmtfs_mem;
+/delete-node/ &xbl_ramdump_mem;
+/delete-node/ &xbl_sc_mem;
+
+/ {
+	model = "Xiaomi 12T Pro / Redmi K50 Ultra";
+	compatible = "xiaomi,diting", "qcom,sm8475";
+	chassis-type = "handset";
+
+	reserved-memory {
+		xbl_ramdump_mem: memory@a6b80000 {
+			reg = <0x0 0xa6b80000 0x0 0x280000>;
+			no-map;
+		};
+
+		xbl_sc_mem: memory@a6e00000 {
+			reg = <0x0 0xa6e00000 0x0 0x40000>;
+			no-map;
+		};
+
+		adsp_mem: memory@9fd00000 {
+			reg = <0x0 0x9fd00000 0x0 0x3100000>;
+			no-map;
+		};
+
+		ramoops@a7000000 {
+			compatible = "ramoops";
+			reg = <0x0 0xa7000000 0x0 0x400000>;
+
+			record-size = <0x200000>;
+			pmsg-size = <0x200000>;
+			console-size = <0x200000>;
+			no-map;
+		};
+
+		rmtfs_mem: memory@f8700000 {
+			compatible = "qcom,rmtfs-mem";
+			reg = <0x0 0xf8700000 0x0 0x280000>;
+			no-map;
+
+			qcom,client-id = <1>;
+			qcom,vmid = <QCOM_SCM_VMID_MSS_MSA>;
+		};
+	};
+
+	chosen {
+		#address-cells = <2>;
+		#size-cells = <2>;
+		ranges;
+
+		bootargs = "PMOS_NOSPLASH console=tty0";
+	};
+};
+
+&apps_rsc {
+	regulators-0 {
+		compatible = "qcom,pm8350-rpmh-regulators";
+		qcom,pmic-id = "b";
+
+		/*
+		vdd-s1-supply = <&vph_pwr>;
+		vdd-s2-supply = <&vph_pwr>;
+		vdd-s3-supply = <&vph_pwr>;
+		vdd-s4-supply = <&vph_pwr>;
+		vdd-s5-supply = <&vph_pwr>;
+		vdd-s6-supply = <&vph_pwr>;
+		vdd-s7-supply = <&vph_pwr>;
+		vdd-s8-supply = <&vph_pwr>;
+		vdd-s9-supply = <&vph_pwr>;
+		vdd-s10-supply = <&vph_pwr>;
+		vdd-s11-supply = <&vph_pwr>;
+		vdd-s12-supply = <&vph_pwr>;
+
+		vdd-l1-l4-supply = <&pm8350_s11>;
+		vdd-l2-l7-supply = <&vreg_bob>;
+		vdd-l3-l5-supply = <&pm8350_s11>;
+		vdd-l6-l9-l10-supply = <&pm8350_s12>;
+		*/
+
+		/*
+		 * ARC regulators:
+		 * s5 - gfx.lvl
+		 * l8 - lcx.lvl
+		 */
+
+		pm8350_s10: smps10 {
+			regulator-name = "pm8350_s10";
+			regulator-min-microvolt = <1800000>;
+			regulator-max-microvolt = <1800000>;
+		};
+
+		pm8350_s11: smps11 {
+			regulator-name = "pm8350_s11";
+			regulator-min-microvolt = <382000>;
+			regulator-max-microvolt = <1170000>;
+		};
+
+		pm8350_s12: smps12 {
+			regulator-name = "pm8350_s12";
+			regulator-min-microvolt = <1224000>;
+			regulator-max-microvolt = <2040000>;
+		};
+
+		pm8350_l1: ldo1 {
+			regulator-name = "pm8350_l1";
+			regulator-min-microvolt = <830000>;
+			regulator-max-microvolt = <920000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350_l2: ldo2 {
+			regulator-name = "pm8350_l2";
+			regulator-min-microvolt = <2700000>;
+			regulator-max-microvolt = <3300000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350_l3: ldo3 {
+			regulator-name = "pm8350_l3";
+			regulator-min-microvolt = <870000>;
+			regulator-max-microvolt = <970000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350_l5: ldo5 {
+			regulator-name = "pm8350_l5";
+			regulator-min-microvolt = <720000>;
+			regulator-max-microvolt = <958000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350_l6: ldo6 {
+			regulator-name = "pm8350_l6";
+			regulator-min-microvolt = <1200000>;
+			regulator-max-microvolt = <1216000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350_l7: ldo7 {
+			regulator-name = "pm8350_l7";
+			regulator-min-microvolt = <2400000>;
+			regulator-max-microvolt = <3544000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350_l9: ldo9 {
+			regulator-name = "pm8350_l9";
+			regulator-min-microvolt = <1200000>;
+			regulator-max-microvolt = <1304000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+	};
+
+	regulators-1 {
+		compatible = "qcom,pm8350c-rpmh-regulators";
+		qcom,pmic-id = "c";
+
+		/*
+		vdd-s1-supply = <&vph_pwr>;
+		vdd-s2-supply = <&vph_pwr>;
+		vdd-s3-supply = <&vph_pwr>;
+		vdd-s4-supply = <&vph_pwr>;
+		vdd-s5-supply = <&vph_pwr>;
+		vdd-s6-supply = <&vph_pwr>;
+		vdd-s7-supply = <&vph_pwr>;
+		vdd-s8-supply = <&vph_pwr>;
+		vdd-s9-supply = <&vph_pwr>;
+		vdd-s10-supply = <&vph_pwr>;
+
+		vdd-l1-l12-supply = <&pm8350c_s1>;
+		vdd-l2-l8-supply = <&pm8350c_s1>;
+		vdd-l3-l4-l5-l7-l13-supply = <&vreg_bob>;
+		vdd-l6-l9-l11-supply = <&vreg_bob>;
+		vdd-l10-supply = <&pm8350_s12>;
+
+		vdd-bob-supply = <&vph_pwr>;
+		*/
+
+		/*
+		 * ARC regulators:
+		 * s2 - mxc.lvl
+		 * s4 - mss.lvl
+		 * s6 - cx.lvl
+		 */
+
+		pm8350c_s1: smps1 {
+			regulator-name = "pm8350c_s1";
+			regulator-min-microvolt = <1800000>;
+			regulator-max-microvolt = <2024000>;
+		};
+
+		pm8350c_s10: smps10 {
+			regulator-name = "pm8350c_s10";
+			regulator-min-microvolt = <1052000>;
+			regulator-max-microvolt = <1170000>;
+		};
+
+		vreg_bob: bob {
+			regulator-name = "vreg_bob";
+			regulator-min-microvolt = <3008000>;
+			regulator-max-microvolt = <3960000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_AUTO>;
+		};
+
+		pm8350c_l1: ldo1 {
+			regulator_name = "pm8350c_l1";
+			regulator-min-microvolt = <1800000>;
+			regulator-max-microvolt = <1800000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l2: ldo2 {
+			regulator-name = "pm8350c_l2";
+			regulator-min-microvolt = <1800000>;
+			regulator-max-microvolt = <1980000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l3: ldo3 {
+			regulator-name = "pm8350c_l3";
+			regulator-min-microvolt = <3300000>;
+			regulator-max-microvolt = <3300000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l4: ldo4 {
+			regulator-name = "pm8350c_l4";
+			regulator-min-microvolt = <1620000>;
+			regulator-max-microvolt = <3300000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l5: ldo5 {
+			regulator-name = "pm8350c_l5";
+			regulator-min-microvolt = <1620000>;
+			regulator-max-microvolt = <3300000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l6: ldo6 {
+			regulator-name = "pm8350c_l6";
+			regulator-min-microvolt = <1800000>;
+			regulator-max-microvolt = <3544000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l7: ldo7 {
+			regulator-name = "pm8350c_l7";
+			regulator-min-microvolt = <3000000>;
+			regulator-max-microvolt = <3544000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l8: ldo8 {
+			regulator-name = "pm8350c_l8";
+			regulator-min-microvolt = <1800000>;
+			regulator-max-microvolt = <2000000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l9: ldo9 {
+			regulator-name = "pm8350c_l9";
+			regulator-min-microvolt = <2700000>;
+			regulator-max-microvolt = <3544000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l10: ldo10 {
+			regulator-name = "pm8350c_l10";
+			regulator-min-microvolt = <1200000>;
+			regulator-max-microvolt = <1304000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l11: ldo11 {
+			regulator-name = "pm8350c_l11";
+			regulator-min-microvolt = <2400000>;
+			regulator-max-microvolt = <3008000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l12: ldo12 {
+			regulator-name = "pm8350c_l12";
+			regulator-min-microvolt = <1800000>;
+			regulator-max-microvolt = <1980000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l13: ldo13 {
+			regulator-name = "pm8350c_l13";
+			regulator-min-microvolt = <2700000>;
+			regulator-max-microvolt = <3544000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+	};
+
+	regulators-2 {
+		compatible = "qcom,pm8450-rpmh-regulators";
+		qcom,pmic-id = "h";
+
+		/*
+		vdd-s1-supply = <&vph_pwr>;
+		vdd-s2-supply = <&vph_pwr>;
+		vdd-s3-supply = <&vph_pwr>;
+		vdd-s4-supply = <&vph_pwr>;
+		vdd-s5-supply = <&vph_pwr>;
+		vdd-s6-supply = <&vph_pwr>;
+
+		vdd-l2-supply = <&vreg_bob>;
+		vdd-l3-supply = <&vreg_bob>;
+		vdd-l4-supply = <&vreg_bob>;
+		*/
+
+		/*
+		 * ARC regulators:
+		 * S2 - ebi.lvl
+		 * S4 - mmcx.lvl
+		 * S6 - mx.lvl
+		 * L1 - lmx.lvl
+		 */
+
+		pm8450_s3: smps3 {
+			regulator-name = "pm8450_s3";
+			regulator-min-microvolt = <470000>;
+			regulator-max-microvolt = <570000>;
+		};
+
+		pm8450_l2: ldo2 {
+			regulator-name = "pm8450_l2";
+			regulator-min-microvolt = <820000>;
+			regulator-max-microvolt = <958000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8450_l3: ldo3 {
+			regulator-name = "pm8450_l3";
+			regulator-min-microvolt = <866000>;
+			regulator-max-microvolt = <958000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8450_l4: ldo4 {
+			regulator-name = "pm8450_l4";
+			regulator-min-microvolt = <1800000>;
+			regulator-max-microvolt = <1808000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+	};
+
+	regulators-3 {
+		compatible = "qcom,pmr735a-rpmh-regulators";
+		qcom,pmic-id = "e";
+
+		/*
+		vdd-s1-supply = <&vph_pwr>;
+		vdd-s2-supply = <&vph_pwr>;
+		vdd-s3-supply = <&vph_pwr>;
+
+		vdd-l1-l2-supply = <&pmr735a_s2>;
+		vdd-l3-supply = <&pmr735a_s1>;
+		vdd-l4-supply = <&pm8350c_s1>;
+		vdd-l5-l6-supply = <&pm8350c_s1>;
+		vdd-l7-bob-supply = <&vreg_bob>;
+		*/
+
+		pmr735a_s2: smps2 {
+			regulator-name = "pmr735a_s2";
+			regulator-min-microvolt = <500000>;
+			regulator-max-microvolt = <1040000>;
+		};
+
+		pmr735a_s3: smps3 {
+			regulator-name = "pmr735a_s3";
+			regulator-min-microvolt = <300000>;
+			regulator-max-microvolt = <2352000>;
+		};
+
+		pmr735a_l1: ldo1 {
+			regulator-name = "pmr735a_l1";
+			regulator-min-microvolt = <800000>;
+			regulator-max-microvolt = <880000>;
+		};
+
+		pmr735a_l2: ldo2 {
+			regulator-name = "pmr735a_l2";
+			regulator-min-microvolt = <480000>;
+			regulator-max-microvolt = <1200000>;
+		};
+
+		pmr735a_l3: ldo3 {
+			regulator-name = "pmr735a_l3";
+			regulator-min-microvolt = <1200000>;
+			regulator-max-microvolt = <1200000>;
+		};
+
+		pmr735a_l4: ldo4 {
+			regulator-name = "pmr735a_l4";
+			regulator-min-microvolt = <1776000>;
+			regulator-max-microvolt = <1776000>;
+		};
+
+		pmr735a_l5: ldo5 {
+			regulator-name = "pmr735a_l5";
+			regulator-min-microvolt = <880000>;
+			regulator-max-microvolt = <920000>;
+		};
+
+		pmr735a_l6: ldo6 {
+			regulator-name = "pmr735a_l6";
+			regulator-min-microvolt = <1200000>;
+			regulator-max-microvolt = <1200000>;
+		};
+
+		pmr735a_l7: ldo7 {
+			regulator-name = "pmr735a_l7";
+			regulator-min-microvolt = <2800000>;
+			regulator-max-microvolt = <2800000>;
+		};
+	};
+};
+
+&gpi_dma0 {
+	status = "okay";
+};
+
+&gpi_dma1 {
+	status = "okay";
+};
+
+&gpi_dma2 {
+	status = "okay";
+};
+
+&i2c5 {
+	clock-frequency = <100000>;
+	status = "okay";
+
+	/* fsa4480@42 qcom,fsa4480-i2c */
+
+	nxp_eusb2_repeater: eusb2_repeater@4f {
+		compatible = "nxp,eusb2-repeater";
+		reg = <0x4f>;
+		vdd18-supply = <&pm8350_s10>;
+		vdd3-supply = <&pm8350_l2>;
+		reset-gpio = <&pm8350c_gpios 7 GPIO_ACTIVE_HIGH>;
+		pinctrl-names = "default";
+		pinctrl-0 = <&eusb2_reset_ctrl_default>;
+		#phy-cells = <0>;
+		qcom,param-override-seq =
+				/* Rx squelch detection threshold to 110mV; default is 125mV */
+				<0x40 0x06
+				/*
+				 * Tx Deemphasis to 2dB, Tx Deemphasis bit duration to 0.8UI;
+				 * default is 0 for both
+				 */
+				0x22 0x07
+				/* Output Voltage Swing to 500mV; default is 450mV */
+				0x63 0x08>;
+	};
+
+	/* wl2866d@28 xiaomi,wl2866d */
+};
+
+&i2c9 {
+	clock-frequency = <1000000>;
+	status = "okay";
+
+	/* nq@28 qcom,sn-nci */
+};
+
+&spi13 {
+	clock-frequency = <19200000>;
+	status = "okay";
+
+	/* ir-spi@0 ir-spi */
+};
+
+&i2c15 {
+	clock-frequency = <1000000>;
+	status = "okay";
+
+	/* cs35l41@40 cirrus,cs35l41 */
+	/* cs35l41@42 cirrus,cs35l41 */
+};
+
+&pm8350c_gpios {
+	eusb2_reset_ctrl_default: eusb2_reset_ctrl_default {
+		pins = "gpio7";
+		function = "normal";
+		input-enable;
+		output-enable;
+		bias-disable;
+		power-source = <1>;	/* 1.8V */
+		qcom,drive-strength = <2>;
+	};
+};
+
+&qupv3_id_0 {
+	status = "okay";
+};
+
+&qupv3_id_1 {
+	status = "okay";
+};
+
+&qupv3_id_2 {
+	status = "okay";
+};
+
+&tlmm {
+	gpio-reserved-ranges = <28 4>;
+};
+
+&ufs_mem_hc {
+	status = "okay";
+
+	reset-gpios = <&tlmm 210 GPIO_ACTIVE_LOW>;
+
+	vcc-supply = <&pm8350_l7>;
+	vcc-max-microamp = <1100000>;
+	vccq-supply = <&pm8350_l9>;
+	vccq-max-microamp = <1200000>;
+};
+
+&ufs_mem_phy {
+	status = "okay";
+
+	vdda-phy-supply = <&pm8350_l5>;
+	vdda-pll-supply = <&pm8350c_l10>;
+};
+
+&usb_1 {
+	/* USB 2.0 only */
+	qcom,select-utmi-as-pipe-clk;
+	status = "okay";
+};
+
+&usb_1_dwc3 {
+	dr_mode = "peripheral";
+	maximum-speed = "high-speed";
+	/* Remove USB3 phy */
+	phys = <&usb_1_hsphy>;
+	phy-names = "usb2-phy";
+};
+
+&usb_1_hsphy {
+	vdd-supply = <&pm8350_l5>;
+	vdda12-supply = <&pm8350c_l10>;
+
+	phys = <&nxp_eusb2_repeater>;
+
+	status = "okay";
+};

From 90c1b5fc58cb2a1912be16168f3f1e7c7eb67a81 Mon Sep 17 00:00:00 2001
From: Jens Reidel <adrian@travitia.xyz>
Date: Fri, 2 Feb 2024 04:53:20 +0100
Subject: [PATCH 699/707] WIP: arm64: dts: qcom: sm8450-xiaomi-cupid: Enable
 wlan

Signed-off-by: Jens Reidel <adrian@travitia.xyz>
---
 .../boot/dts/qcom/sm8450-xiaomi-cupid.dts     | 60 +++++++++++--------
 arch/arm64/boot/dts/qcom/sm8450.dtsi          | 10 ++++
 2 files changed, 45 insertions(+), 25 deletions(-)

diff --git a/arch/arm64/boot/dts/qcom/sm8450-xiaomi-cupid.dts b/arch/arm64/boot/dts/qcom/sm8450-xiaomi-cupid.dts
index 7ce2ee99f2644f..11abd8efcdf8a1 100644
--- a/arch/arm64/boot/dts/qcom/sm8450-xiaomi-cupid.dts
+++ b/arch/arm64/boot/dts/qcom/sm8450-xiaomi-cupid.dts
@@ -170,23 +170,6 @@
 		regulator-always-on;
 		regulator-boot-on;
 	};
-
-	/* This is a hack and taken from qcom,cnss-qca6490 in downstream */
-	wlan_regulator: wlan-regulator {
-		compatible = "regulator-wlan";
-		regulator-name = "wlan";
-
-		enable-gpio = <&tlmm 80 GPIO_ACTIVE_HIGH>;
-
-		supply-count = <6>;
-
-		vin0-supply = <&pmr735a_s2>; // vdd-wlan-aon
-		vin1-supply = <&pm8350_s11>; // vdd-wlan-dig
-		vin2-supply = <&pm8350_s10>; // vdd-wlan-io
-		vin3-supply = <&pm8350c_s1>; // vdd-wlan-rfa1
-		vin4-supply = <&pm8350_s12>; // vdd-wlan-rfa2
-		vin5-supply = <&pmr735a_l7>; // wlan-ant-switch
-	};
 };
 
 
@@ -717,18 +700,30 @@
 };
 
 &pcie0 {
-	/*
-	vdda-supply = <&wlan_regulator>;
+	status = "okay";
+};
 
-	wake-gpios = <&tlmm 96 GPIO_ACTIVE_HIGH>;
-	perst-gpios = <&tlmm 94 GPIO_ACTIVE_LOW>;
+/* TODO: Add support for qca6490-pmu and use that as the supplies so it will power on properly
+&pcieport0 {
+	wifi@0 {
+		compatible = "pci17cb,1103";
+		reg = <0x10000 0x0 0x0 0x0 0x0>;
 
-	pinctrl-0 = <&pcie0_default_state>, <&pmk8550_sleep_clk>;
-	pinctrl-names = "default";
-	*/
+		pinctrl-names = "default", "sleep";
+		pinctrl-0 = <&cnss_wlan_en_active>;
+		pinctrl-1 = <&cnss_wlan_en_sleep>;
 
-	status = "okay";
+		enable-gpios = <&tlmm 80 GPIO_ACTIVE_HIGH>;
+
+		vddio-supply = <&pm8350_s10>;
+		vdd-supply = <&pmr735a_l7>; // actually wlan-ant-switch-supply
+		vdd-aon-suppply = <&pmr735a_s2>;
+		vdd-dig-supply = <&pm8350_s11>;
+		vdd-rfa1-supply = <&pm8350c_s1>;
+		vdd-rfa2-supply = <&pm8350_s12>;
+	};
 };
+/*
 
 &pcie0_phy {
 	vdda-phy-supply = <&pm8350_l5>;
@@ -860,6 +855,21 @@
 		bias-disable;
 		output-low;
 	};
+
+	cnss_wlan_en_active: cnss-wlan-en-active-state {
+		pins = "gpio80";
+		function = "gpio";
+		drive-strength = <16>;
+		bias-pull-up;
+	};
+
+	cnss_wlan_en_sleep: cnss-wlan-en-sleep-state {
+		pins = "gpio80";
+		function = "gpio";
+		drive-strength = <2>;
+		bias-pull-down;
+		output-low;
+	};
 };
 
 &pm8350_gpios {
diff --git a/arch/arm64/boot/dts/qcom/sm8450.dtsi b/arch/arm64/boot/dts/qcom/sm8450.dtsi
index ba9f4a32bcd6c9..bd7622f3ed2f94 100644
--- a/arch/arm64/boot/dts/qcom/sm8450.dtsi
+++ b/arch/arm64/boot/dts/qcom/sm8450.dtsi
@@ -1850,6 +1850,16 @@
 			pinctrl-0 = <&pcie0_default_state>;
 
 			status = "disabled";
+
+			pcieport0: pcie@0 {
+				device_type = "pci";
+				reg = <0x0 0x0 0x0 0x0 0x0>;
+				#address-cells = <3>;
+				#size-cells = <2>;
+				ranges;
+
+				bus-range = <0x01 0xff>;
+			};
 		};
 
 		pcie0_phy: phy@1c06000 {

From f914f549b0017329d92e974f6a1bb9e1db149acc Mon Sep 17 00:00:00 2001
From: Jens Reidel <adrian@travitia.xyz>
Date: Fri, 2 Feb 2024 07:34:29 +0100
Subject: [PATCH 700/707] cupid: Improve audio

Signed-off-by: Jens Reidel <adrian@travitia.xyz>
---
 .../boot/dts/qcom/sm8450-xiaomi-cupid.dts     | 153 ++++++++++++++----
 1 file changed, 123 insertions(+), 30 deletions(-)

diff --git a/arch/arm64/boot/dts/qcom/sm8450-xiaomi-cupid.dts b/arch/arm64/boot/dts/qcom/sm8450-xiaomi-cupid.dts
index 11abd8efcdf8a1..46317363c2bbc2 100644
--- a/arch/arm64/boot/dts/qcom/sm8450-xiaomi-cupid.dts
+++ b/arch/arm64/boot/dts/qcom/sm8450-xiaomi-cupid.dts
@@ -584,13 +584,13 @@
 		mode-switch;
 		orientation-switch;
 
-		/*
 		port {
+			/*
 			fsa4480_sbu_mux: endpoint {
 				remote-endpoint = <&pmic_glink_sbu>;
 			};
+			*/
 		};
-		*/
 	};
 
 	/* nq @ 64 */
@@ -723,7 +723,7 @@
 		vdd-rfa2-supply = <&pm8350_s12>;
 	};
 };
-/*
+*/
 
 &pcie0_phy {
 	vdda-phy-supply = <&pm8350_l5>;
@@ -848,6 +848,68 @@
 		bias-pull-down;
 	};
 
+	tert_tdm_clk_active: tert-tdm-clk-active-state {
+		pins = "gpio121";
+		function = "mi2s2_sck";
+		drive-strength = <6>;
+		bias-disable;
+		output-high;
+	};
+
+	tert_tdm_clk_sleep: tert-tdm-clk-sleep-state {
+		pins = "gpio121";
+		function = "gpio";
+		drive-strength = <2>;
+		bias-pull-down;
+		input-enable;
+	};
+
+	tert_tdm_ws_active: tert-tdm-ws-active-state {
+		pins = "gpio123";
+		function = "mi2s2_ws";
+		drive-strength = <8>;
+		bias-disable;
+		output-high;
+	};
+
+	tert_tdm_ws_sleep: tert-tdm-ws-sleep-state {
+		pins = "gpio123";
+		function = "gpio";
+		drive-strength = <2>;
+		bias-pull-down;
+		input-enable;
+	};
+
+	tert_tdm_din_active: tert-tdm-din-active-state {
+		pins = "gpio122";
+		function = "mi2s2_data0";
+		drive-strength = <8>;
+		bias-disable;
+	};
+
+	tert_tdm_din_sleep: tert-tdm-din-sleep-state {
+		pins = "gpio122";
+		function = "gpio";
+		drive-strength = <2>;
+		bias-pull-down;
+		input-enable;
+	};
+
+	tert_tdm_dout_active: tert-tdm-dout-active-state {
+		pins = "gpio124";
+		function = "mi2s2_data1";
+		drive-strength = <8>;
+		bias-disable;
+	};
+
+	tert_tdm_dout_sleep: tert-tdm-dout-sleep-state {
+		pins = "gpio124";
+		function = "gpio";
+		drive-strength = <2>;
+		bias-pull-down;
+		input-enable;
+	};
+
 	wcd_default: wcd-reset-n-active-state {
 		pins = "gpio43";
 		function = "gpio";
@@ -911,29 +973,62 @@
 
 &sound {
 	compatible = "qcom,sm8450-sndcard";
+	pinctrl-names = "default", "sleep";
+	pinctrl-0 = <&tert_tdm_clk_active &tert_tdm_ws_active
+			&tert_tdm_din_active &tert_tdm_dout_active>;
+	pinctrl-1 = <&tert_tdm_clk_sleep &tert_tdm_ws_sleep
+			&tert_tdm_din_sleep &tert_tdm_dout_sleep>;
 	model = "Xiaomi 12";
 	/*
-	audio-routing = "SpkrLeft IN", "WSA_SPK1 OUT",
-			"SpkrRight IN", "WSA_SPK2 OUT",
-			"IN1_HPHL", "HPHL_OUT",
-			"IN2_HPHR", "HPHR_OUT",
-			"AMIC1", "MIC BIAS1",
-			"AMIC2", "MIC BIAS2",
-			"AMIC3", "MIC BIAS3",
-			"AMIC4", "MIC BIAS3",
-			"AMIC5", "MIC BIAS4",
-			"VA DMIC0", "MIC BIAS3",
-			"VA DMIC1", "MIC BIAS3",
-			"VA DMIC2", "MIC BIAS1",
-			"VA DMIC3", "MIC BIAS1",
-			"TX DMIC0", "MIC BIAS3",
-			"TX DMIC1", "MIC BIAS3",
-			"TX DMIC2", "MIC BIAS1",
-			"TX DMIC3", "MIC BIAS1",
-			"TX SWR_INPUT0", "ADC1_OUTPUT",
-			"TX SWR_INPUT1", "ADC2_OUTPUT",
-			"TX SWR_INPUT2", "ADC3_OUTPUT",
-			"TX SWR_INPUT3", "ADC4_OUTPUT";
+	audio-routing = "AMIC1", "Analog Mic1",
+		"AMIC1", "MIC BIAS1",
+		"AMIC2", "Analog Mic2",
+		"AMIC2", "MIC BIAS2",
+		"AMIC3", "Analog Mic3",
+		"AMIC3", "MIC BIAS3",
+		"AMIC4", "Analog Mic4",
+		"AMIC4", "MIC BIAS3",
+		"AMIC5", "Analog Mic5",
+		"AMIC5", "MIC BIAS4",
+		"VA AMIC1", "Analog Mic1",
+		"VA AMIC1", "VA MIC BIAS1",
+		"VA AMIC2", "Analog Mic2",
+		"VA AMIC2", "VA MIC BIAS2",
+		"VA AMIC3", "Analog Mic3",
+		"VA AMIC3", "VA MIC BIAS3",
+		"VA AMIC4", "Analog Mic4",
+		"VA AMIC4", "VA MIC BIAS3",
+		"VA AMIC5", "Analog Mic5",
+		"VA AMIC5", "VA MIC BIAS4",
+		"TX DMIC0", "Digital Mic0",
+		"Digital Mic0", "MIC BIAS3",
+		"TX DMIC1", "Digital Mic1",
+		"Digital Mic1", "MIC BIAS3",
+		"TX DMIC2", "Digital Mic2",
+		"Digital Mic2", "MIC BIAS1",
+		"TX DMIC3", "Digital Mic3",
+		"Digital Mic3", "MIC BIAS1",
+		"IN1_HPHL", "HPHL_OUT",
+		"IN2_HPHR", "HPHR_OUT",
+		"IN3_AUX", "AUX_OUT",
+		"RX_TX DEC0_INP", "TX DEC0 MUX",
+		"RX_TX DEC1_INP", "TX DEC1 MUX",
+		"RX_TX DEC2_INP", "TX DEC2 MUX",
+		"RX_TX DEC3_INP", "TX DEC3 MUX",
+		"TX SWR_INPUT", "WCD_TX_OUTPUT",
+		"VA SWR_INPUT", "VA_SWR_CLK",
+		"VA SWR_INPUT", "WCD_TX_OUTPUT",
+		"VA_AIF1 CAP", "VA_SWR_CLK",
+		"VA_AIF2 CAP", "VA_SWR_CLK",
+		"VA_AIF3 CAP", "VA_SWR_CLK",
+		"VA DMIC0", "Digital Mic0",
+		"VA DMIC1", "Digital Mic1",
+		"VA DMIC2", "Digital Mic2",
+		"VA DMIC3", "Digital Mic3",
+		"Digital Mic0", "VA MIC BIAS3",
+		"Digital Mic1", "VA MIC BIAS3",
+		"Digital Mic2", "VA MIC BIAS1",
+		"Digital Mic3", "VA MIC BIAS1";
 	*/
 
 	wcd-playback-dai-link {
@@ -968,23 +1063,21 @@
 		};
 	};
 
-	/*
-	wsa-dai-link {
-		link-name = "WSA Playback";
+	speaker-playback-dai-link {
+		link-name = "Primary Spkr Playback";
 
 		cpu {
-			sound-dai = <&q6apmbedai WSA_CODEC_DMA_RX_0>;
+			sound-dai = <&q6apmbedai TERTIARY_TDM_RX_0>;
 		};
 
 		codec {
-			sound-dai = <&cs35l41_t>, <&cs35l41_b>, <&wsamacro 0>; //, <&swr0 0>, <&wsamacro 0>;
+			sound-dai = <&cs35l41_t 0>, <&cs35l41_b 0>;
 		};
 
 		platform {
 			sound-dai = <&q6apm>;
 		};
 	};
-	*/
 
 	va-dai-link {
 		link-name = "VA Capture";

From 746f884559baaf8608f12782a066f9cba7c7565d Mon Sep 17 00:00:00 2001
From: BotchedRPR <thenxguy0@gmail.com>
Date: Fri, 2 Feb 2024 09:37:04 +0000
Subject: [PATCH 701/707] [DO NOT MERGE] [WIP] arm64: dts: qcom: Add
 device-tree for Samsung Galaxy Z Fold4

---
 arch/arm64/boot/dts/qcom/Makefile             |   1 +
 .../boot/dts/qcom/sm8475-samsung-q4q.dts      | 738 ++++++++++++++++++
 2 files changed, 739 insertions(+)
 create mode 100644 arch/arm64/boot/dts/qcom/sm8475-samsung-q4q.dts

diff --git a/arch/arm64/boot/dts/qcom/Makefile b/arch/arm64/boot/dts/qcom/Makefile
index ffc6bd5faa7e30..ebcc9c5a41ad78 100644
--- a/arch/arm64/boot/dts/qcom/Makefile
+++ b/arch/arm64/boot/dts/qcom/Makefile
@@ -235,6 +235,7 @@ dtb-$(CONFIG_ARCH_QCOM)	+= sm8450-sony-xperia-nagara-pdx223.dtb
 dtb-$(CONFIG_ARCH_QCOM)	+= sm8450-sony-xperia-nagara-pdx224.dtb
 dtb-$(CONFIG_ARCH_QCOM)	+= sm8450-xiaomi-cupid.dtb
 dtb-$(CONFIG_ARCH_QCOM)	+= sm8475-nothing-pong.dtb
+dtb-$(CONFIG_ARCH_QCOM) += sm8475-samsung-q4q.dtb
 dtb-$(CONFIG_ARCH_QCOM)	+= sm8475-xiaomi-diting.dtb
 dtb-$(CONFIG_ARCH_QCOM)	+= sm8550-hdk.dtb
 dtb-$(CONFIG_ARCH_QCOM)	+= sm8550-mtp.dtb
diff --git a/arch/arm64/boot/dts/qcom/sm8475-samsung-q4q.dts b/arch/arm64/boot/dts/qcom/sm8475-samsung-q4q.dts
new file mode 100644
index 00000000000000..ed2fab26ac5d8d
--- /dev/null
+++ b/arch/arm64/boot/dts/qcom/sm8475-samsung-q4q.dts
@@ -0,0 +1,738 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024, The Linux Foundation. All rights reserved.
+ */
+
+/dts-v1/;
+
+#include <dt-bindings/arm/qcom,ids.h>
+#include <dt-bindings/regulator/qcom,rpmh-regulator.h>
+#include <dt-bindings/leds/common.h>
+
+#include "sm8475.dtsi"
+#include "pm8350.dtsi"
+#include "pm8350b.dtsi"
+#include "pm8350c.dtsi"
+#include "pm8450.dtsi"
+#include "pmk8350.dtsi"
+#include "pmr735a.dtsi"
+
+/ {
+	model = "Galaxy Z Fold4";
+	chassis-type = "phablet";
+
+	qcom,msm-id = <530 0x10000 0>;
+
+	chosen {
+
+		framebuffer0: framebuffer@b8000000 {
+			compatible = "simple-framebuffer";
+			reg = <0 0xb8000000 0 (2176 * 1812 * 4)>;
+			width = <1812>;
+			height = <2176>;
+			stride = <(1812 * 4)>;
+			format = "a8r8g8b8";
+			status= "okay";
+		};
+
+	};
+
+	reserved-memory {
+		/* Clear this up, this is awful */
+		tme_crash_dump_region@808a0000 {
+			reg = <0x00 0x808a0000 0x00 0x40000>;
+			no-map;
+		};
+
+		cpusys_vm_region@e0600000 {
+			reg = <0x00 0xe0600000 0x00 0x400000>;
+			no-map;
+		};
+
+		smem_region@80900000 {
+			reg = <0x00 0x80900000 0x00 0x200000>;
+			no-map;
+		};
+
+		hyp_region@80000000 {
+			reg = <0x00 0x80000000 0x00 0x600000>;
+			no-map;
+		};
+
+		sec_qcom_rdx_bootdev_region@800C00000 {
+			reg = <0x08 0xc00000 0x00 0xad00000>;
+		};
+
+		video_region@85700000 {
+			reg = <0x00 0x83e00000 0x00 0x700000>;
+			no-map;
+		};
+
+		cdsp_secure_heap_region@80c00000 {
+			reg = <0x00 0x80c00000 0x00 0x600000>;
+			no-map;
+		};
+
+		trust_ui_vm_region@e0b00000 {
+			reg = <0x00 0xe0b00000 0x00 0x45f2000>;
+			no-map;
+		};
+
+		ipa_gsi_region@8b910000 {
+			reg = <0x00 0x8b910000 0x00 0xa000>;
+			no-map;
+		};
+
+		mpss_region@8bc00000 {
+			reg = <0x00 0x8bc00000 0x00 0x13200000>;
+			no-map;
+		};
+
+		spss_region_region@8ba00000 {
+			reg = <0x00 0x8ba00000 0x00 0x180000>;
+			no-map;
+		};
+
+		uh_guest_region {
+			reg = <0x00 0xb1000000 0x00 0x1a00000>;
+		};
+
+		splash_region {
+			reg = <0x00 0xb8000000 0x00 0x2b00000>;
+			label = "cont_splash_region";
+		};
+
+		tags_region@e8900000 {
+			reg = <0x00 0xe8900000 0x00 0x1200000>;
+			no-map;
+		};
+
+		camera_region@9f500000 {
+			reg = <0x00 0x9f500000 0x00 0x800000>;
+			no-map;
+		};
+
+		sec_debug_region_log@8001FF000 {
+			reg = <0x08 0x1ff000 0x00 0x901000>;
+		};
+
+		xbl_ramdump_region@80640000 {
+			reg = <0x00 0xa7d00000 0x00 0x300000>;
+			no-map;
+		};
+
+		trust_ui_vm_swiotlb@e5100000 {
+			reg = <0x00 0xe5100000 0x00 0x100000>;
+			gunyah-label = <0x12>;
+			no-map;
+		};
+
+		trust_ui_vm_qrtr@e50f3000 {
+			reg = <0x00 0xe50f3000 0x00 0x9000>;
+			no-map;
+		};
+
+		adsp_region@85e00000 {
+			reg = <0x00 0x84500000 0x00 0x3b00000>;
+			no-map;
+		};
+
+		ipa_fw_region@8b900000 {
+			reg = <0x00 0x8b900000 0x00 0x10000>;
+			no-map;
+		};
+
+		kaslr_region {
+			reg = <0x00 0xb01ff000 0x00 0x1000>;
+			no-map = <0x00 0x00>;
+		};
+
+		cvp_region@9ee00000 {
+			reg = <0x00 0x9ee00000 0x00 0x700000>;
+			no-map;
+		};
+
+		tme_log_region@808e0000 {
+			reg = <0x00 0x808e0000 0x00 0x4000>;
+			no-map;
+		};
+
+		aop_cmd_db_region@80860000 {
+			reg = <0x00 0x80860000 0x00 0x20000>;
+			no-map;
+		};
+
+		gpu_microcode_region@8b91a000 {
+			reg = <0x00 0x8b91a000 0x00 0x2000>;
+			no-map;
+		};
+
+		slpi_region@88000000 {
+			reg = <0x00 0x88000000 0x00 0x1700000>;
+			no-map;
+		};
+
+		trust_ui_vm_dump@e50f2000 {
+			reg = <0x00 0xe50f2000 0x00 0x1000>;
+			no-map;
+		};
+
+		uh_heap_region {
+			reg = <0x00 0xb0200000 0x00 0x40000>;
+		};
+
+		uefi_log_region@808e4000 {
+			reg = <0x00 0x808e4000 0x00 0x10000>;
+			no-map;
+		};
+
+		aop_config_region@80880000 {
+			reg = <0x00 0x80880000 0x00 0x20000>;
+			no-map;
+		};
+
+		sec_debug_region_pool@800100000 {
+			reg = <0x08 0x100000 0x00 0xff000>;
+		};
+
+		qtee_region@e9b00000 {
+			reg = <0x00 0xe9b00000 0x00 0x500000>;
+			no-map;
+		};
+
+		cpucp_fw_region@80b00000 {
+			reg = <0x00 0x80b00000 0x00 0x100000>;
+			no-map;
+		};
+
+		xbl_sc_region@a6e00000 {
+			reg = <0x00 0xa6e00000 0x00 0x40000>;
+			no-map;
+		};
+
+		spu_modem_shared_mem@8bbe0000 {
+			reg = <0x00 0x8bbe0000 0x00 0x20000>;
+			no-map;
+		};
+
+		cdsp_region@89900000 {
+			reg = <0x00 0x89900000 0x00 0x2000000>;
+			no-map;
+		};
+
+		aop_image_region@80800000 {
+			reg = <0x00 0x80800000 0x00 0x60000>;
+			no-map;
+		};
+
+		spu_tz_shared_mem@8bb80000 {
+			reg = <0x00 0x8bb80000 0x00 0x60000>;
+			no-map;
+		};
+
+		xbl_dtlog_region@80600000 {
+			reg = <0x00 0x80600000 0x00 0x40000>;
+			no-map;
+		};
+
+		hdm_region@800B01000 {
+			reg = <0x08 0xb01000 0x00 0x1000>;
+			no-map;
+		};
+
+		tz_stat_region@e8800000 {
+			reg = <0x00 0xe8800000 0x00 0x100000>;
+			no-map;
+		};
+	};
+};
+
+
+&apps_rsc {
+	regulators-0 {
+		compatible = "qcom,pm8350-rpmh-regulators";
+		qcom,pmic-id = "b";
+
+		/*
+		vdd-s1-supply = <&vph_pwr>;
+		vdd-s2-supply = <&vph_pwr>;
+		vdd-s3-supply = <&vph_pwr>;
+		vdd-s4-supply = <&vph_pwr>;
+		vdd-s5-supply = <&vph_pwr>;
+		vdd-s6-supply = <&vph_pwr>;
+		vdd-s7-supply = <&vph_pwr>;
+		vdd-s8-supply = <&vph_pwr>;
+		vdd-s9-supply = <&vph_pwr>;
+		vdd-s10-supply = <&vph_pwr>;
+		vdd-s11-supply = <&vph_pwr>;
+		vdd-s12-supply = <&vph_pwr>;
+
+		vdd-l1-l4-supply = <&pm8350_s11>;
+		vdd-l2-l7-supply = <&vreg_bob>;
+		vdd-l3-l5-supply = <&pm8350_s11>;
+		vdd-l6-l9-l10-supply = <&pm8350_s12>;
+		*/
+
+		/*
+		 * ARC regulators:
+		 * s5 - gfx.lvl
+		 * l8 - lcx.lvl
+		 */
+
+		pm8350_s10: smps10 {
+			regulator-name = "pm8350_s10";
+			regulator-min-microvolt = <1800000>;
+			regulator-max-microvolt = <1800000>;
+		};
+
+		pm8350_s11: smps11 {
+			regulator-name = "pm8350_s11";
+			regulator-min-microvolt = <382000>;
+			regulator-max-microvolt = <1170000>;
+		};
+
+		pm8350_s12: smps12 {
+			regulator-name = "pm8350_s12";
+			regulator-min-microvolt = <1224000>;
+			regulator-max-microvolt = <2040000>;
+		};
+
+		pm8350_l1: ldo1 {
+			regulator-name = "pm8350_l1";
+			regulator-min-microvolt = <830000>;
+			regulator-max-microvolt = <920000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350_l2: ldo2 {
+			regulator-name = "pm8350_l2";
+			regulator-min-microvolt = <2700000>;
+			regulator-max-microvolt = <3300000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350_l3: ldo3 {
+			regulator-name = "pm8350_l3";
+			regulator-min-microvolt = <870000>;
+			regulator-max-microvolt = <970000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350_l5: ldo5 {
+			regulator-name = "pm8350_l5";
+			regulator-min-microvolt = <720000>;
+			regulator-max-microvolt = <958000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350_l6: ldo6 {
+			regulator-name = "pm8350_l6";
+			regulator-min-microvolt = <1200000>;
+			regulator-max-microvolt = <1216000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350_l7: ldo7 {
+			regulator-name = "pm8350_l7";
+			regulator-min-microvolt = <2400000>;
+			regulator-max-microvolt = <3544000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350_l9: ldo9 {
+			regulator-name = "pm8350_l9";
+			regulator-min-microvolt = <1200000>;
+			regulator-max-microvolt = <1304000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+	};
+
+	regulators-1 {
+		compatible = "qcom,pm8350c-rpmh-regulators";
+		qcom,pmic-id = "c";
+
+		/*
+		vdd-s1-supply = <&vph_pwr>;
+		vdd-s2-supply = <&vph_pwr>;
+		vdd-s3-supply = <&vph_pwr>;
+		vdd-s4-supply = <&vph_pwr>;
+		vdd-s5-supply = <&vph_pwr>;
+		vdd-s6-supply = <&vph_pwr>;
+		vdd-s7-supply = <&vph_pwr>;
+		vdd-s8-supply = <&vph_pwr>;
+		vdd-s9-supply = <&vph_pwr>;
+		vdd-s10-supply = <&vph_pwr>;
+
+		vdd-l1-l12-supply = <&pm8350c_s1>;
+		vdd-l2-l8-supply = <&pm8350c_s1>;
+		vdd-l3-l4-l5-l7-l13-supply = <&vreg_bob>;
+		vdd-l6-l9-l11-supply = <&vreg_bob>;
+		vdd-l10-supply = <&pm8350_s12>;
+
+		vdd-bob-supply = <&vph_pwr>;
+		*/
+
+		/*
+		 * ARC regulators:
+		 * s2 - mxc.lvl
+		 * s4 - mss.lvl
+		 * s6 - cx.lvl
+		 */
+
+		pm8350c_s1: smps1 {
+			regulator-name = "pm8350c_s1";
+			regulator-min-microvolt = <1900000>;
+			regulator-max-microvolt = <2024000>;
+		};
+
+		pm8350c_s10: smps10 {
+			regulator-name = "pm8350c_s10";
+			regulator-min-microvolt = <1052000>;
+			regulator-max-microvolt = <1170000>;
+		};
+
+		vreg_bob: bob {
+			regulator-name = "vreg_bob";
+			regulator-min-microvolt = <3008000>;
+			regulator-max-microvolt = <3960000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_AUTO>;
+		};
+
+		pm8350c_l2: ldo2 {
+			regulator-name = "pm8350c_l2";
+			regulator-min-microvolt = <1800000>;
+			regulator-max-microvolt = <1980000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l3: ldo3 {
+			regulator-name = "pm8350c_l3";
+			regulator-min-microvolt = <2700000>;
+			regulator-max-microvolt = <3544000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l4: ldo4 {
+			regulator-name = "pm8350c_l4";
+			regulator-min-microvolt = <1620000>;
+			regulator-max-microvolt = <3300000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l5: ldo5 {
+			regulator-name = "pm8350c_l5";
+			regulator-min-microvolt = <1620000>;
+			regulator-max-microvolt = <3300000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l6: ldo6 {
+			regulator-name = "pm8350c_l6";
+			regulator-min-microvolt = <1800000>;
+			regulator-max-microvolt = <3544000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l7: ldo7 {
+			regulator-name = "pm8350c_l7";
+			regulator-min-microvolt = <3000000>;
+			regulator-max-microvolt = <3544000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l8: ldo8 {
+			regulator-name = "pm8350c_l8";
+			regulator-min-microvolt = <1800000>;
+			regulator-max-microvolt = <2000000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l9: ldo9 {
+			regulator-name = "pm8350c_l9";
+			regulator-min-microvolt = <2700000>;
+			regulator-max-microvolt = <3544000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l10: ldo10 {
+			regulator-name = "pm8350c_l10";
+			regulator-min-microvolt = <1200000>;
+			regulator-max-microvolt = <1304000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l12: ldo12 {
+			regulator-name = "pm8350c_l12";
+			regulator-min-microvolt = <1800000>;
+			regulator-max-microvolt = <1980000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8350c_l13: ldo13 {
+			regulator-name = "pm8350c_l13";
+			regulator-min-microvolt = <2700000>;
+			regulator-max-microvolt = <3544000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+	};
+
+	regulators-2 {
+		compatible = "qcom,pm8450-rpmh-regulators";
+		qcom,pmic-id = "h";
+
+		/*
+		vdd-s1-supply = <&vph_pwr>;
+		vdd-s2-supply = <&vph_pwr>;
+		vdd-s3-supply = <&vph_pwr>;
+		vdd-s4-supply = <&vph_pwr>;
+		vdd-s5-supply = <&vph_pwr>;
+		vdd-s6-supply = <&vph_pwr>;
+
+		vdd-l2-supply = <&vreg_bob>;
+		vdd-l3-supply = <&vreg_bob>;
+		vdd-l4-supply = <&vreg_bob>;
+		*/
+
+		/*
+		 * ARC regulators:
+		 * S2 - ebi.lvl
+		 * S4 - mmcx.lvl
+		 * S6 - mx.lvl
+		 * L1 - lmx.lvl
+		 */
+
+		pm8450_s3: smps3 {
+			regulator-name = "pm8450_s3";
+			regulator-min-microvolt = <470000>;
+			regulator-max-microvolt = <570000>;
+		};
+
+		pm8450_l2: ldo2 {
+			regulator-name = "pm8450_l2";
+			regulator-min-microvolt = <820000>;
+			regulator-max-microvolt = <958000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+
+		pm8450_l3: ldo3 {
+			regulator-name = "pm8450_l3";
+			regulator-min-microvolt = <866000>;
+			regulator-max-microvolt = <958000>;
+			regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+		};
+	};
+
+	regulators-3 {
+		compatible = "qcom,pmr735a-rpmh-regulators";
+		qcom,pmic-id = "e";
+
+		/*
+		vdd-s1-supply = <&vph_pwr>;
+		vdd-s2-supply = <&vph_pwr>;
+		vdd-s3-supply = <&vph_pwr>;
+
+		vdd-l1-l2-supply = <&pmr735a_s2>;
+		vdd-l3-supply = <&pmr735a_s1>;
+		vdd-l4-supply = <&pm8350c_s1>;
+		vdd-l5-l6-supply = <&pm8350c_s1>;
+		vdd-l7-bob-supply = <&vreg_bob>;
+		*/
+
+		pmr735a_s2: smps2 {
+			regulator-name = "pmr735a_s2";
+			regulator-min-microvolt = <500000>;
+			regulator-max-microvolt = <1040000>;
+		};
+
+		pmr735a_s3: smps3 {
+			regulator-name = "pmr735a_s3";
+			regulator-min-microvolt = <300000>;
+			regulator-max-microvolt = <2352000>;
+		};
+
+		pmr735a_l1: ldo1 {
+			regulator-name = "pmr735a_l1";
+			regulator-min-microvolt = <800000>;
+			regulator-max-microvolt = <880000>;
+		};
+
+		pmr735a_l2: ldo2 {
+			regulator-name = "pmr735a_l2";
+			regulator-min-microvolt = <480000>;
+			regulator-max-microvolt = <1200000>;
+		};
+
+		pmr735a_l3: ldo3 {
+			regulator-name = "pmr735a_l3";
+			regulator-min-microvolt = <1200000>;
+			regulator-max-microvolt = <1200000>;
+		};
+
+		pmr735a_l4: ldo4 {
+			regulator-name = "pmr735a_l4";
+			regulator-min-microvolt = <1776000>;
+			regulator-max-microvolt = <1776000>;
+		};
+
+		pmr735a_l5: ldo5 {
+			regulator-name = "pmr735a_l5";
+			regulator-min-microvolt = <880000>;
+			regulator-max-microvolt = <920000>;
+		};
+
+		pmr735a_l6: ldo6 {
+			regulator-name = "pmr735a_l6";
+			regulator-min-microvolt = <1200000>;
+			regulator-max-microvolt = <1200000>;
+		};
+
+		pmr735a_l7: ldo7 {
+			regulator-name = "pmr735a_l7";
+			regulator-min-microvolt = <2800000>;
+			regulator-max-microvolt = <2800000>;
+		};
+	};
+};
+
+&gpi_dma0 {
+	status = "okay";
+};
+
+&gpi_dma1 {
+	status = "okay";
+};
+
+&gpi_dma2 {
+	status = "okay";
+};
+
+&i2c5 {
+	clock-frequency = <100000>;
+	status = "okay";
+
+	/* nq@64 rtc6226 */
+	/* fsa4480@42 qcom,fsa4480-i2c */
+	/* redriver@1c onnn,redriver */
+	/* aw20036_led@3a awinic,aw20036_led */
+	/* eusb2_repeater@4f nxp,eusb2-repeater */
+
+	nxp_eusb2_repeater: eusb2_repeater@4f {
+		compatible = "nxp,eusb2-repeater";
+		reg = <0x4f>;
+		vdd18-supply = <&pm8350_s10>;
+		vdd3-supply = <&pm8350_l2>;
+		reset-gpio = <&pm8350c_gpios 7 GPIO_ACTIVE_HIGH>;
+		pinctrl-names = "default";
+		pinctrl-0 = <&eusb2_reset_ctrl_default>;
+		#phy-cells = <0>;
+		qcom,param-override-seq =
+				/* Rx squelch detection threshold to 110mV; default is 125mV */
+				<0x40 0x06
+				/*
+				 * Tx Deemphasis to 2dB, Tx Deemphasis bit duration to 0.8UI;
+				 * default is 0 for both
+				 */
+				0x22 0x07
+				/* Output Voltage Swing to 550mV; default is 450mV */
+				0x64 0x08>;
+	};
+};
+
+&i2c9 {
+	clock-frequency = <1000000>;
+	status = "okay";
+
+	/* nq@28(ts) qcom,sn-nci */
+};
+
+&i2c13 {
+	clock-frequency = <1000000>;
+	status = "okay";
+
+	/* tfa98xx@34 */
+	/* tfa98xx@35 */
+};
+
+&i2c18 {
+	clock-frequency = <1000000>;
+	status = "okay";
+
+	/* haptic_hv@5a awinic,aw8692x */
+};
+
+&pm8350c_gpios {
+	eusb2_reset_ctrl_default: eusb2_reset_ctrl_default {
+		pins = "gpio7";
+		function = "normal";
+		input-enable;
+		output-enable;
+		bias-disable;
+		power-source = <1>;	/* 1.8V */
+		qcom,drive-strength = <2>;
+	};
+};
+
+&qupv3_id_0 {
+	status = "okay";
+};
+
+&qupv3_id_1 {
+	status = "okay";
+};
+
+&qupv3_id_2 {
+	status = "okay";
+};
+
+&spi4 {
+	clock-frequency = <20000000>;
+	status = "okay";
+
+	/* goodix-berlin@0 goodix,brl-d */
+};
+
+&tlmm {
+	gpio-reserved-ranges = <28 4>;
+};
+
+&ufs_mem_hc {
+	status = "okay";
+
+	reset-gpios = <&tlmm 210 GPIO_ACTIVE_LOW>;
+
+	vcc-supply = <&pm8350_l7>;
+	vcc-max-microamp = <1100000>;
+	vccq-supply = <&pm8350_l9>;
+	vccq-max-microamp = <1200000>;
+	vdd-hba-supply = <&pm8350_l9>;
+};
+
+&ufs_mem_phy {
+	status = "okay";
+
+	vdda-phy-supply = <&pm8350_l5>;
+	vdda-pll-supply = <&pm8350_l6>;
+};
+
+&usb_1 {
+	/* USB 2.0 only */
+	qcom,select-utmi-as-pipe-clk;
+	status = "okay";
+};
+
+&usb_1_dwc3 {
+	dr_mode = "peripheral";
+	maximum-speed = "high-speed";
+	/* Remove USB3 phy */
+	phys = <&usb_1_hsphy>;
+	phy-names = "usb2-phy";
+};
+
+&usb_1_hsphy {
+	vdd-supply = <&pm8350_l5>;
+	vdda12-supply = <&pm8350c_l10>;
+
+	phys = <&nxp_eusb2_repeater>;
+
+	status = "okay";
+};

From a9b8271c0b1dcfe02daaa03f4fa73616edc61a04 Mon Sep 17 00:00:00 2001
From: BotchedRPR <thenxguy0@gmail.com>
Date: Fri, 2 Feb 2024 10:27:58 +0000
Subject: [PATCH 702/707] q4q: disable framebuffer

---
 arch/arm64/boot/dts/qcom/sm8475-samsung-q4q.dts | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/arch/arm64/boot/dts/qcom/sm8475-samsung-q4q.dts b/arch/arm64/boot/dts/qcom/sm8475-samsung-q4q.dts
index ed2fab26ac5d8d..458b0db856e146 100644
--- a/arch/arm64/boot/dts/qcom/sm8475-samsung-q4q.dts
+++ b/arch/arm64/boot/dts/qcom/sm8475-samsung-q4q.dts
@@ -23,20 +23,6 @@
 
 	qcom,msm-id = <530 0x10000 0>;
 
-	chosen {
-
-		framebuffer0: framebuffer@b8000000 {
-			compatible = "simple-framebuffer";
-			reg = <0 0xb8000000 0 (2176 * 1812 * 4)>;
-			width = <1812>;
-			height = <2176>;
-			stride = <(1812 * 4)>;
-			format = "a8r8g8b8";
-			status= "okay";
-		};
-
-	};
-
 	reserved-memory {
 		/* Clear this up, this is awful */
 		tme_crash_dump_region@808a0000 {

From db07b3ce3597f2aa5c027aa2e7178a92d23ccdc4 Mon Sep 17 00:00:00 2001
From: Danila Tikhonov <danila@jiaxyga.com>
Date: Fri, 2 Feb 2024 19:33:11 +0300
Subject: [PATCH 703/707] arm64: dts: qcom: nothing-pong: add panel

---
 .../boot/dts/qcom/sm8475-nothing-pong.dts     | 48 +++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/arch/arm64/boot/dts/qcom/sm8475-nothing-pong.dts b/arch/arm64/boot/dts/qcom/sm8475-nothing-pong.dts
index 4be3995041fcdc..e83cc7a064c96e 100644
--- a/arch/arm64/boot/dts/qcom/sm8475-nothing-pong.dts
+++ b/arch/arm64/boot/dts/qcom/sm8475-nothing-pong.dts
@@ -417,6 +417,10 @@
 	};
 };
 
+&dispcc {
+	status = "okay";
+};
+
 &gpi_dma0 {
 	status = "okay";
 };
@@ -429,6 +433,14 @@
 	status = "okay";
 };
 
+&gpu {
+	status = "okay";
+
+	zap-shader {
+		firmware-name = "qcom/a730_zap.mbn";
+	};
+};
+
 &i2c5 {
 	clock-frequency = <100000>;
 	status = "okay";
@@ -483,6 +495,42 @@
 	/* haptic_hv@5a awinic,aw8692x */
 };
 
+&mdss {
+	status = "okay";
+};
+
+&mdss_dsi0 {
+	vdda-supply = <&pm8350_l6>;
+	status = "okay";
+
+	panel@0 {
+		compatible = "mdss,nt37705-visionox-amoled-120hz";
+		reg = <0>;
+
+		reset-gpios = <&tlmm 0 GPIO_ACTIVE_HIGH>;
+
+		vddd-supply = <&pm8350c_l10>;
+		vci-supply = <&pm8350c_l13>;
+		vddio-supply = <&pm8350c_l12>;
+
+		port {
+			panel_in: endpoint {
+				remote-endpoint = <&mdss_dsi0_out>;
+			};
+		};
+	};
+};
+
+&mdss_dsi0_out {
+	data-lanes = <0 1 2 3>;
+	remote-endpoint = <&panel_in>;
+};
+
+&mdss_dsi0_phy {
+	vdds-supply = <&pm8350_l5>;
+	status = "okay";
+};
+
 &pm8350c_gpios {
 	eusb2_reset_ctrl_default: eusb2_reset_ctrl_default {
 		pins = "gpio7";

From 4f932c0044cb22b1f3bfb86721120c5640a30f52 Mon Sep 17 00:00:00 2001
From: Danila Tikhonov <danila@jiaxyga.com>
Date: Fri, 2 Feb 2024 20:27:13 +0300
Subject: [PATCH 704/707] include: drm_mipi_dsi: add support for 10bit panels

Signed-off-by: Danila Tikhonov <danila@jiaxyga.com>
---
 include/drm/drm_mipi_dsi.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/include/drm/drm_mipi_dsi.h b/include/drm/drm_mipi_dsi.h
index c0aec0d4d664e7..8b58bb02773366 100644
--- a/include/drm/drm_mipi_dsi.h
+++ b/include/drm/drm_mipi_dsi.h
@@ -141,6 +141,7 @@ struct mipi_dsi_host *of_find_mipi_dsi_host_by_node(struct device_node *node);
 #define MIPI_DSI_HS_PKT_END_ALIGNED	BIT(12)
 
 enum mipi_dsi_pixel_format {
+	MIPI_DSI_FMT_RGB101010,
 	MIPI_DSI_FMT_RGB888,
 	MIPI_DSI_FMT_RGB666,
 	MIPI_DSI_FMT_RGB666_PACKED,
@@ -212,6 +213,9 @@ struct mipi_dsi_device {
 static inline int mipi_dsi_pixel_format_to_bpp(enum mipi_dsi_pixel_format fmt)
 {
 	switch (fmt) {
+	case MIPI_DSI_FMT_RGB101010:
+		return 30;
+
 	case MIPI_DSI_FMT_RGB888:
 	case MIPI_DSI_FMT_RGB666:
 		return 24;

From 326b646efcb726ebf1d90320c90e9403d0cb44c8 Mon Sep 17 00:00:00 2001
From: BotchedRPR <thenxguy0@gmail.com>
Date: Fri, 2 Feb 2024 19:39:57 +0000
Subject: [PATCH 705/707] samsung-q4q: fixups, by JIaxyga

Thank you! This should fix up i2c, and clean the overall device tree up.
---
 .../boot/dts/qcom/sm8475-samsung-q4q.dts      | 49 +++----------------
 1 file changed, 7 insertions(+), 42 deletions(-)

diff --git a/arch/arm64/boot/dts/qcom/sm8475-samsung-q4q.dts b/arch/arm64/boot/dts/qcom/sm8475-samsung-q4q.dts
index 458b0db856e146..a82b5b706412ed 100644
--- a/arch/arm64/boot/dts/qcom/sm8475-samsung-q4q.dts
+++ b/arch/arm64/boot/dts/qcom/sm8475-samsung-q4q.dts
@@ -8,6 +8,7 @@
 #include <dt-bindings/arm/qcom,ids.h>
 #include <dt-bindings/regulator/qcom,rpmh-regulator.h>
 #include <dt-bindings/leds/common.h>
+#include <dt-bindings/pinctrl/qcom,pmic-gpio.h>
 
 #include "sm8475.dtsi"
 #include "pm8350.dtsi"
@@ -21,7 +22,7 @@
 	model = "Galaxy Z Fold4";
 	chassis-type = "phablet";
 
-	qcom,msm-id = <530 0x10000 0>;
+	qcom,msm-id = <QCOM_ID_SM8475 0x10000>;
 
 	reserved-memory {
 		/* Clear this up, this is awful */
@@ -584,32 +585,18 @@
 	status = "okay";
 };
 
-&gpi_dma1 {
-	status = "okay";
-};
-
-&gpi_dma2 {
-	status = "okay";
-};
-
 &i2c5 {
 	clock-frequency = <100000>;
 	status = "okay";
 
-	/* nq@64 rtc6226 */
-	/* fsa4480@42 qcom,fsa4480-i2c */
-	/* redriver@1c onnn,redriver */
-	/* aw20036_led@3a awinic,aw20036_led */
-	/* eusb2_repeater@4f nxp,eusb2-repeater */
-
-	nxp_eusb2_repeater: eusb2_repeater@4f {
+	nxp_eusb2_repeater: repeater@4f {
 		compatible = "nxp,eusb2-repeater";
 		reg = <0x4f>;
 		vdd18-supply = <&pm8350_s10>;
 		vdd3-supply = <&pm8350_l2>;
 		reset-gpio = <&pm8350c_gpios 7 GPIO_ACTIVE_HIGH>;
 		pinctrl-names = "default";
-		pinctrl-0 = <&eusb2_reset_ctrl_default>;
+		pinctrl-0 = <&eusb2_reset_default>;
 		#phy-cells = <0>;
 		qcom,param-override-seq =
 				/* Rx squelch detection threshold to 110mV; default is 125mV */
@@ -624,37 +611,15 @@
 	};
 };
 
-&i2c9 {
-	clock-frequency = <1000000>;
-	status = "okay";
-
-	/* nq@28(ts) qcom,sn-nci */
-};
-
-&i2c13 {
-	clock-frequency = <1000000>;
-	status = "okay";
-
-	/* tfa98xx@34 */
-	/* tfa98xx@35 */
-};
-
-&i2c18 {
-	clock-frequency = <1000000>;
-	status = "okay";
-
-	/* haptic_hv@5a awinic,aw8692x */
-};
-
 &pm8350c_gpios {
-	eusb2_reset_ctrl_default: eusb2_reset_ctrl_default {
+	eusb2_reset_default: eusb2-reset-default-state {
 		pins = "gpio7";
-		function = "normal";
+		function = PMIC_GPIO_FUNC_NORMAL;
 		input-enable;
 		output-enable;
 		bias-disable;
 		power-source = <1>;	/* 1.8V */
-		qcom,drive-strength = <2>;
+		qcom,drive-strength = <PMIC_GPIO_STRENGTH_MED>;
 	};
 };
 

From 6e8ecfc7f1a8968a446ca9f9588d365769b83e97 Mon Sep 17 00:00:00 2001
From: BotchedRPR <thenxguy0@gmail.com>
Date: Fri, 2 Feb 2024 21:04:33 +0000
Subject: [PATCH 706/707] q4q: HACK - disable tlmm

---
 arch/arm64/boot/dts/qcom/sm8475-samsung-q4q.dts | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/boot/dts/qcom/sm8475-samsung-q4q.dts b/arch/arm64/boot/dts/qcom/sm8475-samsung-q4q.dts
index a82b5b706412ed..991d773c6ab176 100644
--- a/arch/arm64/boot/dts/qcom/sm8475-samsung-q4q.dts
+++ b/arch/arm64/boot/dts/qcom/sm8475-samsung-q4q.dts
@@ -643,11 +643,12 @@
 };
 
 &tlmm {
+	status = "disabled";
 	gpio-reserved-ranges = <28 4>;
 };
 
 &ufs_mem_hc {
-	status = "okay";
+	status = "disabled";
 
 	reset-gpios = <&tlmm 210 GPIO_ACTIVE_LOW>;
 

From ebe7e2b5275237d1fd7bc83373fb2b304800bab6 Mon Sep 17 00:00:00 2001
From: BotchedRPR <thenxguy0@gmail.com>
Date: Fri, 2 Feb 2024 22:14:15 +0000
Subject: [PATCH 707/707] q4q: switch to ti eusb2-repeater

---
 .../boot/dts/qcom/sm8475-samsung-q4q.dts      | 20 ++++++-------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/arch/arm64/boot/dts/qcom/sm8475-samsung-q4q.dts b/arch/arm64/boot/dts/qcom/sm8475-samsung-q4q.dts
index 991d773c6ab176..a8a0298690cc27 100644
--- a/arch/arm64/boot/dts/qcom/sm8475-samsung-q4q.dts
+++ b/arch/arm64/boot/dts/qcom/sm8475-samsung-q4q.dts
@@ -590,7 +590,7 @@
 	status = "okay";
 
 	nxp_eusb2_repeater: repeater@4f {
-		compatible = "nxp,eusb2-repeater";
+		compatible = "ti,eusb2-repeater";
 		reg = <0x4f>;
 		vdd18-supply = <&pm8350_s10>;
 		vdd3-supply = <&pm8350_l2>;
@@ -598,16 +598,8 @@
 		pinctrl-names = "default";
 		pinctrl-0 = <&eusb2_reset_default>;
 		#phy-cells = <0>;
-		qcom,param-override-seq =
-				/* Rx squelch detection threshold to 110mV; default is 125mV */
-				<0x40 0x06
-				/*
-				 * Tx Deemphasis to 2dB, Tx Deemphasis bit duration to 0.8UI;
-				 * default is 0 for both
-				 */
-				0x22 0x07
-				/* Output Voltage Swing to 550mV; default is 450mV */
-				0x64 0x08>;
+		qcom,param-override-seq = <0x7d 0x70 0x3e 0x71 0x76 0x73 0x70 0x79>;
+		qcom,param-host-override-seq = <0x7d 0x70 0x3e 0x71 0x76 0x73 0x70 0x79>;
 	};
 };
 
@@ -643,12 +635,12 @@
 };
 
 &tlmm {
-	status = "disabled";
-	gpio-reserved-ranges = <28 4>;
+	status = "okay";
+	gpio-reserved-ranges = <36 37 38 39 50 93>;
 };
 
 &ufs_mem_hc {
-	status = "disabled";
+	status = "okay";
 
 	reset-gpios = <&tlmm 210 GPIO_ACTIVE_LOW>;